(***********************************************
 *                                             *
 *              CS520 Fall 2006                *
 *           Programming Assignment            *
 *         Computer Science Department         *
 *             Boston University               *
 *                                             *
 ***********************************************)

(*
 * The lexical analyzer which converts a character
 * stream to tokens.
 *)

{
open Support.Error

let reservedWords = [
  (* Keywords *)
  ("lam", fun i -> Parser.LAMBDA i);
  ("else", fun i -> Parser.ELSE i);
  ("fix", fun i -> Parser.FIX i);
  ("if", fun i -> Parser.IF i);
  ("in", fun i -> Parser.IN i);
  ("let", fun i -> Parser.LET i);
  ("letrec", fun i -> Parser.LETREC i);
  ("mod", fun i -> Parser.MOD i);
  ("print", fun i -> Parser.PRINT i);
  ("ref", fun i -> Parser.REF i);
  ("then", fun i -> Parser.THEN i);
  ("unit", fun i -> Parser.UNIT i);
  ("true", fun i -> Parser.TRUE i);
  ("false", fun i -> Parser.FALSE i);
  ("raise", fun i -> Parser.RAISE i);
  ("try", fun i -> Parser.TRY i);
  ("with", fun i -> Parser.WITH i);
  
  (* Symbols *)
  ("(", fun i -> Parser.LPAREN i); 
  (")", fun i -> Parser.RPAREN i);
  (":", fun i -> Parser.COLON i);
  (",", fun i -> Parser.COMMA i);
  (".", fun i -> Parser.DOT i);
  ("=", fun i -> Parser.EQ i);
  ("|", fun i -> Parser.BAR i);

  ("~", fun i -> Parser.TILDE i);
  ("+", fun i -> Parser.PLUS i);
  ("-", fun i -> Parser.MINUS i);
  ("*", fun i -> Parser.STAR i);
  ("/", fun i -> Parser.SLASH i);
  ("!", fun i -> Parser.BANG i);

  ("<", fun i -> Parser.LT i);
  (">", fun i -> Parser.GT i);

  (* Special compound symbols: *)
  ("->", fun i -> Parser.ARROW i);
  ("=>", fun i -> Parser.DARROW i);
  ("<=", fun i -> Parser.LTEQ i);
  (">=", fun i -> Parser.GTEQ i);
  ("<>", fun i -> Parser.NEQ i);
  (":=", fun i -> Parser.ASSIGN i);
]

(* Support functions *)

type buildfun = info -> Parser.token

let (symbolTable : (string, buildfun) Hashtbl.t) = Hashtbl.create 1024

let _ =
  List.iter (fun (str,f) -> Hashtbl.add symbolTable str f) reservedWords

let createID i str =
  try (Hashtbl.find symbolTable str) i
  with _ -> Parser.VAR {i=i; v=str}

let lineno   = ref 1
and depth    = ref 0
and start    = ref 0
and filename = ref ""
and startLex = ref dummyinfo

let create inFile stream =
  if not (Filename.is_implicit inFile) then filename := inFile
  else filename := Filename.concat (Sys.getcwd()) inFile;
  lineno := 1; start := 0; Lexing.from_channel stream

let newline lexbuf = incr lineno; start := (Lexing.lexeme_start lexbuf)

let info lexbuf =
  createInfo (!filename) (!lineno) (Lexing.lexeme_start lexbuf - !start)

let text = Lexing.lexeme

let stringBuffer = ref (String.create 2048)
let stringEnd = ref 0

let resetStr () = stringEnd := 0

let addStr ch =
  let x = !stringEnd in
  let buffer = !stringBuffer
in
  if x = String.length buffer then
    begin
      let newBuffer = String.create (x*2) in
      String.blit buffer 0 newBuffer 0 x;
      String.set newBuffer x ch;
      stringBuffer := newBuffer;
      stringEnd := x+1
    end
  else
    begin
      String.set buffer x ch;
      stringEnd := x+1
    end

let getStr () = String.sub (!stringBuffer) 0 (!stringEnd)
}

(* The main body of the lexical analyzer *)

rule main = parse
  [' ' '\009' '\012']+     
    { main lexbuf }

| [' ' '\009' '\012']*("\r")?"\n" 
    { newline lexbuf; main lexbuf }

| "*)" 
    { error (info lexbuf) "Unmatched end of comment" }

| "(*" 
    { depth := 1; startLex := info lexbuf; comment lexbuf; main lexbuf }

| ['0'-'9']+
    { Parser.INTV{i=info lexbuf; v=int_of_string (text lexbuf)} }

| ['A'-'Z' 'a'-'z' '_'] ['A'-'Z' 'a'-'z' '_' '0'-'9' '\'']*
    { createID (info lexbuf) (text lexbuf) }

| "=>" | "->" | ">=" | "<=" | "<>" | ":="
    { createID (info lexbuf) (text lexbuf) }

| ['(' ')' ':' ',' '.' '=' '+' '-' '*' '~' '>' '<' '/' '!' '|'] 
    { createID (info lexbuf) (text lexbuf) }

| "\"" { resetStr(); startLex := info lexbuf; string lexbuf }

| eof 
    { Parser.EOF }

| _  
    { error (info lexbuf) "Illegal character" }

and comment = parse
  "(*"
    { depth := succ !depth; comment lexbuf }
| "*)"
    { depth := pred !depth; if !depth > 0 then comment lexbuf }
| eof
    { error (!startLex) "Comment not terminated" }
| [^ '\n']
    { comment lexbuf }
| "\n"
    { newline lexbuf; comment lexbuf }

and string = parse
  '"'  { Parser.STRINGV {i = !startLex; v=getStr()} }
| '\\' { addStr(escaped lexbuf); string lexbuf }
| '\n' { addStr '\n'; newline lexbuf; string lexbuf }
| eof  { error (!startLex) "String not terminated" }
| _    { addStr (Lexing.lexeme_char lexbuf 0); string lexbuf }

and escaped = parse
  'n'	 { '\n' }
| 't'	 { '\t' }
| '\\'	 { '\\' }
| '"'    { '\034'  }
| '\''	 { '\'' }
| ['0'-'9']['0'-'9']['0'-'9']
    {
      let x = int_of_string(text lexbuf) in
      if x > 255 then
	error (info lexbuf) "Illegal character constant"
      else
	Char.chr x
    }
| [^ '"' '\\' 't' 'n' '\'']
    { error (info lexbuf) "Illegal character constant" }

