{
open Lexing
open Parser
module Range = Util.Range
open Range

exception Lexer_error of Range.t * string

let reset_lexbuf (filename:string) (lnum:int) lexbuf : unit =
  lexbuf.lex_curr_p <- {
    pos_fname = filename;
    pos_cnum = 0;
    pos_bol = 0;
    pos_lnum = lnum;
  }

let newline lexbuf =
  lexbuf.lex_curr_p <- { (lexeme_end_p lexbuf) with
    pos_lnum = (lexeme_end_p lexbuf).pos_lnum + 1;
    pos_bol = (lexeme_end lexbuf) }

(* Boilerplate to define exceptional cases in the lexer. *)
let unexpected_char lexbuf (c:char) : 'a =
  raise (Lexer_error (Range.lex_range lexbuf,
    Printf.sprintf "Unexpected character: '%c'" c))

(* Lexing reserved words *)
let reserved_words = [
  (* Keywords *)
  ("null", NULL);
  ("void", TVOID);
  ("int", TINT);
  ("string", TSTRING);
  ("else", ELSE);
  ("if", IF);
  ("while", WHILE);
  ("return", RETURN);
  ("var", VAR);
  ("global", GLOBAL);

  (* Symbols *)
  ( ";", SEMI);
  ( ",", COMMA);
  ( "{", LBRACE);
  ( "}", RBRACE);
  ( "+", PLUS);
  ( "-", DASH);
  ( "*", STAR);
  ( "=", EQ);
  ( "==", EQEQ);
  ( "!", BANG);
  ( "~", TILDE);
  ( "(", LPAREN);
  ( ")", RPAREN);
  ( "[", LBRACKET);
  ( "]", RBRACKET);
]
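(* Note that "!=" is matched by the symbol case of the token rule below but has
   no entry in this table, so create_token falls back to IDENT "!=" for it. *)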
let (symbol_table : (string, Parser.token) Hashtbl.t) = Hashtbl.create 1024
let _ =
  List.iter (fun (str,t) -> Hashtbl.add symbol_table str t) reserved_words

let create_token lexbuf =
  let str = lexeme lexbuf in
  try (Hashtbl.find symbol_table str)
  with _ -> IDENT str

(* Lexing comments and strings *)
let string_buffer = ref (Bytes.create 2048)
let string_end = ref 0
let start_lex = ref (Range.start_of_range Range.norange)

let start_pos_of_lexbuf lexbuf : pos =
  (Range.pos_of_lexpos (lexeme_start_p lexbuf))

let lex_long_range lexbuf : Range.t =
  let end_p = lexeme_end_p lexbuf in
  mk_range end_p.pos_fname (!start_lex) (pos_of_lexpos end_p)

let reset_str () = string_end := 0
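(* Append a character to the string buffer, doubling its capacity when full. *)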
let add_str ch =
  let x = !string_end in
  let buffer = !string_buffer in
  if x = Bytes.length buffer then
    begin
      let new_buffer = Bytes.create (x*2) in
      Bytes.blit buffer 0 new_buffer 0 x;
      Bytes.set new_buffer x ch;
      string_buffer := new_buffer;
      string_end := x+1
    end
  else
    begin
      Bytes.set buffer x ch;
      string_end := x+1
    end

let get_str () = Bytes.sub_string (!string_buffer) 0 (!string_end)

(* Lexing directives *)
let lnum = ref 1
}

(* Declare your aliases (let foo = regex) and rules here. *)
let newline = '\n' | ('\r' '\n') | '\r'
let lowercase = ['a'-'z']
let uppercase = ['A'-'Z']
let character = uppercase | lowercase
let whitespace = ['\t' ' ']
let digit = ['0'-'9']
let hexdigit = ['0'-'9'] | ['a'-'f'] | ['A'-'F']
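(* The main lexing rule: returns the next Parser.token, skipping whitespace and
   dispatching to the comments, string, and directive rules where needed. *)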
rule token = parse
  | eof { EOF }

  | "/*" { start_lex := start_pos_of_lexbuf lexbuf; comments 0 lexbuf }
  | '"' { reset_str(); start_lex := start_pos_of_lexbuf lexbuf; string false lexbuf }
  | '#' { let p = lexeme_start_p lexbuf in
          if p.pos_cnum - p.pos_bol = 0 then directive 0 lexbuf
          else raise (Lexer_error (lex_long_range lexbuf,
            Printf.sprintf "'#' can only appear as the first character of a line.")) }

  | lowercase (digit | character | '_')* { create_token lexbuf }
  | digit+ | "0x" hexdigit+ { INT (Int64.of_string (lexeme lexbuf)) }
  | whitespace+ { token lexbuf }
  | newline { newline lexbuf; token lexbuf }

  | ';' | ',' | '{' | '}' | '+' | '-' | '*' | '=' | "=="
  | "!=" | '!' | '~' | '(' | ')' | '[' | ']'
    { create_token lexbuf }

  | _ as c { unexpected_char lexbuf c }
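(* Lexing of line directives of the form  # <line> "<file>" [flag]
   (the '#' must start the line; see the token rule above).  state tracks
   progress: 0 = expecting the line number, 1 = expecting the file name,
   2/3 = after the file name / after an optional flag; the terminating newline
   applies the new file name and line number via reset_lexbuf. *)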
and directive state = parse
  | whitespace+ { directive state lexbuf }
  | digit+ { if state = 0 then
               (lnum := int_of_string (lexeme lexbuf);
                directive 1 lexbuf)
             else if state = 2 then directive 3 lexbuf
             else raise (Lexer_error (lex_long_range lexbuf,
               Printf.sprintf "Illegal directive")) }
  | '"' { if state = 1 then
            begin
              reset_str();
              start_lex := start_pos_of_lexbuf lexbuf;
              string true lexbuf
            end
          else raise (Lexer_error (lex_long_range lexbuf,
            Printf.sprintf "Illegal directive")) }
  | newline { if state = 2 || state = 3 then
                begin
                  reset_lexbuf (get_str()) !lnum lexbuf;
                  token lexbuf
                end
              else raise (Lexer_error (lex_long_range lexbuf,
                Printf.sprintf "Illegal directive")) }
  | _ { raise (Lexer_error (lex_long_range lexbuf,
          Printf.sprintf "Illegal directive")) }
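(* Block comments, which may nest: level counts the nested comment openers
   that are still open beyond the outermost one. *)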
and comments level = parse
  | "*/" { if level = 0 then token lexbuf
           else comments (level-1) lexbuf }
  | "/*" { comments (level+1) lexbuf }
  | [^ '\n'] { comments level lexbuf }
  | "\n" { newline lexbuf; comments level lexbuf }
  | eof { raise (Lexer_error (lex_long_range lexbuf,
            Printf.sprintf "Comment is not terminated")) }
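(* String literal bodies.  When in_directive is true the string is the file
   name of a line directive, so control returns to the directive rule instead
   of producing a STRING token. *)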
and string in_directive = parse
  | '"' { if not in_directive then STRING (get_str())
          else directive 2 lexbuf }
  | '\\' { add_str (escaped lexbuf); string in_directive lexbuf }
  | '\n' { add_str '\n'; newline lexbuf; string in_directive lexbuf }
  | eof { raise (Lexer_error (lex_long_range lexbuf,
            Printf.sprintf "String is not terminated")) }
  | _ { add_str (Lexing.lexeme_char lexbuf 0); string in_directive lexbuf }
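(* Decode the escape character that follows a backslash inside a string literal. *)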
and escaped = parse
  | 'n'  { '\n' }
  | 't'  { '\t' }
  | '\\' { '\\' }
  | '"'  { '\034' }
  | '\'' { '\'' }
  | ['0'-'9']['0'-'9']['0'-'9']
    { let x = int_of_string (lexeme lexbuf) in
      if x > 255 then
        raise (Lexer_error (lex_long_range lexbuf,
          (Printf.sprintf "%s is an illegal escaped character constant" (lexeme lexbuf))))
      else
        Char.chr x
    }
  | [^ '"' '\\' 't' 'n' '\'']
    { raise (Lexer_error (lex_long_range lexbuf,
        (Printf.sprintf "%s is an illegal escaped character constant" (lexeme lexbuf)))) }