{
open Lexing
open Parser
module Range = Util.Range
open Range

exception Lexer_error of Range.t * string

let reset_lexbuf (filename:string) (lnum:int) lexbuf : unit =
  lexbuf.lex_curr_p <- {
    pos_fname = filename;
    pos_cnum = 0;
    pos_bol = 0;
    pos_lnum = lnum;
  }

let newline lexbuf =
  lexbuf.lex_curr_p <- { (lexeme_end_p lexbuf) with
    pos_lnum = (lexeme_end_p lexbuf).pos_lnum + 1;
    pos_bol = (lexeme_end lexbuf) }

(* Boilerplate to define exceptional cases in the lexer. *)
let unexpected_char lexbuf (c:char) : 'a =
  raise (Lexer_error (Range.lex_range lexbuf,
    Printf.sprintf "Unexpected character: '%c'" c))

(* Lexing reserved words *)
let reserved_words = [
  (* Keywords *)
  ("null", NULL);
  ("void", TVOID);
  ("int", TINT);
  ("string", TSTRING);
  ("else", ELSE);
  ("if", IF);
  ("while", WHILE);
  ("return", RETURN);
  ("var", VAR);
  ("global", GLOBAL);

  (* Symbols *)
  ( ";", SEMI);
  ( ",", COMMA);
  ( "{", LBRACE);
  ( "}", RBRACE);
  ( "+", PLUS);
  ( "-", DASH);
  ( "*", STAR);
  ( "=", EQ);
  ( "==", EQEQ);
  ( "!", BANG);
  ( "~", TILDE);
  ( "(", LPAREN);
  ( ")", RPAREN);
  ( "[", LBRACKET);
  ( "]", RBRACKET);
]
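(* Note that "!=" is matched by the symbol case of the token rule below but has
   no entry in this table, so create_token falls back to IDENT "!=" for it. *)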
let (symbol_table : (string, Parser.token) Hashtbl.t) = Hashtbl.create 1024
let _ =
  List.iter (fun (str,t) -> Hashtbl.add symbol_table str t) reserved_words

let create_token lexbuf =
  let str = lexeme lexbuf in
  try (Hashtbl.find symbol_table str)
  with _ -> IDENT str

(* Lexing comments and strings *)
let string_buffer = ref (Bytes.create 2048)
let string_end = ref 0
let start_lex = ref (Range.start_of_range Range.norange)

let start_pos_of_lexbuf lexbuf : pos =
  (Range.pos_of_lexpos (lexeme_start_p lexbuf))

let lex_long_range lexbuf : Range.t =
  let end_p = lexeme_end_p lexbuf in
  mk_range end_p.pos_fname (!start_lex) (pos_of_lexpos end_p)

let reset_str () = string_end := 0
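(* Append a character to the string buffer, doubling its capacity when full. *)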
let add_str ch =
  let x = !string_end in
  let buffer = !string_buffer in
  if x = Bytes.length buffer then
    begin
      let new_buffer = Bytes.create (x*2) in
      Bytes.blit buffer 0 new_buffer 0 x;
      Bytes.set new_buffer x ch;
      string_buffer := new_buffer;
      string_end := x+1
    end
  else
    begin
      Bytes.set buffer x ch;
      string_end := x+1
    end

let get_str () = Bytes.sub_string (!string_buffer) 0 (!string_end)

(* Lexing directives *)
let lnum = ref 1
}

(* Declare your aliases (let foo = regex) and rules here. *)
let newline = '\n' | ('\r' '\n') | '\r'
let lowercase = ['a'-'z']
let uppercase = ['A'-'Z']
let character = uppercase | lowercase
let whitespace = ['\t' ' ']
let digit = ['0'-'9']
let hexdigit = ['0'-'9'] | ['a'-'f'] | ['A'-'F']
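(* The main lexing rule: returns the next Parser.token, skipping whitespace and
   dispatching to the comments, string, and directive rules where needed. *)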
rule token = parse
  | eof { EOF }

  | "/*" { start_lex := start_pos_of_lexbuf lexbuf; comments 0 lexbuf }
  | '"' { reset_str(); start_lex := start_pos_of_lexbuf lexbuf; string false lexbuf }
  | '#' { let p = lexeme_start_p lexbuf in
          if p.pos_cnum - p.pos_bol = 0 then directive 0 lexbuf
          else raise (Lexer_error (lex_long_range lexbuf,
            Printf.sprintf "'#' can only appear as the first character of a line.")) }

  | lowercase (digit | character | '_')* { create_token lexbuf }
  | digit+ | "0x" hexdigit+ { INT (Int64.of_string (lexeme lexbuf)) }
  | whitespace+ { token lexbuf }
  | newline { newline lexbuf; token lexbuf }

  | ';' | ',' | '{' | '}' | '+' | '-' | '*' | '=' | "=="
  | "!=" | '!' | '~' | '(' | ')' | '[' | ']'
    { create_token lexbuf }

  | _ as c { unexpected_char lexbuf c }
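(* Lexing of line directives of the form  # <line> "<file>" [flag]
   (the '#' must start the line; see the token rule above).  state tracks
   progress: 0 = expecting the line number, 1 = expecting the file name,
   2/3 = after the file name / after an optional flag; the terminating newline
   applies the new file name and line number via reset_lexbuf. *)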
and directive state = parse
  | whitespace+ { directive state lexbuf }
  | digit+ { if state = 0 then
               (lnum := int_of_string (lexeme lexbuf);
                directive 1 lexbuf)
             else if state = 2 then directive 3 lexbuf
             else raise (Lexer_error (lex_long_range lexbuf,
               Printf.sprintf "Illegal directive")) }
  | '"' { if state = 1 then
            begin
              reset_str();
              start_lex := start_pos_of_lexbuf lexbuf;
              string true lexbuf
            end
          else raise (Lexer_error (lex_long_range lexbuf,
            Printf.sprintf "Illegal directive")) }
  | newline { if state = 2 || state = 3 then
                begin
                  reset_lexbuf (get_str()) !lnum lexbuf;
                  token lexbuf
                end
              else raise (Lexer_error (lex_long_range lexbuf,
                Printf.sprintf "Illegal directive")) }
  | _ { raise (Lexer_error (lex_long_range lexbuf,
          Printf.sprintf "Illegal directive")) }
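(* Block comments, which may nest: level counts the nested comment openers
   that are still open beyond the outermost one. *)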
and comments level = parse
  | "*/" { if level = 0 then token lexbuf
           else comments (level-1) lexbuf }
  | "/*" { comments (level+1) lexbuf }
  | [^ '\n'] { comments level lexbuf }
  | "\n" { newline lexbuf; comments level lexbuf }
  | eof { raise (Lexer_error (lex_long_range lexbuf,
            Printf.sprintf "Comment is not terminated")) }
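(* String literal bodies.  When in_directive is true the string is the file
   name of a line directive, so control returns to the directive rule instead
   of producing a STRING token. *)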
and string in_directive = parse
  | '"' { if not in_directive then STRING (get_str())
          else directive 2 lexbuf }
  | '\\' { add_str (escaped lexbuf); string in_directive lexbuf }
  | '\n' { add_str '\n'; newline lexbuf; string in_directive lexbuf }
  | eof { raise (Lexer_error (lex_long_range lexbuf,
            Printf.sprintf "String is not terminated")) }
  | _ { add_str (Lexing.lexeme_char lexbuf 0); string in_directive lexbuf }
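(* Decode the escape character that follows a backslash inside a string literal. *)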
and escaped = parse
  | 'n'  { '\n' }
  | 't'  { '\t' }
  | '\\' { '\\' }
  | '"'  { '\034' }
  | '\'' { '\'' }
  | ['0'-'9']['0'-'9']['0'-'9']
    { let x = int_of_string (lexeme lexbuf) in
      if x > 255 then
        raise (Lexer_error (lex_long_range lexbuf,
          (Printf.sprintf "%s is an illegal escaped character constant" (lexeme lexbuf))))
      else
        Char.chr x
    }
  | [^ '"' '\\' 't' 'n' '\'']
    { raise (Lexer_error (lex_long_range lexbuf,
        (Printf.sprintf "%s is an illegal escaped character constant" (lexeme lexbuf)))) }