(* JungTaek Kim Copyright(c) 2000-2004 KAIST/SNU Research On Program Analysis System (National Creative Research Initiative Center 1998-2003) http://ropas.snu.ac.kr/n All rights reserved. This file is distributed under the terms of an Open Source License. *) (* Based on Objective Caml Code *) { open Misc open Nparser type error = Illegal_character | Unterminated_comment | Unterminated_string | Too_many_character exception Error of error * int * int (* For nested comments *) val comment_depth = ref 0 (* The table of keywords *) val keyword_table = create_hashtable 149 [ ("and", AND), ("as", AS), ("case", CASE), ("do", DO), ("else", ELSE), ("end", END), ("exception", EXCEPTION), ("fn", FN), ("for", FOR), ("fun", FUN), ("functor", FUNCTOR), ("handle", HANDLE), ("if", IF), ("in", IN), ("include", INCLUDE), ("let", LET), ("local", LOCAL), ("of", OF), ("open", OPEN), ("raise", RAISE), ("rec", REC), ("ref", REF), ("sig", SIG), ("signature", SIGNATURE), ("struct", STRUCT), ("structure", STRUCTURE), ("then", THEN), ("type", TYPE), ("val", VAL), ("where", WHERE), ("while", WHILE), ("true", UID("True")), (* cookcu *) ("false", UID("False")), (* cookcu *) ("andalso", ANDALSO), (* op *) ("orelse", ORELSE), (* op *) ("not", NOT), (* op *) ("nil", NIL), (* op *) ("Nil", NIL) (* op *) (* removed ("div", INFIX3("div")), (* op *) ("mod", INFIX3("mod")), (* op *) ("land", INFIX1("land")), (* op *) ("lor", INFIX1("lor")), (* op *) ("lxor", INFIX1("lxor")), (* op *) ("lsl", INFIX1("lsl")), (* op *) ("lsr", INFIX1("lsr")) (* op *) *) ] (* To buffer string literals *) val initial_string_buffer = String.create 256 val string_buff = ref initial_string_buffer val string_index = ref 0 fun reset_string_buffer () = (string_buff := initial_string_buffer; string_index := 0) fun store_string_char c = (if !string_index >= String.length (!string_buff) then ( let val new_buff = String.create (String.length (!string_buff) * 2) in String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff)); string_buff := new_buff end ); String.unsafe_set (!string_buff) (!string_index) c; incr string_index) fun get_stored_string () = let val s = String.sub (!string_buff) 0 (!string_index) in string_buff := initial_string_buffer; s end (* To translate escape sequences *) val char_for_backslash = case Sys.os_type of "MacOS" => (fn 'n' => '\013' | 'r' => '\010' | 'b' => '\008' | 't' => '\009' | c => c) | _ => (fn 'n' => '\010' | 'r' => '\013' | 'b' => '\008' | 't' => '\009' | c => c) fun char_for_decimal_code lexbuf i = let val c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) + 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) + (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in Char.chr(land c 0xFF) end (* To store the position of the beginning of a string and comment *) val string_start_pos = ref 0 and comment_start_pos = ref 0 (* Error report *) open Format val report_error = fn Illegal_character => print_string "Illegal character" | Unterminated_comment => print_string "Comment not terminated" | Unterminated_string => print_string "String literal not terminated" | Too_many_character => print_string "Too many charaters in Char literal" exception Not_Found fun wordend m i str = if i = m then raise Not_Found else let val c = String.get str i in if c = ' ' orelse c = '\010' orelse c = '\013' orelse c = '\009' orelse c = '\012' orelse c = '.' then i else wordend m (i+1) str end fun getword str = let val p = wordend (String.length str) 0 str in String.sub str 0 p end } val blank = [' ' '\010' '\013' '\009' '\012'] val hangul = ['\176'-'\200']['\161'-'\254'] (* (* KS X 1001 a.k.a. KS C 5601 or euc-kr *) let hangulSyllable = (['\176'-'\200']['\161'-'\254'])+ (* KS X 1005-1 a.k.a. KS C 5700, unicode, or ISO/IEC 10646-1 *) let uHangulSyllable = (['\172'-'\214']['\000'-'\255']|'\215'['\000'-'\163'])+ *) val uppercase = ['A'-'Z'] val lowercase = ['a'-'z'] | hangul val idchar = ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] | hangul val decimal_literal = ['0'-'9']+ val hex_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+ val oct_literal = '0' ['o' 'O'] ['0'-'7']+ val bin_literal = '0' ['b' 'B'] ['0'-'1']+ val float_literal = ['0'-'9']+ ('.' ['0'-'9']+)? (['e' 'E'] ['+' '-']? ['0'-'9']+)? val symbolchar = ['\\' '!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' '#'] val lid = lowercase idchar * val uid = uppercase idchar * | "_" idchar+ rule token = parse blank + { token lexbuf } | lid { let val s = Lexing.lexeme lexbuf in (Hashtbl.find keyword_table s) handle Not_found => LID s end } | uid blank * "." { PATH (getword (Lexing.lexeme lexbuf)) } | uid { let val s = Lexing.lexeme lexbuf in (Hashtbl.find keyword_table s) handle Not_found => UID s end } | decimal_literal | hex_literal | oct_literal | bin_literal { INT (int_of_string(Lexing.lexeme lexbuf)) } | float_literal { REAL (Lexing.lexeme lexbuf) } | "\"" { reset_string_buffer(); let val string_start = Lexing.lexeme_start lexbuf in string_start_pos := string_start; string lexbuf; Interop.lexing_field_update_lex_start_pos (lexbuf, string_start - (Interop.lexing_field_lex_abs_pos lexbuf)); STRING (get_stored_string()) end } | "'" [^ '\\' '\''] "'" { CHAR(Lexing.lexeme_char lexbuf 1) } | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'" { CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) } | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" { CHAR(char_for_decimal_code lexbuf 2) } | "//"[^'\n']* '\n' { token lexbuf } | "(*" { comment_depth := 1; comment_start_pos := Lexing.lexeme_start lexbuf; comment lexbuf; token lexbuf } | "_" { UNDERSCORE } | "(" { LPAREN } | ")" { RPAREN } | "{" { LBRACE } | "}" { RBRACE } | "[" { LBRACKET } | "]" { RBRACKET } | "[|" { LBRACKETBAR } | "|]" { BARRBRACKET } | ":" { COLON } | ";" { SEMI } | "|" { BAR } | "," { COMMA } | "." { DOT } | "..." { DOTDOTDOT } | "->" { ARROW } | "<-" { REVERSEARROW } | "=>" { DOUBLEARROW } | "'" { QUOTE } | ":=" { ASSIGN } (* op *) | "!" { BANG } (* op *) | "*" { STAR } (* op *) | "=" { EQUAL } (* op *) | "+" { PLUS } (* op *) | "-" { MINUS } (* op *) | "#" { SHARP } (* toplevel directive *) | "::" { COLONCOLON } (* op *) | "&&" { ANDALSO } (* op *) | "||" { ORELSE } (* op *) | "++" { PLUSPLUS } (* op *) | "--" { MINUSMINUS } (* op *) | "+=" { INFIX0(Lexing.lexeme lexbuf) } (* op *) | "-=" { INFIX0(Lexing.lexeme lexbuf) } (* op *) | "*=" { INFIX0(Lexing.lexeme lexbuf) } (* op *) | "/=" { INFIX0(Lexing.lexeme lexbuf) } (* op *) | ">>" { INFIX1(Lexing.lexeme lexbuf) } (* op *) | "<<" { INFIX1(Lexing.lexeme lexbuf) } (* op *) | ['!' '?' '~'] symbolchar * { PREFIX(Lexing.lexeme lexbuf) } | ['\\' '=' '<' '>' '|' '&' '$' '#'] symbolchar * { INFIX1(Lexing.lexeme lexbuf) } | ['@' '^'] symbolchar * { INFIX2(Lexing.lexeme lexbuf) } | ['+' '-'] symbolchar + { INFIX3(Lexing.lexeme lexbuf) } | "**" symbolchar * { INFIX5(Lexing.lexeme lexbuf) } | ['*' '/' '%'] symbolchar * { INFIX4(Lexing.lexeme lexbuf) } | eof { EOF } | ";;" { SEMISEMI } | _ { raise (Error(Illegal_character, Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) } and comment = parse "(*" { comment_depth := succ !comment_depth; comment lexbuf } | "*)" { comment_depth := pred !comment_depth; if !comment_depth > 0 then comment lexbuf } (* | "\"" { reset_string_buffer(); string_start_pos := Lexing.lexeme_start lexbuf; string lexbuf; string_buff := initial_string_buffer; comment lexbuf } | "''" { comment lexbuf } | "'" [^ '\\' '\''] "'" { comment lexbuf } | "'\\" ['\\' '\'' 'n' 't' 'b' 'r'] "'" { comment lexbuf } | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" { comment lexbuf } *) | eof { raise (Error (Unterminated_comment, !comment_start_pos, !comment_start_pos+2)) } | _ { comment lexbuf } and string = parse '"' { () } | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] * { string lexbuf } | '\\' ['\\' '"' 'n' 't' 'b' 'r'] { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1)); string lexbuf } | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] { store_string_char(char_for_decimal_code lexbuf 1); string lexbuf } | eof { raise (Error (Unterminated_string, !string_start_pos, !string_start_pos+1)) } | _ { store_string_char(Lexing.lexeme_char lexbuf 0); string lexbuf }