(*
 JungTaek Kim

 Copyright(c) 2000-2004 KAIST/SNU Research On Program Analysis System
 (National Creative Research Initiative Center 1998-2003)
 http://ropas.snu.ac.kr/n

 All rights reserved. This file is distributed under the terms of
 an Open Source License.
*)

(* Based on Objective Caml Code                            *)

{
open Misc
open Nparser

type error =
    Illegal_character
  | Unterminated_comment
  | Unterminated_string
  | Too_many_character

exception Error of error * int * int

(* For nested comments *)

val comment_depth = ref 0

(* The table of keywords *)

val keyword_table =
  create_hashtable 149 [
    ("and", AND),
    ("as", AS),
    ("case", CASE),
    ("do", DO),
    ("else", ELSE),
    ("end", END),
    ("exception", EXCEPTION),
    ("fn", FN),
    ("for", FOR),
    ("fun", FUN),
    ("functor", FUNCTOR),
    ("handle", HANDLE),
    ("if", IF),
    ("in", IN),
    ("include", INCLUDE),
    ("let", LET),
    ("local", LOCAL),
    ("of", OF),
    ("open", OPEN),
    ("raise", RAISE),
    ("rec", REC),
    ("ref", REF),
    ("sig", SIG),
    ("signature", SIGNATURE),
    ("struct", STRUCT),
    ("structure", STRUCTURE),
    ("then", THEN),
    ("type", TYPE),
    ("val", VAL),
    ("where", WHERE),
    ("while", WHILE),
	("true", UID("True")), (* cookcu *)
    ("false", UID("False")),   (* cookcu *)
    ("andalso", ANDALSO),			(* op *) 
    ("orelse", ORELSE),			(* op *) 
    ("not", NOT),				(* op *) 
    ("nil", NIL),				(* op *) 
    ("Nil", NIL)				(* op *) 
    (* removed 
    ("div", INFIX3("div")),		(* op *) 
    ("mod", INFIX3("mod")),		(* op *) 
    ("land", INFIX1("land")),		(* op *) 
    ("lor", INFIX1("lor")),		(* op *) 
    ("lxor", INFIX1("lxor")),		(* op *) 
    ("lsl", INFIX1("lsl")),		(* op *) 
    ("lsr", INFIX1("lsr"))		(* op *) 
    *)
]

(* To buffer string literals *)

val initial_string_buffer = String.create 256
val string_buff = ref initial_string_buffer
val string_index = ref 0

fun reset_string_buffer () =
  (string_buff := initial_string_buffer;
   string_index := 0)

fun store_string_char c =
  (if !string_index >= String.length (!string_buff) then 
   (
    let val new_buff = String.create (String.length (!string_buff) * 2)
	in
      String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff));
      string_buff := new_buff
	end
   );
   String.unsafe_set (!string_buff) (!string_index) c;
   incr string_index)

fun get_stored_string () =
  let val s = String.sub (!string_buff) 0 (!string_index)
  in
    string_buff := initial_string_buffer; s
  end

(* To translate escape sequences *)

val char_for_backslash =
  case Sys.os_type of
    "MacOS" =>
     (fn
        'n' => '\013'
      | 'r' => '\010'
      | 'b' => '\008'
      | 't' => '\009'
      | c   => c)
  | _ =>
     (fn
        'n' => '\010'
      | 'r' => '\013'
      | 'b' => '\008'
      | 't' => '\009'
      | c   => c)

fun char_for_decimal_code lexbuf i =
  let val c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
              10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
                (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48)
  in  
    Char.chr(land c 0xFF)
  end

(* To store the position of the beginning of a string and comment *)
val string_start_pos = ref 0
and comment_start_pos = ref 0

(* Error report *)

open Format

val report_error = fn
    Illegal_character =>
      print_string "Illegal character"
  | Unterminated_comment =>
      print_string "Comment not terminated"
  | Unterminated_string =>
      print_string "String literal not terminated"
  | Too_many_character =>
      print_string "Too many charaters in Char literal"

exception Not_Found

fun wordend m i str =
  if i = m then raise Not_Found
  else
    let val c = String.get str i
	in
      if c = ' ' orelse c = '\010' orelse c = '\013' orelse c = '\009' orelse
         c = '\012' orelse c = '.' then i
      else wordend m (i+1) str
	end

fun getword str =
  let val p = wordend (String.length str) 0 str
  in
    String.sub str 0 p
  end


}

val blank = [' ' '\010' '\013' '\009' '\012']
val hangul = ['\176'-'\200']['\161'-'\254']
(*
    (* KS X 1001 a.k.a. KS C 5601 or euc-kr *)
let hangulSyllable = (['\176'-'\200']['\161'-'\254'])+
    (* KS X 1005-1 a.k.a. KS C 5700, unicode, or ISO/IEC 10646-1 *)
let uHangulSyllable = (['\172'-'\214']['\000'-'\255']|'\215'['\000'-'\163'])+
*)
val uppercase = ['A'-'Z']
val lowercase = ['a'-'z'] | hangul
val idchar = 
  ['A'-'Z' 'a'-'z' '_' '\'' '0'-'9'] | hangul
val decimal_literal = ['0'-'9']+
val hex_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+
val oct_literal = '0' ['o' 'O'] ['0'-'7']+
val bin_literal = '0' ['b' 'B'] ['0'-'1']+
val float_literal =
  ['0'-'9']+ ('.' ['0'-'9']+)? (['e' 'E'] ['+' '-']? ['0'-'9']+)?
val symbolchar =
  ['\\' '!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~' '#']
val lid = lowercase idchar *
val uid = uppercase idchar * | "_" idchar+

rule token = parse
    blank +
      { token lexbuf }
  | lid
      { let val s = Lexing.lexeme lexbuf
	    in
          (Hashtbl.find keyword_table s) handle Not_found => LID s 
		end }
  | uid blank * "."
      { PATH (getword (Lexing.lexeme lexbuf)) }
  | uid
      { let val s = Lexing.lexeme lexbuf
	    in
          (Hashtbl.find keyword_table s) handle Not_found => UID s
		end }
  | decimal_literal | hex_literal | oct_literal | bin_literal
      { INT (int_of_string(Lexing.lexeme lexbuf)) }
  | float_literal
      { REAL (Lexing.lexeme lexbuf) }
  | "\""
      { reset_string_buffer();
        let val string_start = Lexing.lexeme_start lexbuf
		 in string_start_pos := string_start;
            string lexbuf;
		    Interop.lexing_field_update_lex_start_pos
		     (lexbuf, string_start - (Interop.lexing_field_lex_abs_pos lexbuf));
            STRING (get_stored_string())
		end }
  | "'" [^ '\\' '\''] "'"
      { CHAR(Lexing.lexeme_char lexbuf 1) }
  | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
      { CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
  | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
      { CHAR(char_for_decimal_code lexbuf 2) }
  | "//"[^'\n']* '\n' { token lexbuf }
  | "(*"
      { comment_depth := 1;
        comment_start_pos := Lexing.lexeme_start lexbuf;
        comment lexbuf;
        token lexbuf }
  | "_"   { UNDERSCORE }
  | "("   { LPAREN }
  | ")"   { RPAREN }
  | "{"   { LBRACE }
  | "}"   { RBRACE }
  | "["   { LBRACKET }
  | "]"   { RBRACKET }
  | "[|"  { LBRACKETBAR }
  | "|]"  { BARRBRACKET }
  | ":"   { COLON }
  | ";"   { SEMI }
  | "|"   { BAR }
  | ","   { COMMA }
  | "."   { DOT }
  | "..." { DOTDOTDOT }
  | "->"  { ARROW }
  | "<-"  { REVERSEARROW }
  | "=>"  { DOUBLEARROW }
  | "'"   { QUOTE }
  | ":="  { ASSIGN }		(* op *)
  | "!"   { BANG }		(* op *)
  | "*"   { STAR }			(* op *)
  | "="   { EQUAL }			(* op *)
  | "+"   { PLUS }			(* op *)
  | "-"   { MINUS }			(* op *)
  | "#"   { SHARP }         (* toplevel directive *)
  | "::"  { COLONCOLON }		(* op *)
  | "&&"  { ANDALSO }			(* op *)
  | "||"  { ORELSE }			(* op *)
  | "++"  { PLUSPLUS }			(* op *)
  | "--"  { MINUSMINUS }			(* op *)
  | "+="  { INFIX0(Lexing.lexeme lexbuf) }	(* op *)
  | "-="  { INFIX0(Lexing.lexeme lexbuf) }	(* op *)
  | "*="  { INFIX0(Lexing.lexeme lexbuf) }	(* op *)
  | "/="  { INFIX0(Lexing.lexeme lexbuf) }	(* op *)
  | ">>"  { INFIX1(Lexing.lexeme lexbuf) }	(* op *)
  | "<<"  { INFIX1(Lexing.lexeme lexbuf) }	(* op *)
  | ['!' '?' '~'] symbolchar *
            { PREFIX(Lexing.lexeme lexbuf) }
  | ['\\' '=' '<' '>' '|' '&' '$' '#'] symbolchar *
            { INFIX1(Lexing.lexeme lexbuf) }
  | ['@' '^'] symbolchar *
            { INFIX2(Lexing.lexeme lexbuf) }
  | ['+' '-'] symbolchar +
            { INFIX3(Lexing.lexeme lexbuf) }
  | "**" symbolchar *
            { INFIX5(Lexing.lexeme lexbuf) }
  | ['*' '/' '%'] symbolchar *
            { INFIX4(Lexing.lexeme lexbuf) }
  | eof   { EOF }
  | ";;"  { SEMISEMI }
  | _
      { raise (Error(Illegal_character,
                     Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) }

and comment = parse
    "(*"
      { comment_depth := succ !comment_depth; comment lexbuf }
  | "*)"
      { comment_depth := pred !comment_depth;
        if !comment_depth > 0 then comment lexbuf }
(*
  | "\""
      { reset_string_buffer();
        string_start_pos := Lexing.lexeme_start lexbuf;
        string lexbuf;
        string_buff := initial_string_buffer;
        comment lexbuf }
  | "''"
      { comment lexbuf }
  | "'" [^ '\\' '\''] "'"
      { comment lexbuf }
  | "'\\" ['\\' '\'' 'n' 't' 'b' 'r'] "'"
      { comment lexbuf }
  | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
      { comment lexbuf }
*)
  | eof
      { raise (Error (Unterminated_comment,
                      !comment_start_pos, !comment_start_pos+2)) }
  | _
      { comment lexbuf }

and string = parse
    '"'
      { () }
  | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] *
      { string lexbuf }
  | '\\' ['\\' '"' 'n' 't' 'b' 'r']
      { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
        string lexbuf }
  | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
      { store_string_char(char_for_decimal_code lexbuf 1);
         string lexbuf }
  | eof
      { raise (Error (Unterminated_string,
                      !string_start_pos, !string_start_pos+1)) }
  | _
      { store_string_char(Lexing.lexeme_char lexbuf 0);
        string lexbuf }