#ifdef HAVE_CONFIG_H
# include <config.h>
#endif
#include "token.hh"
#include "hashmap.hh"
#include "error.hh"
#include "operator.hh"
#include "expr.hh"
#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

// Re-landmarking copy: duplicates the kind, category, text and value of
// `token', but takes the source position and length from the arguments.
Token::Token(const Token &token,
	     const Landmark &landmark, unsigned landmark_length)
  : _kind(token._kind),
    _cat(token._cat),
    _text(token._text),
    _landmark(landmark),
    _landmark_length(landmark_length)
{
  // The value is copied wholesale, whichever member is active.
  _v = token._v;
}

// String-category token of kind `kind' with text `string', positioned at
// `landmark' and spanning `landmark_length' characters.
Token::Token(Operator kind, PermString string,
	     const Landmark &landmark, unsigned landmark_length)
  : _kind(kind),
    _cat(catString),
    _text(string),
    _landmark(landmark),
    _landmark_length(landmark_length)
{
  // The value slot records the string's capsule.
  _v.string_capsule = string.capsule();
}

// Expression token: kind opExpr, category catExpr, empty text, and the
// expression pointer `e' stored as the value.
Token::Token(Logic *e, const Landmark &landmark, unsigned landmark_length)
  : _kind(opExpr),
    _cat(catExpr),
    _text(""),
    _landmark(landmark),
    _landmark_length(landmark_length)
{
  _v.expr = e;
}

// String-category token without a source position (used for the entries of
// the reserved-word table).  The landmark is default-constructed; the
// landmark length is explicitly zeroed so that copying the token never reads
// an indeterminate value.
Token::Token(Operator kind, PermString string)
  : _kind(kind), _cat(catString), _text(string), _landmark_length(0)
{
  _v.string_capsule = string.capsule();
}

// Operator-category token named `name' whose value is the operator group
// `oper'.  No source position is attached: the landmark is
// default-constructed and the landmark length is explicitly zeroed so that
// copying the token never reads an indeterminate value.
Token::Token(Operator kind, PermString name, OperatorGroup *oper)
  : _kind(kind), _cat(catOperator), _text(name), _landmark_length(0)
{
  _v.oper = oper;
}


void
Token::print() const
{
  // errwriter << _landmark << _kind << " `" << _text << "'\n";
  String s = _landmark;
  fprintf(stderr, "%s: %d `%s'\n", s.cc(), (int)_kind, _text.cc());
}


bool
Token::starts_expr(bool no_infix) const
{
  if (_kind == opIdentifier || _kind == opExpr)
    return true;
  else if (_cat == catOperator) {
    OperatorGroup *og = _v.oper;
    // No infix operator? Then it starts an expression.
    // OR: Has prefix operator? Then it starts an expression.
    if (no_infix ? !og->find(false) : (bool)og->find(true))
      return true;
  }
  return false;
}


///////////////////////////////////////////////////////////////////////////////

// Table of keyword, comment-opener and operator tokens, keyed by spelling.
// Filled once by Tokenizer::static_initialize() and consulted by
// Tokenizer::get_token().
static HashMap<PermString, Token> reserved_words;

// Character-class bit table.  `char_class' is pointed one element into
// `char_class_storage' by static_initialize(), so that index -1 (EOF) through
// 255 are all valid subscripts.
int *Tokenizer::char_class;
int Tokenizer::char_class_storage[257];


// Registers operator `op' under the spelling `name'.  The first operator
// registered for a spelling creates a fresh OperatorGroup token in the
// reserved-word table; later ones with the same spelling join that group.
// Finally the operator's own data (name, precedence, flags, terminator) is
// recorded.
void
Tokenizer::add_operator(Operator op, PermString name, int precedence,
			int flags, Operator terminator)
{
  Token *entry = reserved_words.findp(name);
  if (entry == 0) {
    Token fresh(op, name, new OperatorGroup);
    reserved_words.insert(name, fresh);
    entry = reserved_words.findp(name);
  }

  entry->vopgroup()->add(op);
  op.set_data(name, precedence, flags, terminator);
}

// Depends on EOF == -1.

// One-time setup of the tokenizer's static tables: the reserved-word table
// (operators, comment openers and keywords) and the character-class table.
// Does nothing if the reserved-word table has already been populated.
void
Tokenizer::static_initialize()
{
  if (!reserved_words.empty())
    return;
  
  // Helper macros for filling the reserved-word table.
  //   ADDKW      -- plain keyword / internal token with no operator group.
  //   ADDEXPR    -- predefined expression token.
  //   ADDOPTERM  -- operator with precedence, flags and a terminator.
  //   ADDOP1     -- operator with precedence and flags, no terminator.
  //   ADDOP      -- operator with precedence only.
#define ADDKW(name, op) { \
	Token t(op, name); \
	Operator(op).set_data(name); \
	reserved_words.insert(name, t); \
      }
#define ADDEXPR(name, expr) { \
	Token t(expr); \
	reserved_words.insert(name, t); \
      }
#define ADDOPTERM(name, op, prec, flags, term) \
	add_operator(op, name, prec, flags, term)
#define ADDOP1(name, op, prec, flags) \
	ADDOPTERM(name, op, prec, flags, opNone)
#define ADDOP(name, op, prec) \
	ADDOPTERM(name, op, prec, 0, opNone)
  
  unsigned prefix = Operator::fPrefix;
  unsigned unary = Operator::fUnary;
  unsigned right = Operator::fRightAssoc;
  unsigned functionish = Operator::fFunctionish;
  
  // Operators, listed from the highest precedence number to the lowest.
  ADDOPTERM("(", '(',		32, prefix|unary, ')');
  ADDOP1("flip", opFlip,	32, prefix|unary|functionish);
  ADDOP1("pin", opPin,		32, prefix|unary|functionish);
  
  ADDOP("**",	opPower,	29);
  
  ADDOP1("+",	opCopy,		28, prefix|unary);
  ADDOP1("-",	opNegate,	28, prefix|unary);
  ADDOP1("!",   '!',		28, prefix|unary);

  ADDOP("*",	'*',		25);
  ADDOP("/",	'/',		25);
  
  ADDOP("+",	'+',		24);
  ADDOP("-",	'-',		24);
  
  ADDOP("<",	opLt,		22);
  ADDOP("<=",	opLe,		22);
  ADDOP(">",	opGt,		22);
  ADDOP(">=",	opGe,		22);
  
  ADDOP("==",	opEq,		20);
  ADDOP("!=",	opNe,		20);
  
  ADDOP("&&",	opLogAnd,	16);
  ADDOP("||",	opLogOr,	15);
  
  ADDOPTERM("?", '?',		11, right, ':');
  
  ADDOP1("=",	opAssign,	10, right);
  ADDOP1("+=",	opAddAssign,	10, right);
  ADDOP1("-=",	opSubAssign,	10, right);
  ADDOP1("*=",	opMulAssign,	10, right);
  ADDOP1("/=",	opDivAssign,	10, right);
  ADDOP1("**=",	opPowerAssign,	10, right);
  
  ADDOP(",",	',',		3);
  
  // Comment openers and keywords.
  ADDKW("//",	opEOLComment);
  ADDKW("/*",	opSlashStarComment);

  ADDKW("let",	opLet);
  ADDKW("in",	opIn);
  ADDKW("end",	opEnd);
  
  // Offset the table by one so that char_class[-1] (EOF) is a valid slot.
  char_class = char_class_storage + 1;
  
  for (int i = -1; i < 256; i++)
    char_class[i] = 0;
  
  // The cursor must be pointer-to-const: it walks string literals, and the
  // literal-to-`char *' conversion is not valid in C++11 and later.
  // It need not be `const unsigned char *' because the literals below
  // contain only lower-ASCII characters.
  const char *s;
  for (s = " \t\r\n\f\v"; *s; s++)
    char_class[*s] |= Whitespace;
  for (s = "L\'\""; *s; s++)
    char_class[*s] |= Literalstart;
  for (s = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_$"; *s; s++)
    char_class[*s] |= Wordstart | Word;
  for (s = "0123456789"; *s; s++)
    char_class[*s] |= Word;
  for (s = "0123456789."; *s; s++)
    char_class[*s] |= Numberstart | Number;
  for (s = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; *s; s++)
    char_class[*s] |= Number;
  for (s = "!#%&()*+,-./:;<=>?@[\\]^`{|}~"; *s; s++)
    char_class[*s] |= Punct;
  
  // Make sure opNone works out OK.
  assert(opNone == 0 && char_class[opNone] == 0);
}


// True if `c' (a character value or -1) carries the Whitespace class bit.
inline bool
Tokenizer::tis_whitespace(int c)
{
  const int bits = char_class[c];
  return (bits & Whitespace) != 0;
}

// True if `c' carries the Wordstart class bit.
inline bool
Tokenizer::tis_word_start(int c)
{
  const int bits = char_class[c];
  return (bits & Wordstart) != 0;
}

// True if `c' carries the Word class bit.
inline bool
Tokenizer::tis_word(int c)
{
  const int bits = char_class[c];
  return (bits & Word) != 0;
}

// True if `c' carries the Numberstart class bit.
inline bool
Tokenizer::tis_number_start(int c)
{
  const int bits = char_class[c];
  return (bits & Numberstart) != 0;
}

// True if `c' carries the Number class bit.
inline bool
Tokenizer::tis_number(int c)
{
  const int bits = char_class[c];
  return (bits & Number) != 0;
}

// True if `c' carries the Literalstart class bit.
inline bool
Tokenizer::tis_literal_start(int c)
{
  const int bits = char_class[c];
  return (bits & Literalstart) != 0;
}

// True if `c' carries the Punct class bit.
inline bool
Tokenizer::tis_punct(int c)
{
  const int bits = char_class[c];
  return (bits & Punct) != 0;
}


/////////////////////


// Builds a tokenizer that pulls its input from `tb_filler'.  Starts with a
// 256-byte token text buffer, sets up the unread buffer, and loads (and, if
// the filler asks for it, line-analyzes) the first input buffer.
Tokenizer::Tokenizer(TokenBufferFiller *tb_filler)
  : _tb(&_real_tb),
    _tb_filler(tb_filler),
    _s(new char[256]),
    _slen(0),
    _scap(256),
    _column(0),
    _line_non_ws(0)
{
  static_initialize();

  // Storage for characters pushed back past the start of the real buffer.
  _unread_tb.make_empty(UNREAD_TB_SIZE);
  _unread_tb.pos = _unread_tb.end;

  _analyze_lines = _tb_filler->analyze_lines();

  // Prime the real buffer with the first chunk of input.
  _tb_filler->fill_token_buffer(_tb);
  if (_analyze_lines) {
    analyze_lines(_tb);
  }
}

// Frees the token text buffer allocated by the constructor or increase_s().
Tokenizer::~Tokenizer()
{
  delete [] _s;
}


// Scans the unread part of `tb' and reports line structure to the current
// landmark map: each newline finishes a line at the following character
// position, and each tab is marked at its own position.
void
Tokenizer::analyze_lines(TokenBuffer *tb)
{
  const unsigned char *data = tb->pos;
  Landmark lm = landmark();
  LandmarkMap *map = lm.landmark_map();
  unsigned cp = lm.cp();
  const unsigned cp_end = lm.cp() + tb->length();

  while (cp < cp_end) {
    switch (*data) {
     case '\n':
      map->finish_line(cp + 1);
      break;
     case '\t':
      map->mark_tab(cp);
      break;
     default:
      break;
    }
    data++;
    cp++;
  }
}

// Tells the landmark map that, from the current character position on,
// input should be attributed to `file' starting at line number `line'.
void
Tokenizer::change_future_lines(PermString file, unsigned line)
{
  Landmark here = landmark();
  LandmarkMap *lmap = here.landmark_map();
  lmap->change_lines(here.cp(), file, line);
}

// Called by read() when the current buffer is exhausted.  Returns the next
// character, or -1 when there is no more input.
int
Tokenizer::fill_tb()
{
  // Pushed-back characters are used up: resume with the real buffer.
  if (_tb == &_unread_tb) {
    _tb = &_real_tb;
    return read();
  }

  // A null real buffer means the filler has nothing more to give.
  if (_tb->null())
    return -1;

  // Otherwise ask the filler for more and try again.
  _tb_filler->fill_token_buffer(_tb);
  if (_analyze_lines)
    analyze_lines(_tb);
  return read();
}

// Returns the next input character (refilling the buffer when needed) and
// keeps the column counter current; a newline resets both the column and
// the per-line count of non-whitespace tokens.
inline int
Tokenizer::read()
{
  if (_tb->empty())
    return fill_tb();

  const int ch = *_tb->pos++;

  if (ch != '\n')
    _column++;
  else {
    _line_non_ws = 0;
    _column = 0;
  }

  return ch;
}

// Pushes character `c' back onto the input so that the next read() returns
// it.  A negative `c' (EOF) is ignored.
//
// If the real buffer still has consumed characters before `pos', we simply
// step back over the last one.  Otherwise the character is stored in the
// separate unread buffer, which is filled downward from `end_buffer'.
//
// The unread buffer is re-armed (pos = end = end_buffer) every time we switch
// to it.  TokenBuffer::make_empty() leaves pos == end == buffer, so without
// this the first push-back would decrement `pos' to before the start of the
// allocation and write out of bounds.
void
Tokenizer::unread(int c)
{
  if (c < 0)
    return;
  
  if (_tb != &_unread_tb && _tb->pos > _tb->buffer) {
    // Cheap case: back up within the real buffer.
    _tb->pos--;
    assert(*_tb->pos == c);
    return;
  }
  
  if (_tb != &_unread_tb) {
    // fill_tb() only leaves the unread buffer once it is drained, so it is
    // safe to reset it here.  landmark() must be taken before switching _tb.
    _unread_tb._landmark = landmark();
    _unread_tb.end = _unread_tb.end_buffer;
    _unread_tb.pos = _unread_tb.end;
    _tb = &_unread_tb;
  }
  
  // There must be room left below `pos' inside the unread buffer.
  assert(_tb->pos > _tb->buffer);
  --_tb->pos;
  --_tb->_landmark;
  unsigned char *pos = const_cast<unsigned char *>(_tb->pos);
  *pos = c;
}


// Doubles the capacity of the token text buffer, keeping the first `_slen'
// bytes of its contents.
void
Tokenizer::increase_s()
{
  char *bigger = new char[_scap * 2];
  memcpy(bigger, _s, _slen);

  char *old = _s;
  _s = bigger;
  _scap *= 2;
  delete[] old;
}

// Unchecked append: stores `c' at the end of the token text without testing
// the capacity.  Callers must already know there is room.
inline void
Tokenizer::append0(int c)
{
  _s[_slen] = c;
  _slen++;
}

// Checked append: grows the token text buffer if it is full, then stores
// `c' at the end.
inline void
Tokenizer::append(int c)
{
  if (_slen >= _scap) {
    increase_s();
  }
  _s[_slen] = c;
  _slen++;
}

// Empties the token text (the storage itself is kept).
inline void
Tokenizer::clear()
{
  _slen = 0;
}


// Reads a character or string literal whose first character `c' has already
// been consumed.  `c' is a quote character, or an `L' prefix which is
// accepted and ignored.  The decoded contents (quotes stripped, escape
// sequences translated) are appended to the token text.
//
// Returns opString once the closing quote -- or end of file, after reporting
// an error -- is reached.  Returns 0 if `c' was an `L' that is not followed
// by a quote; the character after the `L' is pushed back in that case.
Operator
Tokenizer::read_literal(int c)
{
  // Accept, but ignore, an "L" denoting a long string constant.
  if (c == 'L') {
    c = read();
    if (c != '\'' && c != '\"') {
      unread(c);
      return 0;
    }
  }
  
  const int terminator = c;
  const bool ischar = (c == '\'');
  
  while (1) {
    c = read();
    
    if (c == '\n') {
      
      warning(*this, "ANSI C forbids newline in %s constant",
		ischar ? "character" : "string");
      // But fall through to the end, where we append the \n to the string.
      
    } else if (c == EOF) {
      
      error(*this, "unexpected end of file");
      return opString;
      
    } else if (c == terminator) {
      
      return opString;
      
    } else if (c == '\\') {
      
      c = read();
      switch (c) {
	
       case EOF:
	// Input ended right after the backslash: report it rather than
	// appending the EOF marker to the string as if it were a character.
	error(*this, "unexpected end of file");
	return opString;
	
       case '\n':
	// \(newline) contracts to nothing.
	continue;
	
       case '\'': case '\"': case '\\': case '?':
	break;
	
       case 'a': c = '\a'; break;
       case 'b': c = '\b'; break;
       case 'f': c = '\f'; break;
       case 'n': c = '\n'; break;
       case 'r': c = '\r'; break;
       case 't': c = '\t'; break;
       case 'v': c = '\v'; break;
	
       case 'x':
	 {
	   // Accumulate every following hex digit.  Each iteration must read
	   // a fresh character (otherwise the loop never ends), and the
	   // character that stops the run belongs to the rest of the
	   // literal, so it is pushed back.
	   int i = read();
	   c = 0;
	   for (;; i = read()) {
	     if (i >= '0' && i <= '9')
	       c = c * 16 + i - '0';
	     else if (i >= 'a' && i <= 'f')
	       c = c * 16 + i - 'a' + 10;
	     else if (i >= 'A' && i <= 'F')
	       c = c * 16 + i - 'A' + 10;
	     else
	       break;
	   }
	   unread(i);
	   break;
	 }
       
       case '0': case '1': case '2': case '3':
       case '4': case '5': case '6': case '7':
	 {
	   // Up to three octal digits, the first of which is already in c.
	   int i = c;
	   c = 0;
	   for (int j = 0; j < 3 && i >= '0' && i <= '7'; j++, i = read())
	     c = c * 8 + i - '0';
	   unread(i);
	   break;
	 }
       
       default:
	// The unrecognized character itself is kept.
	warning(*this, "unknown escape sequence");
	break;
	
      }
      
    }
    
    append(c);
  }
}


// Reads an identifier whose first character `c' has already been consumed:
// appends `c' and every following Word-class character to the token text,
// pushes back the first character that does not belong, and returns
// opIdentifier.
Operator
Tokenizer::read_word(int c)
{
  append(c);

  for (c = read(); tis_word(c); c = read())
    append(c);

  unread(c);
  return opIdentifier;
}


// Reads a preprocessing number whose first character `c' has already been
// consumed, appending it to the token text.  Returns opNumber, or 0 when `c'
// is a `.' that is not followed by a digit (the following character is
// pushed back and nothing is appended).
Operator
Tokenizer::read_number(int c)
{
  // A leading dot only starts a number if a digit comes next.
  if (c == '.') {
    const int after_dot = read();
    if (!isdigit(after_dot)) {
      unread(after_dot);
      return 0;
    }
    append('.');
    c = after_dot;
  }

  append(c);

  int ch = read();
  while (tis_number(ch)) {
    append(ch);
    // An exponent marker may be followed directly by a sign.
    if (ch == 'e' || ch == 'E') {
      const int sign = read();
      if (sign == '+' || sign == '-')
	append(sign);
      else
	unread(sign);
    }
    ch = read();
  }
  unread(ch);

  return opNumber;
}


// Reads one punctuation token whose first character `c' has already been
// consumed.  The token's spelling is appended to the token text with the
// unchecked append0(), which is why at least three free bytes are asserted
// up front.  Characters read ahead that turn out not to belong to the token
// are pushed back with unread().
//
// Returns opPunctuation, or 0 if `c' is not a character this function knows
// how to start a token with.
Operator
Tokenizer::read_punct(int c)
{
  int d;
  assert(_slen + 3 < _scap);
  
  append0(c);
  // Below, `$' stands for the input character.
  
  switch (c) {
    
   default:
    // illegal character
    return 0;
    
   case '~':
   case '(':
   case ')':
   case ',':
   case ';':
   case '?':
   case '{':
   case '}':
   case '[':
   case ']':
   case ':':
   case '\n': // \n is made into a Punct when processing #-directives.
    // Always single-character tokens
    break;
    
   case '!':
   case '^':
   case '%':
    // Either `$' or `$='.
    goto maybe_equal;
    
   case '<':
    // Either `<', `<=', `<<', `<<=', or `<->'.
    d = read();
    if (d == '-') {
      int e = read();
      if (e == '>') {
	append0(d);
	append0(e);
      } else {
	// Not `<->': push both lookahead characters back, last read first.
	unread(e);
	unread(d);
      }
      break;
    }
    goto check_relational;
    
   case '>':
    // Either `>', `>=', `>>', or `>>='.
    d = read();
    goto check_relational;
    
   check_relational:
    // Either `$', `$$', `$=', or `$$='
    // (the relational and shift operators)
    if (d == c)
      append0(d);
    else
      unread(d);
    goto maybe_equal;
    
   maybe_equal:
    // Check for an equal sign
    c = read();
    if (c == '=')
      append0(c);
    else
      unread(c);
    break;
    
   case '|':
    // Either `$', `$=', `$$', or `$$$'
    d = read();
    if (d == c) {
      append0(d);
      d = read();
      if (d == c)
	append0(d);
      else
	unread(d);
    } else if (d == '=')
      append0(d);
    else
      unread(d);
    break;
    
   case '&':
   case '+':
   case '*':
    // Either `$', `$=', or `$$'
    d = read();
    if (d == c || d == '=')
      append0(d);
    else
      unread(d);
    break;
    
   case '#':
    // Either `$' or `$$'
    d = read();
    if (d == c)
      append0(d);
    else
      unread(d);
    break;
    
   case '-':
    // Special case: Either `-', `--', `-=', or `->'
    c = read();
    if (c == '-' || c == '=' || c == '>')
      append0(c);
    else
      unread(c);
    break;
    
   case '=':
    // Special case: Either `=', `==', or `==>'
    c = read();
    if (c == '=') {
      append0(c);
      c = read();
      if (c == '>') {
	append0(c);
	break;
      }
    }
    unread(c);
    break;
    
   case '/':
    // Special case: either `/', `//', `/*', or `/='
    c = read();
    if (c == '/' || c == '*' || c == '=')
      append0(c);
    else
      unread(c);
    break;
    
   case '.':
    // Only two forms allowed: `.' and `...'. Two dots are expanded to three.
    c = read();
    if (c == '.') {
      // Append both remaining dots now; the third input dot, if present, is
      // simply consumed below.
      append0(c);
      append0(c);
      c = read();
      if (c != '.')
	// FIXME report error?
	unread(c);
    } else
      unread(c);
    break;
    
  }
  
  // The punctuation token is in s.
  return opPunctuation;
}


// Hook for `#' directives.  Directive handling is currently switched off:
// the function always reports that nothing was handled, so get_token()
// returns the `#' as an ordinary token.  The earlier implementation is kept
// below, commented out, for reference.
bool
Tokenizer::handle_hash()
{
  return false;
  
  //if (_line_non_ws != 1)
  //return 0;
  
  // deal now with `#'.
  // Make \n a special token for end-of-line; it won't be eaten as space.
  //char_class['\n'] = Punct;
  
  // Now read either `# NUMBER ["string"]' or `# line NUMBER ["string"]'.
  
  /*  Token t = get_token();
  IntExpr *ie;
  PermString new_file;
  unsigned new_line;
  
  if (t.is(opIdentifier)) {
    if (t.vstring() != "line")
      goto error;
    t = get_token();
  }
  
  if (!t.is(opExpr))
    goto error;
  
  ie = t.vexpr()->cast_int();
  if (!ie)
    goto error;
  
  new_line = ie->value();
  t = get_token();
  
  // FIXME maybe? move to a string-typed literal?
  if (t.is(opString)) {
    new_file = t.vstring();
    t = get_token();
  }
  
 await_nl:
  while (t.kind() != '\n' && t)
    t = get_token();

  if (_analyze_lines)
    change_future_lines(new_file, new_line);
  
  // return the binding of '\n' to normal.
  char_class['\n'] = Whitespace;
  // reset _line_non_ws to 0, since we just finished a line. If we don't do
  // this, we can't parse two adjacent #line directives.
  _line_non_ws = 0;
  return 1;
  
 error:
  error(*this, "unknown # directive");
  goto await_nl;*/
}


// Converts the preprocessing number held in the token text into a constant
// expression token positioned at `start_landmark'.
//
// Trailing suffix letters (L/l, U/u, F/f) are stripped first; a trailing
// `F' on a hexadecimal number is a digit, not a suffix.  The remaining text
// is parsed as an integer (any base strtol accepts with base 0) or, if it
// has a fraction or exponent or carried an `f' suffix, as a floating value.
Token
Tokenizer::handle_number(const Landmark &start_landmark,
			 unsigned landmark_length)
{
  bool force_int = false;
  bool force_real = false;
  // Only look at _s[1] when the token really has a second character;
  // otherwise it is left over from an earlier token.
  bool force_hex = _slen >= 2 && _s[0] == '0'
    && (_s[1] == 'x' || _s[1] == 'X');
  bool is_unsigned = false;
  bool is_long = false;
  
  // handle suffixes
  // (stop at _slen == 0 so that _s[_slen - 1] never indexes before _s)
  for (; _slen > 0; _slen--)
    switch (_s[_slen - 1]) {
      
     case 'L': case 'l':
      if (is_long || force_real)
	warning(*this, "garbage at end of number");
      is_long = true;
      break;
      
     case 'U': case 'u':
      if (is_unsigned || force_real)
	warning(*this, "garbage at end of number");
      warning(*this, "unsigned not yet supported");
      force_int = is_unsigned = true;
      break;
      
     case 'F': case 'f':
      if (force_hex)
	goto main;
      if (force_int)
	warning(*this, "garbage at end of number");
      force_real = true;
      break;
      
     default:
      goto main;
      
    }
  
 main:
  // append() only guarantees room for the characters themselves, so make
  // sure there is space for the terminating NUL as well.
  if (_slen >= _scap)
    increase_s();
  char *rest;
  _s[_slen] = 0;
  errno = 0;
  
  double val = strtol(_s, &rest, 0);
  const bool int_out_of_range = (errno == ERANGE);
  
  if (force_real || *rest == '.' || *rest == 'e' || *rest == 'E')
    // A real number: the integer parse (and any range error from it) is
    // irrelevant.
    val = strtod(_s, &rest);
  else {
    if (int_out_of_range)
      warning(*this, "integer constant out of range");
    if (*rest)
      error(*this, "floating constant or garbage at end of number");
  }
  
  // FIXME long? type?
  return Token(new ConstExpr(val), start_landmark, landmark_length);
}


// Looks at the token text for a line directive of the form
// `# NUMBER ["file"]' or `# line NUMBER ["file"]' and, if one is found and
// line analysis is enabled, tells the landmark map about the new file name
// and line number.  Text that does not match is silently ignored.
//
// NOTE(review): the scan relies on the token text being NUL-terminated;
// nothing in this function establishes that, so the caller presumably does
// -- confirm.
void
Tokenizer::check_s_for_hash()
{
  char *s = _s;
  
  // The <ctype.h> classifiers require an argument representable as
  // unsigned char (or EOF); a plain `char' holding a byte >= 0x80 may be
  // negative, so every call below converts explicitly.
  while (isspace((unsigned char) *s)) s++;
  if (*s++ != '#') return;
  
  while (isspace((unsigned char) *s)) s++;
  if (s[0] == 'l' && s[1] == 'i' && s[2] == 'n' && s[3] == 'e') s += 4;
  while (isspace((unsigned char) *s)) s++;
  
  if (!isdigit((unsigned char) *s)) return;
  // Don't subtract 1 because we've already read the `\n' which terminates
  // this line; we're effectively, therefore, on the next line.
  PermString new_file;
  unsigned new_line = strtol(s, &s, 10);
  
  // Optional quoted file name.
  while (isspace((unsigned char) *s)) s++;
  if (*s++ == '\"') {
    char *start = s;
    // FIXME backslashes in filenames
    while (*s && *s != '\"') s++;
    new_file = PermString(start, s - start);
  }
  
  if (_analyze_lines)
    change_future_lines(new_file, new_line);
}


// Returns the next token from the input.
//
// Each pass of the outer loop skips whitespace, then tries the literal,
// word, number and punctuation readers in that order (a reader returning 0
// means "not mine", and the next one is tried with the same first
// character).  Identifiers and punctuation are then looked up in the
// reserved-word table.  Comments and illegal characters produce no token;
// the loop simply goes around again.  At end of input a token of kind 0 with
// empty text is returned.
Token
Tokenizer::get_token()
{
  while (1) {
    
    clear();
    int c = read();
    while (tis_whitespace(c))
      c = read();
    _line_non_ws++;
    
    // The first character of the token has already been read, hence the -1.
    Landmark start_landmark = landmark() - 1;
    Operator token_kind = 0;
    if (!token_kind && tis_literal_start(c))
      token_kind = read_literal(c);
    if (!token_kind && tis_word_start(c))
      token_kind = read_word(c);
    if (!token_kind && tis_number_start(c))
      token_kind = read_number(c);
    if (!token_kind && tis_punct(c))
      token_kind = read_punct(c);
    unsigned landmark_length = landmark().cp() - start_landmark.cp();
    
    // No reader accepted the character: either end of input or garbage.
    if (token_kind == 0) {
      if (c == EOF)
	return Token(0, "", start_landmark, landmark_length);
      else
	error(*this, "illegal character `%c'", c);
      continue;
    }
    
    PermString string(_s, _slen);
    
    if (token_kind == opIdentifier || token_kind == opPunctuation) {
      
      // Look up identifiers or punctuations in our reserved words hash table.
      Token *tokp = reserved_words.findp(string);
      
      if (tokp) {
	
	token_kind = tokp->kind();
	
	// Check for assign variants: because `min=' won't actually be parsed
	// as a single token.
	// NOTE(review): landmark_length was computed before this extra `='
	// is read and is not recomputed afterwards -- confirm whether the
	// returned token's length should include it.
	if (tokp->is_operator() && tokp->voperator().assign_variant()) {
	  c = read();
	  if (c == '=') {
	    append(c);
	    string = PermString(_s, _slen);
	    tokp = reserved_words.findp(string);
	    assert(tokp);
	  } else
	    unread(c);
	}
	
	/*if (tokp->kind() == opExpr)
	  return Token(tokp->vexpr()->clone(landmark()));
	  
	  else*/
	// early exit if we've found a real token
	if (!token_kind.tokenizer_internal())
	  return Token(*tokp, start_landmark, landmark_length);
	
      } else if (token_kind == opPunctuation) {
	// If we don't find a punctuation in the table,
	// it was a one-character punctuation, whose value is just that
	// character.
	assert(_slen == 1);
	token_kind = _s[0];
      }
      
    }
    
    switch ((int)token_kind) {
      
     case opEOLComment:
      // Discard everything up to and including the end of the line.
      do {
	c = read();
      } while (c != '\n' && c != EOF);
      break;
      
     case opSlashStarComment:
      // Discard input until a run of `*' is immediately followed by `/',
      // or until end of input.
      do {
	while (c != '*' && c != EOF)
	  c = read();
	while (c == '*')
	  c = read();
      } while (c != EOF && c != '/');
      break;
      
     case opNumber:
      return handle_number(start_landmark, landmark_length);
      
     case '#':
      // If the directive handler did not consume it, `#' is a plain token.
      if (!handle_hash())
	return Token(token_kind, string, start_landmark, landmark_length);
      break;
      
     case opIdentifier:
      return Token(opIdentifier, string, start_landmark, landmark_length);

     default:
      return Token(token_kind, string, start_landmark, landmark_length);
      
    }
    
  }
}


//
// TOKENBUFFER
//

// Creates a null buffer: no storage, no release function, nothing to read.
TokenBuffer::TokenBuffer()
  : _free_buffer(0),
    buffer(0),
    end_buffer(0),
    pos(0),
    end(0)
{
}

// Gives back any storage the buffer is responsible for.
TokenBuffer::~TokenBuffer()
{
  release();
}

// Release function for storage obtained in make_empty(): `cv' is treated as
// an array of unsigned char allocated with new[].
void
TokenBuffer::delete_buffer(void *cv)
{
  delete[] static_cast<unsigned char *>(cv);
}

// Returns the buffer to the null state.  The storage is handed to the
// release function only when both a release function and storage are set.
void
TokenBuffer::release()
{
  if (buffer && _free_buffer) {
    _free_buffer(const_cast<unsigned char *>(buffer));
  }
  buffer = 0;
  end_buffer = 0;
  pos = 0;
  end = 0;
}

// Declares that the existing storage now holds `len' fresh bytes whose first
// byte is at `landmark': reading restarts at the beginning of the storage.
void
TokenBuffer::reset(unsigned len, const Landmark &landmark)
{
  _landmark = landmark;
  pos = buffer;
  end = buffer + len;
}

// Points the buffer at caller-provided data (`len' bytes at `str') after
// releasing any previous storage.  No release function is installed, so
// release() will not free `str'.
void
TokenBuffer::reset(const char *str, unsigned len, const Landmark &landmark)
{
  release();

  _free_buffer = 0;
  buffer = reinterpret_cast<const unsigned char *>(str);
  end_buffer = buffer + len;

  reset(len, landmark);
}

// Replaces the buffer's storage with `len' freshly allocated bytes, released
// later through delete_buffer().  The buffer starts out with nothing to
// read: pos and end both sit at the start of the storage.
void
TokenBuffer::make_empty(unsigned len)
{
  release();

  unsigned char *storage = new unsigned char[len];
  buffer = storage;
  end_buffer = storage + len;
  _free_buffer = delete_buffer;

  end = buffer;
  pos = buffer;
}


// Filler that reads from the stdio stream `f'; `name' is used to build the
// starting landmark.
FileTokenBufferFiller::FileTokenBufferFiller(const char *name, FILE *f)
  : _f(f),
    _landmark(PermString(name))
{
}

// Loads the next chunk of the file into `tb'.  The buffer's storage is
// allocated (4096 bytes) on first use.  Once the stream has reached end of
// file or reported a read error, `tb' is released instead, which makes it
// null.
void
FileTokenBufferFiller::fill_token_buffer(TokenBuffer *tb)
{
  // A stream in the error state keeps returning zero bytes without ever
  // setting the EOF flag, so it has to be checked as well; otherwise callers
  // that retry on an empty buffer would never see the input end.
  if (feof(_f) || ferror(_f)) {
    tb->release();
    return;
  }
  
  if (tb->null())
    tb->make_empty(4096);
  unsigned char *buffer = const_cast<unsigned char *>(tb->buffer);
  
  unsigned l = fread(buffer, 1, tb->buffer_size(), _f);
  tb->reset(l, _landmark);
  // The next chunk starts where this one ended.
  _landmark += l;
}


// Filler that serves the NUL-terminated string `data' as a single chunk;
// `name' is used to build the landmark.
StringTokenBufferFiller::StringTokenBufferFiller(const char *name,
						 const char *data)
  : _data(data),
    _landmark(PermString(name)),
    _done(false)
{
}

// The first call points `tb' at the whole string; every later call releases
// `tb'.
void
StringTokenBufferFiller::fill_token_buffer(TokenBuffer *tb)
{
  if (_done) {
    tb->release();
    return;
  }

  tb->reset(_data, strlen(_data), _landmark);
  _done = true;
}