#ifdef HAVE_CONFIG_H # include #endif #include "token.hh" #include "hashmap.hh" #include "error.hh" #include "operator.hh" #include "expr.hh" #include #include #include #include Token::Token(const Token &token, const Landmark &landmark, unsigned landmark_length) : _kind(token._kind), _cat(token._cat), _text(token._text), _landmark(landmark), _landmark_length(landmark_length) { _v = token._v; } Token::Token(Operator kind, PermString string, const Landmark &landmark, unsigned landmark_length) : _kind(kind), _cat(catString), _text(string), _landmark(landmark), _landmark_length(landmark_length) { _v.string_capsule = string.capsule(); } Token::Token(Logic *e, const Landmark &landmark, unsigned landmark_length) : _kind(opExpr), _cat(catExpr), _text(""), _landmark(landmark), _landmark_length(landmark_length) { _v.expr = e; } Token::Token(Operator kind, PermString string) : _kind(kind), _cat(catString), _text(string) { _v.string_capsule = string.capsule(); } Token::Token(Operator kind, PermString name, OperatorGroup *oper) : _kind(kind), _cat(catOperator), _text(name) { _v.oper = oper; } void Token::print() const { // errwriter << _landmark << _kind << " `" << _text << "'\n"; String s = _landmark; fprintf(stderr, "%s: %d `%s'\n", s.cc(), (int)_kind, _text.cc()); } bool Token::starts_expr(bool no_infix) const { if (_kind == opIdentifier || _kind == opExpr) return true; else if (_cat == catOperator) { OperatorGroup *og = _v.oper; // No infix operator? Then it starts an expression. // OR: Has prefix operator? Then it starts an expression. if (no_infix ? !og->find(false) : (bool)og->find(true)) return true; } return false; } /////////////////////////////////////////////////////////////////////////////// static HashMap reserved_words; int *Tokenizer::char_class; int Tokenizer::char_class_storage[257]; void Tokenizer::add_operator(Operator op, PermString name, int precedence, int flags, Operator terminator) { Token *tp = reserved_words.findp(name); if (!tp) { reserved_words.insert(name, Token(op, name, new OperatorGroup)); tp = reserved_words.findp(name); } tp->vopgroup()->add(op); op.set_data(name, precedence, flags, terminator); } // Depends on EOF == -1. void Tokenizer::static_initialize() { if (!reserved_words.empty()) return; #define ADDKW(name, op) { \ Token t(op, name); \ Operator(op).set_data(name); \ reserved_words.insert(name, t); \ } #define ADDEXPR(name, expr) { \ Token t(expr); \ reserved_words.insert(name, t); \ } #define ADDOPTERM(name, op, prec, flags, term) \ add_operator(op, name, prec, flags, term) #define ADDOP1(name, op, prec, flags) \ ADDOPTERM(name, op, prec, flags, opNone) #define ADDOP(name, op, prec) \ ADDOPTERM(name, op, prec, 0, opNone) unsigned prefix = Operator::fPrefix; unsigned unary = Operator::fUnary; unsigned right = Operator::fRightAssoc; unsigned functionish = Operator::fFunctionish; ADDOPTERM("(", '(', 32, prefix|unary, ')'); ADDOP1("flip", opFlip, 32, prefix|unary|functionish); ADDOP1("pin", opPin, 32, prefix|unary|functionish); ADDOP("**", opPower, 29); ADDOP1("+", opCopy, 28, prefix|unary); ADDOP1("-", opNegate, 28, prefix|unary); ADDOP1("!", '!', 28, prefix|unary); ADDOP("*", '*', 25); ADDOP("/", '/', 25); ADDOP("+", '+', 24); ADDOP("-", '-', 24); ADDOP("<", opLt, 22); ADDOP("<=", opLe, 22); ADDOP(">", opGt, 22); ADDOP(">=", opGe, 22); ADDOP("==", opEq, 20); ADDOP("!=", opNe, 20); ADDOP("&&", opLogAnd, 16); ADDOP("||", opLogOr, 15); ADDOPTERM("?", '?', 11, right, ':'); ADDOP1("=", opAssign, 10, right); ADDOP1("+=", opAddAssign, 10, right); ADDOP1("-=", opSubAssign, 10, right); ADDOP1("*=", opMulAssign, 10, right); ADDOP1("/=", opDivAssign, 10, right); ADDOP1("**=", opPowerAssign, 10, right); ADDOP(",", ',', 3); ADDKW("//", opEOLComment); ADDKW("/*", opSlashStarComment); ADDKW("let", opLet); ADDKW("in", opIn); ADDKW("end", opEnd); char_class = char_class_storage + 1; for (int i = -1; i < 256; i++) char_class[i] = 0; char *s; // Don't need to make s `unsigned char *' because it will only have // lower-ASCII characters in it! for (s = " \t\r\n\f\v"; *s; s++) char_class[*s] |= Whitespace; for (s = "L\'\""; *s; s++) char_class[*s] |= Literalstart; for (s = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_$"; *s; s++) char_class[*s] |= Wordstart | Word; for (s = "0123456789"; *s; s++) char_class[*s] |= Word; for (s = "0123456789."; *s; s++) char_class[*s] |= Numberstart | Number; for (s = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; *s; s++) char_class[*s] |= Number; for (s = "!#%&()*+,-./:;<=>?@[\\]^`{|}~"; *s; s++) char_class[*s] |= Punct; // Make sure opNone works out OK. assert(opNone == 0 && char_class[opNone] == 0); } inline bool Tokenizer::tis_whitespace(int c) { return (char_class[c] & Whitespace) != 0; } inline bool Tokenizer::tis_word_start(int c) { return (char_class[c] & Wordstart) != 0; } inline bool Tokenizer::tis_word(int c) { return (char_class[c] & Word) != 0; } inline bool Tokenizer::tis_number_start(int c) { return (char_class[c] & Numberstart) != 0; } inline bool Tokenizer::tis_number(int c) { return (char_class[c] & Number) != 0; } inline bool Tokenizer::tis_literal_start(int c) { return (char_class[c] & Literalstart) != 0; } inline bool Tokenizer::tis_punct(int c) { return (char_class[c] & Punct) != 0; } ///////////////////// Tokenizer::Tokenizer(TokenBufferFiller *tb_filler) : _tb(&_real_tb), _tb_filler(tb_filler), _s(new char[256]), _slen(0), _scap(256), _column(0), _line_non_ws(0) { static_initialize(); _unread_tb.make_empty(UNREAD_TB_SIZE); _unread_tb.pos = _unread_tb.end; _analyze_lines = _tb_filler->analyze_lines(); _tb_filler->fill_token_buffer(_tb); if (_analyze_lines) analyze_lines(_tb); } Tokenizer::~Tokenizer() { delete[] _s; } void Tokenizer::analyze_lines(TokenBuffer *tb) { const unsigned char *init = tb->pos; Landmark lm = landmark(); LandmarkMap *map = lm.landmark_map(); unsigned cp_pos = lm.cp(); unsigned cp_len = lm.cp() + tb->length(); for (const unsigned char *u = init; cp_pos < cp_len; u++, cp_pos++) if (*u == '\n') map->finish_line(cp_pos + 1); else if (*u == '\t') map->mark_tab(cp_pos); } void Tokenizer::change_future_lines(PermString file, unsigned line) { Landmark lm = landmark(); LandmarkMap *map = lm.landmark_map(); map->change_lines(lm.cp(), file, line); } int Tokenizer::fill_tb() { if (_tb == &_unread_tb) { _tb = &_real_tb; return read(); } else if (_tb->null()) return -1; else { _tb_filler->fill_token_buffer(_tb); if (_analyze_lines) analyze_lines(_tb); return read(); } } inline int Tokenizer::read() { if (_tb->empty()) return fill_tb(); else { int c = *_tb->pos; _tb->pos++; if (c == '\n') _column = _line_non_ws = 0; else _column++; return c; } } void Tokenizer::unread(int c) { if (c < 0) return; if (_tb->pos > _tb->buffer) { _tb->pos--; assert(*_tb->pos == c); } else { if (_tb != &_unread_tb) { _unread_tb._landmark = landmark(); _tb = &_unread_tb; } assert(_tb->pos + UNREAD_TB_SIZE > _tb->end); --_tb->pos; --_tb->_landmark; unsigned char *pos = (unsigned char *)_tb->pos; *pos = c; } } void Tokenizer::increase_s() { char *new_s = new char[_scap * 2]; memcpy(new_s, _s, _slen); delete[] _s; _s = new_s; _scap *= 2; } inline void Tokenizer::append0(int c) { _s[_slen++] = c; } inline void Tokenizer::append(int c) { if (_slen >= _scap) increase_s(); _s[_slen++] = c; } inline void Tokenizer::clear() { _slen = 0; } Operator Tokenizer::read_literal(int c) { // Accept, but ignore, an "L" denoting a long string constant. if (c == 'L') { c = read(); if (c != '\'' && c != '\"') { unread(c); return 0; } } int terminator = c; int ischar = (c == '\''); while (1) { c = read(); if (c == '\n') { warning(*this, "ANSI C forbids newline in %s constant", ischar ? "character" : "string"); // But fall through to the end, where we append the \n to the string. } else if (c == EOF) { error(*this, "unexpected end of file"); return opString; } else if (c == terminator) { return opString; } else if (c == '\\') { c = read(); switch (c) { case '\n': // \(newline) contracts to nothing. continue; case '\'': case '\"': case '\\': case '?': break; case 'a': c = '\a'; break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; case 'x': { int i = read(); c = 0; while (1) { if (i >= '0' && i <= '9') c = c * 16 + i - '0'; else if (i >= 'a' && i <= 'f') c = c * 16 + i - 'a' + 10; else if (i >= 'A' && i <= 'F') c = c * 16 + i - 'A' + 10; else break; } break; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { int i = c; c = 0; for (int j = 0; j < 3 && i >= '0' && i <= '7'; j++, i = read()) c = c * 8 + i - '0'; unread(i); break; } default: warning(*this, "unknown escape sequence"); break; } } append(c); } } Operator Tokenizer::read_word(int c) { append(c); while (1) { c = read(); if (tis_word(c)) { append(c); } else { unread(c); return opIdentifier; } } } Operator Tokenizer::read_number(int c) { // Returns a preprocessing number. if (c == '.') { c = read(); if (!isdigit(c)) { unread(c); return 0; } else append('.'); } append(c); for (c = read(); tis_number(c); c = read()) { append(c); if (c == 'e' || c == 'E') { c = read(); if (c == '+' || c == '-') append(c); else unread(c); } } unread(c); return opNumber; } Operator Tokenizer::read_punct(int c) { int d; assert(_slen + 3 < _scap); append0(c); // Below, `$' stands for the input character. switch (c) { default: // illegal character return 0; case '~': case '(': case ')': case ',': case ';': case '?': case '{': case '}': case '[': case ']': case ':': case '\n': // \n is made into a Punct when processing #-directives. // Always single-character tokens break; case '!': case '^': case '%': // Either `$' or `$='. goto maybe_equal; case '<': // Either `<', `<=', `<<', `<<=', or `<->'. d = read(); if (d == '-') { int e = read(); if (e == '>') { append0(d); append0(e); } else { unread(e); unread(d); } break; } goto check_relational; case '>': // Either `>', `>=', `>>', or `>>='. d = read(); goto check_relational; check_relational: // Either `$', `$$', `$=', or `$$=' // (the relational and shift operators) if (d == c) append0(d); else unread(d); goto maybe_equal; maybe_equal: // Check for an equal sign c = read(); if (c == '=') append0(c); else unread(c); break; case '|': // Either `$', `$=', `$$', or `$$$' d = read(); if (d == c) { append0(d); d = read(); if (d == c) append0(d); else unread(d); } else if (d == '=') append0(d); else unread(d); break; case '&': case '+': case '*': // Either `$', `$=', or `$$' d = read(); if (d == c || d == '=') append0(d); else unread(d); break; case '#': // Either `$' or `$$' d = read(); if (d == c) append0(d); else unread(d); break; case '-': // Special case: Either `-', `--', `-=', or `->' c = read(); if (c == '-' || c == '=' || c == '>') append0(c); else unread(c); break; case '=': // Special case: Either `=', `==', or `==>' c = read(); if (c == '=') { append0(c); c = read(); if (c == '>') { append0(c); break; } } unread(c); break; case '/': // Special case: either `/', `//', `/*', or `/=' c = read(); if (c == '/' || c == '*' || c == '=') append0(c); else unread(c); break; case '.': // Only two forms allowed: `.' and `...'. Two dots are expanded to three. c = read(); if (c == '.') { append0(c); append0(c); c = read(); if (c != '.') // FIXME report error? unread(c); } else unread(c); break; } // The punctuation token is in s. return opPunctuation; } bool Tokenizer::handle_hash() { return 0; //if (_line_non_ws != 1) //return 0; // deal now with `#'. // Make \n a special token for end-of-line; it won't be eaten as space. //char_class['\n'] = Punct; // Now read either `# NUMBER ["string"]' or `# line NUMBER ["string"]'. /* Token t = get_token(); IntExpr *ie; PermString new_file; unsigned new_line; if (t.is(opIdentifier)) { if (t.vstring() != "line") goto error; t = get_token(); } if (!t.is(opExpr)) goto error; ie = t.vexpr()->cast_int(); if (!ie) goto error; new_line = ie->value(); t = get_token(); // FIXME maybe? move to a string-typed literal? if (t.is(opString)) { new_file = t.vstring(); t = get_token(); } await_nl: while (t.kind() != '\n' && t) t = get_token(); if (_analyze_lines) change_future_lines(new_file, new_line); // return the binding of '\n' to normal. char_class['\n'] = Whitespace; // reset _line_non_ws to 0, since we just finished a line. If we don't do // this, we can't parse two adjacent #line directives. _line_non_ws = 0; return 1; error: error(*this, "unknown # directive"); goto await_nl;*/ } Token Tokenizer::handle_number(const Landmark &start_landmark, unsigned landmark_length) { bool force_int = false; bool force_real = false; bool force_hex = _s[0] == '0' && (_s[1] == 'x' || _s[1] == 'X'); bool is_unsigned = false; bool is_long = false; bool is_float = false; // handle suffixes for (; _slen >= 0; _slen--) switch (_s[_slen - 1]) { case 'L': case 'l': if (is_long || force_real) warning(*this, "garbage at end of number"); is_long = true; break; case 'U': case 'u': if (is_unsigned || force_real) warning(*this, "garbage at end of number"); warning(*this, "unsigned not yet supported"); force_int = is_unsigned = true; break; case 'F': case 'f': if (force_hex) goto main; if (force_int) warning(*this, "garbage at end of number"); force_real = is_float = true; break; default: goto main; } main: char *rest; _s[_slen] = 0; errno = 0; double val = strtol(_s, &rest, 0); if (errno == ERANGE) warning(*this, "integer constant out of range"); if (force_real || *rest == '.' || *rest == 'e' || *rest == 'E') val = strtod(_s, &rest); else if (*rest) error(*this, "floating constant or garbage at end of number"); // FIXME long? type? return Token(new ConstExpr(val), start_landmark, landmark_length); } void Tokenizer::check_s_for_hash() { char *s = _s; while (isspace(*s)) s++; if (*s++ != '#') return; while (isspace(*s)) s++; if (s[0] == 'l' && s[1] == 'i' && s[2] == 'n' && s[3] == 'e') s += 4; while (isspace(*s)) s++; if (!isdigit(*s)) return; // Don't subtract 1 because we've already read the `\n' which terminates // this line; we're effectively, therefore, on the next line. PermString new_file; unsigned new_line = strtol(s, &s, 10); while (isspace(*s)) s++; if (*s++ == '\"') { char *start = s; // FIXME backslashes in filenames while (*s && *s != '\"') s++; new_file = PermString(start, s - start); } if (_analyze_lines) change_future_lines(new_file, new_line); } Token Tokenizer::get_token() { while (1) { clear(); int c = read(); while (tis_whitespace(c)) c = read(); _line_non_ws++; Landmark start_landmark = landmark() - 1; Operator token_kind = 0; if (!token_kind && tis_literal_start(c)) token_kind = read_literal(c); if (!token_kind && tis_word_start(c)) token_kind = read_word(c); if (!token_kind && tis_number_start(c)) token_kind = read_number(c); if (!token_kind && tis_punct(c)) token_kind = read_punct(c); unsigned landmark_length = landmark().cp() - start_landmark.cp(); if (token_kind == 0) { if (c == EOF) return Token(0, "", start_landmark, landmark_length); else error(*this, "illegal character `%c'", c); continue; } PermString string(_s, _slen); if (token_kind == opIdentifier || token_kind == opPunctuation) { // Look up identifiers or punctuations in our reserved words hash table. Token *tokp = reserved_words.findp(string); if (tokp) { token_kind = tokp->kind(); // Check for assign variants: because `min=' won't actually be parsed // as a single token. if (tokp->is_operator() && tokp->voperator().assign_variant()) { c = read(); if (c == '=') { append(c); string = PermString(_s, _slen); tokp = reserved_words.findp(string); assert(tokp); } else unread(c); } /*if (tokp->kind() == opExpr) return Token(tokp->vexpr()->clone(landmark())); else*/ // early exit if we've found a real token if (!token_kind.tokenizer_internal()) return Token(*tokp, start_landmark, landmark_length); } else if (token_kind == opPunctuation) { // If we don't find a punctuation in the table, // it was a one-character punctuation, whose value is just that // character. assert(_slen == 1); token_kind = _s[0]; } } switch ((int)token_kind) { case opEOLComment: do { c = read(); } while (c != '\n' && c != EOF); break; case opSlashStarComment: do { while (c != '*' && c != EOF) c = read(); while (c == '*') c = read(); } while (c != EOF && c != '/'); break; case opNumber: return handle_number(start_landmark, landmark_length); case '#': if (!handle_hash()) return Token(token_kind, string, start_landmark, landmark_length); break; case opIdentifier: return Token(opIdentifier, string, start_landmark, landmark_length); default: return Token(token_kind, string, start_landmark, landmark_length); } } } // // TOKENBUFFER // TokenBuffer::TokenBuffer() : _free_buffer(0), buffer(0), end_buffer(0), pos(0), end(0) { } TokenBuffer::~TokenBuffer() { release(); } void TokenBuffer::delete_buffer(void *cv) { unsigned char *cuc = static_cast(cv); delete[] cuc; } void TokenBuffer::release() { if (_free_buffer && buffer) (*_free_buffer)(const_cast(buffer)); buffer = end_buffer = 0; pos = end = 0; } void TokenBuffer::reset(unsigned len, const Landmark &landmark) { pos = buffer; end = buffer + len; _landmark = landmark; } void TokenBuffer::reset(const char *str, unsigned len, const Landmark &landmark) { release(); buffer = reinterpret_cast(str); end_buffer = buffer + len; _free_buffer = 0; reset(len, landmark); } void TokenBuffer::make_empty(unsigned len) { release(); unsigned char *str = new unsigned char[len]; _free_buffer = delete_buffer; buffer = str; end_buffer = str + len; pos = end = buffer; } FileTokenBufferFiller::FileTokenBufferFiller(const char *name, FILE *f) : _f(f), _landmark(PermString(name)) { } void FileTokenBufferFiller::fill_token_buffer(TokenBuffer *tb) { if (feof(_f)) { tb->release(); return; } if (tb->null()) tb->make_empty(4096); unsigned char *buffer = const_cast(tb->buffer); unsigned l = fread(buffer, 1, tb->buffer_size(), _f); tb->reset(l, _landmark); _landmark += l; } StringTokenBufferFiller::StringTokenBufferFiller(const char *name, const char *data) : _data(data), _landmark(PermString(name)), _done(false) { } void StringTokenBufferFiller::fill_token_buffer(TokenBuffer *tb) { if (_done) tb->release(); else { tb->reset(_data, strlen(_data), _landmark); _done = true; } }