// lexer.cc            see license.txt for copyright and terms of use
// code for lexer.h
//
// NOTE(review): this copy of the file appears to have been damaged by a
// markup stripper: in several places the text from a '<' up to the next
// '>' is missing.  Each affected spot is marked with NOTE(review) below
// and left exactly as found; restore it from a pristine copy before
// attempting to build.

#include "lexer.h"       // this module
#include "LangOptions.h" // LangOptions
// NOTE(review): the header names of the next two includes are missing in
// this copy (presumably <ctype.h> and <stdlib.h>, given the trailing
// comments) -- confirm and restore.
#include                 // isdigit
#include                 // atoi
using namespace sm;

/*
 * Note about nonseparating tokens and the 'checkForNonsep' function:
 *
 * To diagnose and report erroneous syntax like "0x5g", which would
 * naively be parsed as "0x5" and "g" (two legal tokens), I divide
 * all tokens into two classes: separating and nonseparating.
 *
 * Separating tokens are allowed to be adjacent to each other and
 * to nonseparating tokens.  An example is "(".
 *
 * Nonseparating tokens are not allowed to be adjacent to each other.
 * They must be separated by either whitespace, or at least one
 * separating token.  The nonseparating tokens are identifiers,
 * alphabetic keywords, and literals.  The lexer would of course never
 * yield two adjacent keywords, due to maximal munch, but classifying
 * such an event as an error is harmless.
 *
 * By keeping track of whether the last token yielded is separating or
 * not, we'll see (e.g.) "0x5g" as two consecutive nonseparating tokens,
 * and can report that as an error.
 *
 * The C++ standard is rather vague on this point as far as I can
 * tell.  I haven't checked the C standard.  In the C++ standard,
 * section 2.6 paragraph 1 states:
 *
 *  "There are five kinds of tokens: identifiers, keywords, literals,
 *   operators, and other separators.  Blanks, horizontal and
 *   vertical tabs, newlines, formfeeds, and comments (collectively,
 *   "whitespace"), as described below, are ignored except as they
 *   serve to separate tokens.  [Note: Some white space is required
 *   to separate otherwise adjacent identifiers, keywords, numeric
 *   literals, and alternative tokens containing alphabetic
 *   characters.]"
 *
 * The fact that the restriction is stated only in a parenthetical note
 * is of course nonideal.  I think the qualifier "numeric" on "literals"
 * is a mistake, otherwise "a'b'" would be a legal token sequence.  I
 * do not currently implement the "alternative tokens".
 *
 * Update: Mozilla includes things like "foo""bar", i.e. directly
 * adjacent string literals.  Therefore I'm going to interpret (the
 * note in) the standard literally, and take char and string literals
 * to be separating.
 */


// -------------------- TokenType ---------------------
// these aren't emitted into cc_tokens.cc because doing so would
// make that output dependent on smbase/xassert.h

// Return the entry of 'tokenNameTable' for 'type'.  Asserts that the
// table has exactly NUM_TOKEN_TYPES entries and that 'type' is in
// range; the casts to unsigned make a negative 'type' fail the range
// check as well.
char const *toString(TokenType type)
{
  xassert(NUM_TOKEN_TYPES == tokenNameTableSize);
  xassert((unsigned)type < (unsigned)NUM_TOKEN_TYPES);
  return tokenNameTable[type];
}

// Return the entry of 'tokenFlagTable' for 'type', cast to TokenFlag.
// Asserts that 'type' is in range (same unsigned comparison as above).
TokenFlag tokenFlags(TokenType type)
{
  xassert((unsigned)type < (unsigned)NUM_TOKEN_TYPES);
  return (TokenFlag)tokenFlagTable[type];
}


// ------------------------ OLexer -------------------
// Construct a lexer from 's', 'LO' and the file name 'fname', all of
// which are forwarded to BaseLexer or stored.  'fname' is also
// interned in 's' as the initial "previous #line file".  The
// constructor finishes by fetching the first token.
OLexer::OLexer(StringTable &s, const ellcc::LangOptions& LO, char const *fname)
  : BaseLexer(s, fname),

    prevIsNonsep(false),
    prevHashLineFile(s.add(fname)),
    currentMacro(NULL),

    LO(LO)
{
  // prime this lexer with the first token
  getTokenFunc()(this);
}


// Construct a lexer from an in-memory buffer 'buf' of 'len' bytes with
// initial location 'initLoc'.  The initial "previous #line file" is the
// file that sourceLocManager reports for 'initLoc'.  Unlike the
// constructor above, no token is fetched here.
OLexer::OLexer(StringTable &s, const ellcc::LangOptions& LO, SourceLocation initLoc,
               char const *buf, int len)
  : BaseLexer(s, initLoc, buf, len),

    prevIsNonsep(false),
    prevHashLineFile(s.add(sourceLocManager->getFile(initLoc))),
    currentMacro(NULL),

    LO(LO)
{
  // do *not* prime the lexer; I think it is a mistake above, but
  // am leaving it for now
}


// Nothing to release at this level.
OLexer::~OLexer()
{}


// Run the base-class whitespace handling, then clear the
// "previous token was nonseparating" flag.
void OLexer::whitespace()
{
  BaseLexer::whitespace();

  // various forms of whitespace can separate nonseparating tokens
  prevIsNonsep = false;
}


// this, and 'svalTok', are out of line because I don't want the
// yylex() function to be enormous; I want that to just have a bunch
// of calls into these routines, which themselves can then have
// plenty of things inlined into them
//
// Yield token 't': runs the nonseparating-adjacency check, updates the
// location, clears 'sval', and returns 't'.
int OLexer::tok(TokenType t)
{
  checkForNonsep(t);
  updLoc();
  sval = NULL_SVAL;     // catch mistaken uses of 'sval' for single-spelling tokens
  return t;
}


// Like 'tok', but 'sval' is set to the result of
// addString(yytext, yyleng), i.e. the current lexeme text.
int OLexer::svalTok(TokenType t)
{
  checkForNonsep(t);
  updLoc();
  sval = (SemanticValue)addString(yytext, yyleng);
  return t;
}


// Yield 't' as a keyword when LO.CPlusPlus is set; otherwise yield
// TOK_NAME carrying the lexeme text.
int OLexer::alternateKeyword_tok(TokenType t)
{
  if (LO.CPlusPlus) {
    return tok(t);
  }
  else {
    // in C mode, they are just identifiers
    return svalTok(TOK_NAME);
  }
}


// examples of recognized forms
//   #line 4 "foo.cc"       // canonical form
//   # 4 "foo.cc"           // "line" can be omitted
//   # 4 "foo.cc" 1         // extra stuff is ignored
//   # 4                    // omitted filename means "same as previous"
//
// Parse the directive text 'directive' ('len' characters) and record
// the line/file mapping in 'srcFile' via addHashLine.  Malformed input
// is reported through pp_err and otherwise ignored.
void OLexer::parseHashLine(char *directive, int len)
{
  // one past the last character of the directive; not referenced in
  // the text that survives below (see the NOTE(review) further down)
  char *endp = directive+len;

  directive++;        // skip "#"
  if (*directive == 'l') {
    // NOTE(review): only the 'l' is checked before four characters are
    // skipped; presumably the lexer rule that calls this guarantees
    // the word is "line" -- confirm.
    directive += 4;   // skip "line"
  }

  // skip whitespace
  while (*directive==' ' || *directive=='\t') {
    directive++;
  }

  // parse the line number
  if (!isdigit(*directive)) {
    pp_err("malformed #line directive line number");
    return;
  }
  int lineNum = atoi(directive);

  // skip digits and whitespace
  while (isdigit(*directive)) {
    directive++;
  }
  while (*directive==' ' || *directive=='\t') {
    directive++;
  }

  if (*directive == '\n') {
    // no filename: use previous
    srcFile->addHashLine(curLine, lineNum, prevHashLineFile);
    return;
  }

  if (*directive != '\"') {
    pp_err("#line directive missing leading quote on filename");
    return;
  }
  directive++;

  // look for trailing quote
  char *q = directive;
  // NOTE(review): the next statement is damaged in this copy.  The text
  // between "while (q" and "addHashLine(" is missing; presumably it
  // held the scan for the closing quote bounded by 'endp', the
  // missing-trailing-quote error, the creation of 'fname', and the
  // "srcFile->" receiver of the addHashLine call -- restore from a
  // pristine copy.
  while (qaddHashLine(curLine, lineNum, fname);

  // remember the filename for future #line directives that
  // don't explicitly include one
  prevHashLineFile = fname;
}


// preprocessing error: report the location information in the
// preprocessed source, ignoring #line information
//
// Increments 'errors' and writes "<file>:<line>: error: <msg>" to
// std::cerr.
void OLexer::pp_err(char const *msg)
{
  // print only line information, and subtract one because I account
  // for whitespace (including the final newline) before processing it
  errors++;
  std::cerr << srcFile->name << ":" << (curLine-1) << ": error: " << msg << std::endl;
}


// Token-fetch entry point: stores the result of yylex() in 'type'.
STATICDEF void OLexer::tokenFunc(LexerInterface *lex)
{
  // NOTE(review): the template argument of this cast is missing in
  // this copy (presumably static_cast<OLexer*>) -- restore.
  OLexer *ths = static_cast(lex);

  // call into the flex lexer; this updates 'loc' and sets
  // 'sval' as appropriate
  ths->type = ths->yylex();
}


// Token-fetch entry point used when C++ keywords are not recognized:
// any token flagged TF_CPLUSPLUS is turned into a TOK_NAME whose
// 'sval' is the token's name from toString(), interned in 'strtable'.
STATICDEF void OLexer::c_tokenFunc(LexerInterface *lex)
{
  // as above
  // NOTE(review): template argument missing here too (presumably
  // static_cast<OLexer*>) -- restore.
  OLexer *ths = static_cast(lex);
  ths->type = ths->yylex();

  // map C++ keywords into identifiers
  TokenType tt = (TokenType)(ths->type);
  if (tokenFlags(tt) & TF_CPLUSPLUS) {
    // create the lexeme corresponding to the token's spelling
    StringRef str = ths->strtable.add(toString(tt));

    // set the LexerInterface fields to yield the new token
    ths->type = TOK_NAME;
    ths->sval = (SemanticValue)str;
  }
}


// Select the token-fetch function according to
// LO.recognizeCppKeywords.
OLexer::NextTokenFunc OLexer::getTokenFunc() const
{
  if (LO.recognizeCppKeywords) {
    // expected case, yield the normal tokenizer
    return &OLexer::tokenFunc;
  }

  else {
    // yield the tokenizer that maps C++ keywords into identifiers
    // (TOK_NAME), see c_tokenFunc
    return &OLexer::c_tokenFunc;
  }
}


// Describe the current token: "<kind>: <spelling>" for tokens flagged
// TF_MULTISPELL, otherwise just the kind name.
string OLexer::tokenDesc() const
{
  if (tokenFlags((TokenType)type) & TF_MULTISPELL) {
    // for tokens with multiple spellings, decode 'sval' as a
    // StringRef
    //return string((StringRef)sval);
    return stringc << toString((TokenType)type) << ": " << (StringRef)sval;
  }
  else {
    // for all others, consult the static table
    return string(toString((TokenType)type));
  }
}


// Describe a token kind by name only.
string OLexer::tokenKindDesc(int kind) const
{
  // static table only
  return toString((TokenType)kind);
}

// Describe a token kind as "<name> (<numeric kind>)".
string OLexer::tokenKindDescV(int kind) const
{
  stringBuilder s;
  s << toString((TokenType)kind)
    << " (" << kind << ")";
  return s;
}

// parse line:col expressions
//
// Reads a decimal line number from 'str', skips one character, reads a
// decimal column, and returns the location that sourceLocManager
// encodes for (file, line, col).  If 'endptr' is non-null it receives
// the position just past the column.
//
// If the line number parses as 0 (which is also what strtol yields when
// there are no digits), SL_UNKNOWN is returned and '*endptr' is left
// untouched.
static SourceLocation str2loc(char *str, char **endptr, char const * file) {
  int line = strtol(str, &str, 10);
  if (!line) return SL_UNKNOWN;
  // NOTE(review): skips the separator without looking at it; presumably
  // always ':' per the "line:col" format -- a string ending right after
  // the line number would be stepped past here.
  str++;
  int col = strtol(str, &str, 10);
  if (endptr) *endptr = str;
  return sourceLocManager->encodeLineCol(file, line, col);
}

// comment of form /*
// NOTE(review): the rest of the comment above, the function signature
// and the opening statements of this function are missing in this
// copy; only the tail "useHashLines) return;" of what was presumably
// an "if (!sourceLocManager->useHashLines) return;" guard survives.
// The body below uses 'comment' and 'len', and a later comment in this
// file refers to "macroUndoStart", so this is presumably
// OLexer::macroUndoStart(char *comment, int len) -- restore from a
// pristine copy.
//
// Visible behaviour of the body: build a MacroUndoEntry for the tag
// text in 'comment', append it to 'macroUndoLog', and make it the
// current macro.
useHashLines) return;
  StringRef name(NULL);
  SourceLocation preStartLoc(SL_UNKNOWN);
  SourceLocation preEndLoc(SL_UNKNOWN);
  bool isParam = false;
  if (char *spc = strchr(comment, ' ')) {
    // "name start end": the name is the text before the first space,
    // followed by two positions parsed by str2loc
    char *in = spc + 1;
    name = addString(comment, (int)(spc - comment));
    preStartLoc = str2loc(in, &in, prevHashLineFile);
    // 'in + 1' steps over the character following the first position;
    // if the first str2loc call returned early, 'in' was not advanced
    preEndLoc = str2loc(in + 1, NULL, prevHashLineFile);
  } else {
    // no positions in the tag: the whole text is the name
    name = addString(comment, len);
    isParam = strchr(name, ':') != NULL;
    if (isParam) {
      //this is a macro parameter
      MacroUndoEntry *parent = currentMacro;
      // get the parent that this macro param is defined in
      // (walk outwards until an entry's name is a prefix of 'name')
      // NOTE(review): there is no null check on 'parent'; if no
      // enclosing entry matches, or 'currentMacro' is NULL, this
      // dereferences a null pointer -- presumably the tag stream
      // guarantees a match; confirm.
      for (; strncmp(parent->name, name, strlen(parent->name)); parent = parent->parent) {
      }
      // NOTE(review): the template argument of TailListIterNC is
      // missing in this copy (presumably <MacroDefinition>, given
      // 'it.data()' below) -- restore.
      for (TailListIterNC it(parent->params); !it.isDone(); it.adv()) {
        MacroDefinition *def = it.data();
        // names are compared as StringRef values, not with strcmp
        if (def->name != name) continue;
        // no break: the last matching definition wins
        preStartLoc = def->fromLoc;
        preEndLoc = def->toLoc;
      }
    }
  }
  MacroUndoEntry *current = new MacroUndoEntry(nextLoc, preStartLoc, preEndLoc, name, currentMacro);
  // add top-level and nested params to the list
  //if (!currentMacro || isParam) {
  macroUndoLog.append(current);
  //}
  currentMacro = current;
}

// m is only returned if it has a position
//
// Parse a "name from to" macro definition tag.  When the text contains
// a space and 'm' is non-null, '*m' receives a newly allocated
// MacroDefinition for the name and the two parsed locations; otherwise
// '*m' is not written.  'm' may be null: one call below passes only
// two arguments, so a default is presumably declared in lexer.h.
void OLexer::addMacroDefinition(char *macro, int len, MacroDefinition **m) {
  SourceLocation fromLoc = SL_UNKNOWN;
  SourceLocation toLoc = SL_UNKNOWN;
  char *spc = strchr(macro, ' ');
  if (spc) {
    char *in = spc + 1;
    fromLoc = str2loc(in, &in, prevHashLineFile);
    toLoc = str2loc(in + 1, NULL, prevHashLineFile);
    if (m) {
      StringRef name = addString(macro, int(spc - macro));
      *m = new MacroDefinition(name, fromLoc, toLoc);
    }
  } else {
    // the value assigned here is not read again in this function
    spc = macro + len;
  }
}

// comment of form /*
// NOTE(review): as above, the rest of this comment, the function
// signature and the start of the body are missing in this copy; only
// "useHashLines) return;" survives.  The body uses 'macro' and 'len';
// the function name cannot be determined from what is left -- restore
// from a pristine copy.
useHashLines) return;
  addMacroDefinition(macro, len);
}

// comment of form /*
// NOTE(review): same damage again (comment tail, signature and start
// of body missing).  The comments inside the body call this step
// "macroParamDef"; the exact function name is not recoverable from
// this copy -- restore from a pristine copy.
//
// Visible behaviour of the body: parse a parameter definition, attach
// it to the current macro when it carries positions, and record
// 'nextLoc' as the current macro's 'postStartLoc'.
useHashLines) return;
  MacroDefinition *param = NULL;
  // NOTE(review): "¶m" is presumably "&param" mangled by an HTML
  // entity decoder ("&para" followed by "m") -- restore.
  addMacroDefinition(macro, len, ¶m);
  if (!currentMacro) {
    pp_err("Invalid macro parameter definition");
    return;
  }
  // only record parameters with positions
  if (param)
    currentMacro->params.append(param);
  // macroParamDef always follows macroUndoStart
  // thus code doesn't start until the last
  // macroParamDef is done
  currentMacro->postStartLoc = nextLoc;
}

// Handle a macro end tag: record the end location on the current macro
// entry and pop back to its parent.  An end tag with no open macro is
// reported on std::cerr and terminates the process with exit(1).
void OLexer::macroUndoStop() {
  // capture 'nextLoc' before updLoc() runs
  SourceLocation postEndLoc = nextLoc;
  updLoc();
  prevIsNonsep = false;
  if (!sourceLocManager->useHashLines) return;
  if (!currentMacro) {
    std::cerr << toString(postEndLoc) << ": Macro end tag without a start tag" << std::endl;
    exit(1);
    return;   // not reached: exit() does not return
  }
  currentMacro->postEndLoc = postEndLoc;
  currentMacro = currentMacro->parent;
}

// EOF