#include "Lexer.h"

// NOTE(review): the operand of this include was missing in the source text;
// `<assert.h>` is restored because this file calls `assert` - confirm.
#include <assert.h>

namespace Slang
{

// Build a stand-alone end-of-file token (empty content, zeroed position)
// for readers that have no token stream to point into.
static Token GetEndOfFileToken()
{
    return Token(TokenType::EndOfFile, "", 0, 0, 0, "");
}

// Pointer to the first token. The list must not be empty.
Token* TokenList::begin() const
{
    assert(mTokens.Count());
    return &mTokens[0];
}

// Pointer to the *last* token in the list, which must be the end-of-file
// token. Note that this is the terminator itself, not one-past-the-end.
Token* TokenList::end() const
{
    assert(mTokens.Count());
    assert(mTokens[mTokens.Count()-1].Type == TokenType::EndOfFile);
    return &mTokens[mTokens.Count() - 1];
}

TokenSpan::TokenSpan()
    : mBegin(nullptr)
    , mEnd  (nullptr)
{}

TokenReader::TokenReader()
    : mCursor(nullptr)
    , mEnd   (nullptr)
{}

// Return a copy of the current token without consuming it.
// A reader with no cursor yields a synthesized end-of-file token; a reader
// sitting on `mEnd` yields that token with its type forced to end-of-file.
Token TokenReader::PeekToken() const
{
    if (!mCursor)
        return GetEndOfFileToken();

    Token token = *mCursor;
    if (mCursor == mEnd)
        token.Type = TokenType::EndOfFile;
    return token;
}

// Return the type of the current token without consuming it.
TokenType TokenReader::PeekTokenType() const
{
    // This also covers the default-constructed case (both pointers null).
    if (mCursor == mEnd)
        return TokenType::EndOfFile;
    assert(mCursor);
    return mCursor->Type;
}

// Return the source position of the current token, or a default-constructed
// position when the reader has no cursor.
CodePosition TokenReader::PeekLoc() const
{
    if (!mCursor)
        return CodePosition();
    return mCursor->Position;
}

// Return the current token and step past it. The cursor never moves beyond
// `mEnd`; once there, every call returns an end-of-file token.
Token TokenReader::AdvanceToken()
{
    if (!mCursor)
        return GetEndOfFileToken();

    Token token = *mCursor;
    if (mCursor == mEnd)
        token.Type = TokenType::EndOfFile;
    else
        mCursor++;
    return token;
}

// Lexer

// Set up a lexer over `content`, reporting problems to `sink` and tagging
// source locations with `path`.
Lexer::Lexer(
    String const&   path,
    String const&   content,
    DiagnosticSink* sink)
    : path(path)
    , content(content)
    , sink(sink)
{
    // The constructor parameter `content` shadows the member of the same
    // name. Point the cursor at the lexer's own copy so that the pointers
    // are tied to storage this object keeps alive, not to the caller's
    // argument.
    cursor  = this->content.begin();
    end     = this->content.end();

    // Locations start at line 1, column 1, position 0.
    loc = CodePosition(1, 1, 0, path);

    // The very first token is, by definition, at the start of a line.
    tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
    lexerFlags = 0;
}

Lexer::~Lexer()
{
}

// Sentinel returned by the character-level helpers at end of input.
enum { kEOF = -1 };

// Get the next input byte, without any handling of
// escaped newlines, non-ASCII code points, source locations, etc.
static int peekRaw(Lexer* lexer) { // If we are at the end of the input, return a designated end-of-file value if(lexer->cursor == lexer->end) return kEOF; // Otherwise, just look at the next byte return *lexer->cursor; } // Read one input byte without any special handling (similar to `peekRaw`) static int advanceRaw(Lexer* lexer) { // The logic here is basically the same as for `peekRaw()`, // escape we advance `cursor` if we aren't at the end. if (lexer->cursor == lexer->end) return kEOF; return *lexer->cursor++; } // When the cursor is already at the first byte of an end-of-line sequence, // consume one or two bytes that compose the sequence. // // Basically, a newline is one of: // // "\n" // "\r" // "\r\n" // "\n\r" // // We always look for the longest match possible. // static void handleNewLineInner(Lexer* lexer, int c) { assert(c == '\n' || c == '\r'); int d = peekRaw(lexer); if( (c ^ d) == ('\n' ^ '\r') ) { advanceRaw(lexer); } lexer->loc.Line++; lexer->loc.Col = 1; } // Look ahead one code point, dealing with complications like // escaped newlines. static int peek(Lexer* lexer) { // Look at the next raw byte, and decide what to do int c = peekRaw(lexer); if(c == '\\') { // We might have a backslash-escaped newline. // Look at the next byte (if any) to see. // // Note(tfoley): We are assuming a null-terminated input here, // so that we can safely look at the next byte without issue. int d = lexer->cursor[1]; switch (d) { case '\r': case '\n': { // The newline was escaped, so return the code point after *that* int e = lexer->cursor[2]; if ((d ^ e) == ('\r' ^ '\n')) return lexer->cursor[3]; return e; } default: break; } } // TODO: handle UTF-8 encoding for non-ASCII code points here // Default case is to just hand along the byte we read as an ASCII code point. return c; } // Get the next code point from the input, and advance the cursor. static int advance(Lexer* lexer) { // We are going to loop, but only as a way of handling // escaped line endings. 
for (;;) { // If we are at the end of the input, then the task is easy. if (lexer->cursor == lexer->end) return kEOF; // Look at the next raw byte, and decide what to do int c = *lexer->cursor++; if (c == '\\') { // We might have a backslash-escaped newline. // Look at the next byte (if any) to see. // // Note(tfoley): We are assuming a null-terminated input here, // so that we can safely look at the next byte without issue. int d = *lexer->cursor; switch (d) { case '\r': case '\n': // handle the end-of-line for our source location tracking lexer->cursor++; handleNewLineInner(lexer, d); // Now try again, looking at the character after the // escaped nmewline. continue; default: break; } } // TODO: Need to handle non-ASCII code points. // Default case is to advance by one location // and return the raw byte we saw. lexer->loc.Col++; lexer->loc.Pos++; return c; } } static void handleNewLine(Lexer* lexer) { int c = advance(lexer); handleNewLineInner(lexer, c); } static void lexLineComment(Lexer* lexer) { for(;;) { switch(peek(lexer)) { case '\n': case '\r': case kEOF: return; default: advance(lexer); continue; } } } static void lexBlockComment(Lexer* lexer) { for(;;) { switch(peek(lexer)) { case kEOF: // TODO(tfoley) diagnostic! 
return; case '\n': case '\r': handleNewLine(lexer); continue; case '*': advance(lexer); switch( peek(lexer) ) { case '/': advance(lexer); return; default: continue; } default: advance(lexer); continue; } } } static void lexHorizontalSpace(Lexer* lexer) { for(;;) { switch(peek(lexer)) { case ' ': case '\t': advance(lexer); continue; default: return; } } } static void lexIdentifier(Lexer* lexer) { for(;;) { int c = peek(lexer); if(('a' <= c ) && (c <= 'z') || ('A' <= c) && (c <= 'Z') || ('0' <= c) && (c <= '9') || (c == '_')) { advance(lexer); continue; } return; } } static void lexDigits(Lexer* lexer, int base) { for(;;) { int c = peek(lexer); int digitVal = 0; switch(c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': digitVal = c - '0'; break; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': if(base <= 10) return; digitVal = 10 + c - 'a'; break; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': if(base <= 10) return; digitVal = 10 + c - 'A'; break; default: // Not more digits! return; } if(digitVal >= base) { char buffer[] = { (char) c, 0 }; lexer->sink->diagnose(lexer->loc, Diagnostics::invalidDigitForBase, buffer, base); } advance(lexer); } } static TokenType maybeLexNumberSuffix(Lexer* lexer, TokenType tokenType) { // First check for suffixes that // indicate a floating-point number switch(peek(lexer)) { case 'f': case 'F': advance(lexer); return TokenType::DoubleLiterial; default: break; } // Once we've ruled out floating-point // suffixes, we can check for the inter cases // TODO: allow integer suffixes in any order... 
// Leading `u` or `U` for unsigned switch(peek(lexer)) { default: break; case 'u': case 'U': advance(lexer); break; } // Optional `l`, `L`, `ll`, or `LL` switch(peek(lexer)) { default: break; case 'l': case 'L': advance(lexer); switch(peek(lexer)) { default: break; case 'l': case 'L': advance(lexer); break; } break; } return tokenType; } static bool maybeLexNumberExponent(Lexer* lexer, int base) { switch( peek(lexer) ) { default: return false; case 'e': case 'E': if(base != 10) return false; advance(lexer); break; case 'p': case 'P': if(base != 16) return false; advance(lexer); break; } // we saw an exponent marker, so we must switch( peek(lexer) ) { case '+': case '-': advance(lexer); break; } // TODO(tfoley): it would be an error to not see digits here... lexDigits(lexer, 10); return true; } static TokenType lexNumberAfterDecimalPoint(Lexer* lexer, int base) { lexDigits(lexer, base); maybeLexNumberExponent(lexer, base); return maybeLexNumberSuffix(lexer, TokenType::DoubleLiterial); } static TokenType lexNumber(Lexer* lexer, int base) { // TODO(tfoley): Need to consider whehter to allow any kind of digit separator character. TokenType tokenType = TokenType::IntLiterial; // At the start of things, we just concern ourselves with digits lexDigits(lexer, base); if( peek(lexer) == '.' 
) { tokenType = TokenType::DoubleLiterial; advance(lexer); lexDigits(lexer, base); } if( maybeLexNumberExponent(lexer, base)) { tokenType = TokenType::DoubleLiterial; } maybeLexNumberSuffix(lexer, tokenType); return tokenType; } static void lexStringLiteralBody(Lexer* lexer, char quote) { for(;;) { int c = peek(lexer); if(c == quote) { advance(lexer); return; } switch(c) { case kEOF: lexer->sink->diagnose(lexer->loc, Diagnostics::endOfFileInLiteral); return; case '\n': case '\r': lexer->sink->diagnose(lexer->loc, Diagnostics::newlineInLiteral); return; case '\\': // Need to handle various escape sequence cases advance(lexer); switch(peek(lexer)) { case '\'': case '\"': case '\\': case '?': case 'a': case 'b': case 'f': case 'n': case 'r': case 't': case 'v': advance(lexer); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': // octal escape: up to 3 characters advance(lexer); for(int ii = 0; ii < 3; ++ii) { int d = peek(lexer); if(('0' <= d) && (d <= '7')) { advance(lexer); continue; } else { break; } } break; case 'x': // hexadecimal escape: any number of characters advance(lexer); for(;;) { int d = peek(lexer); if(('0' <= d) && (d <= '9') || ('a' <= d) && (d <= 'f') || ('A' <= d) && (d <= 'F')) { advance(lexer); continue; } else { break; } } break; // TODO: Unicode escape sequences } break; default: advance(lexer); continue; } } } String getStringLiteralTokenValue(Token const& token) { assert(token.Type == TokenType::StringLiterial || token.Type == TokenType::CharLiterial); char const* cursor = token.Content.begin(); char const* end = token.Content.end(); auto quote = *cursor++; assert(quote == '\'' || quote == '"'); StringBuilder valueBuilder; for(;;) { assert(cursor != end); auto c = *cursor++; // If we see a closing quote, then we are at the end of the string literal if(c == quote) { assert(cursor == end); return valueBuilder.ProduceString(); } // Charcters that don't being escape sequences are easy; // just append them to 
the buffer and move on. if(c != '\\') { valueBuilder.Append(c); continue; } // Now we look at another character to figure out the kind of // escape sequence we are dealing with: int d = *cursor++; switch(d) { // Simple characters that just needed to be escaped case '\'': case '\"': case '\\': case '?': valueBuilder.Append(d); continue; // Traditional escape sequences for special characters case 'a': valueBuilder.Append('\a'); continue; case 'b': valueBuilder.Append('\b'); continue; case 'f': valueBuilder.Append('\f'); continue; case 'n': valueBuilder.Append('\n'); continue; case 'r': valueBuilder.Append('\r'); continue; case 't': valueBuilder.Append('\t'); continue; case 'v': valueBuilder.Append('\v'); continue; // Octal escape: up to 3 characterws case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { cursor--; int value = 0; for(int ii = 0; ii < 3; ++ii) { d = *cursor; if(('0' <= d) && (d <= '7')) { value = value*8 + (d - '0'); cursor++; continue; } else { break; } } // TODO: add support for appending an arbitrary code point? valueBuilder.Append((char) value); } continue; // Hexadecimal escape: any number of characters case 'x': { cursor--; int value = 0; for(;;) { d = *cursor++; int digitValue = 0; if(('0' <= d) && (d <= '9')) { digitValue = d - '0'; } else if( ('a' <= d) && (d <= 'f') ) { digitValue = d - 'a'; } else if( ('A' <= d) && (d <= 'F') ) { digitValue = d - 'A'; } else { cursor--; break; } value = value*16 + digitValue; } // TODO: add support for appending an arbitrary code point? valueBuilder.Append((char) value); } continue; // TODO: Unicode escape sequences } } } String getFileNameTokenValue(Token const& token) { // A file name usually doesn't process escape sequences // (this is import on Windows, where `\\` is a valid // path separator cahracter). // Just trim off the first and last characters to remove the quotes // (whether they were `""` or `<>`. 
return token.Content.SubString(1, token.Content.Length()-2); } static TokenType lexTokenImpl(Lexer* lexer) { switch(peek(lexer)) { default: break; case kEOF: if((lexer->lexerFlags & kLexerFlag_InDirective) != 0) return TokenType::EndOfDirective; return TokenType::EndOfFile; case '\r': case '\n': if((lexer->lexerFlags & kLexerFlag_InDirective) != 0) return TokenType::EndOfDirective; handleNewLine(lexer); return TokenType::NewLine; case ' ': case '\t': lexHorizontalSpace(lexer); return TokenType::WhiteSpace; case '.': advance(lexer); switch(peek(lexer)) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return lexNumberAfterDecimalPoint(lexer, 10); // TODO(tfoley): handle ellipsis (`...`) default: return TokenType::Dot; } case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return lexNumber(lexer, 10); case '0': { auto loc = lexer->loc; advance(lexer); switch(peek(lexer)) { default: return TokenType::IntLiterial; case '.': advance(lexer); return lexNumberAfterDecimalPoint(lexer, 10); case 'x': case 'X': advance(lexer); return lexNumber(lexer, 16); case 'b': case 'B': advance(lexer); return lexNumber(lexer, 2); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': lexer->sink->diagnose(loc, Diagnostics::octalLiteral); return lexNumber(lexer, 8); } } case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': lexIdentifier(lexer); return TokenType::Identifier; case '\"': 
advance(lexer); lexStringLiteralBody(lexer, '\"'); return TokenType::StringLiterial; case '\'': advance(lexer); lexStringLiteralBody(lexer, '\''); return TokenType::CharLiterial; case '+': advance(lexer); switch(peek(lexer)) { case '+': advance(lexer); return TokenType::OpInc; case '=': advance(lexer); return TokenType::OpAddAssign; default: return TokenType::OpAdd; } case '-': advance(lexer); switch(peek(lexer)) { case '-': advance(lexer); return TokenType::OpDec; case '=': advance(lexer); return TokenType::OpSubAssign; case '>': advance(lexer); return TokenType::RightArrow; default: return TokenType::OpSub; } case '*': advance(lexer); switch(peek(lexer)) { case '=': advance(lexer); return TokenType::OpMulAssign; default: return TokenType::OpMul; } case '/': advance(lexer); switch(peek(lexer)) { case '=': advance(lexer); return TokenType::OpDivAssign; case '/': advance(lexer); lexLineComment(lexer); return TokenType::LineComment; case '*': advance(lexer); lexBlockComment(lexer); return TokenType::BlockComment; default: return TokenType::OpDiv; } case '%': advance(lexer); switch(peek(lexer)) { case '=': advance(lexer); return TokenType::OpModAssign; default: return TokenType::OpMod; } case '|': advance(lexer); switch(peek(lexer)) { case '|': advance(lexer); return TokenType::OpOr; case '=': advance(lexer); return TokenType::OpOrAssign; default: return TokenType::OpBitOr; } case '&': advance(lexer); switch(peek(lexer)) { case '&': advance(lexer); return TokenType::OpAnd; case '=': advance(lexer); return TokenType::OpAndAssign; default: return TokenType::OpBitAnd; } case '^': advance(lexer); switch(peek(lexer)) { case '=': advance(lexer); return TokenType::OpXorAssign; default: return TokenType::OpBitXor; } case '>': advance(lexer); switch(peek(lexer)) { case '>': advance(lexer); switch(peek(lexer)) { case '=': advance(lexer); return TokenType::OpShrAssign; default: return TokenType::OpRsh; } case '=': advance(lexer); return TokenType::OpGeq; default: return 
TokenType::OpGreater; } case '<': advance(lexer); switch(peek(lexer)) { case '<': advance(lexer); switch(peek(lexer)) { case '=': advance(lexer); return TokenType::OpShlAssign; default: return TokenType::OpLsh; } case '=': advance(lexer); return TokenType::OpLeq; default: return TokenType::OpLess; } case '=': advance(lexer); switch(peek(lexer)) { case '=': advance(lexer); return TokenType::OpEql; default: return TokenType::OpAssign; } case '!': advance(lexer); switch(peek(lexer)) { case '=': advance(lexer); return TokenType::OpNeq; default: return TokenType::OpNot; } case '#': advance(lexer); switch(peek(lexer)) { case '#': advance(lexer); return TokenType::PoundPound; default: return TokenType::Pound; } case '~': advance(lexer); return TokenType::OpBitNot; case ':': advance(lexer); return TokenType::Colon; case ';': advance(lexer); return TokenType::Semicolon; case ',': advance(lexer); return TokenType::Comma; case '{': advance(lexer); return TokenType::LBrace; case '}': advance(lexer); return TokenType::RBrace; case '[': advance(lexer); return TokenType::LBracket; case ']': advance(lexer); return TokenType::RBracket; case '(': advance(lexer); return TokenType::LParent; case ')': advance(lexer); return TokenType::RParent; case '?': advance(lexer); return TokenType::QuestionMark; case '@': advance(lexer); return TokenType::At; case '$': advance(lexer); return TokenType::Dollar; } // TODO(tfoley): If we ever wanted to support proper Unicode // in identifiers, etc., then this would be the right place // to perform a more expensive dispatch based on the actual // code point (and not just the first byte). { // If none of the above cases matched, then we have an // unexpected/invalid character. 
auto loc = lexer->loc; auto sink = lexer->sink; int c = advance(lexer); if(c >= 0x20 && c <= 0x7E) { char buffer[] = { (char) c, 0 }; sink->diagnose(loc, Diagnostics::illegalCharacterPrint, buffer); } else { // Fallback: print as hexadecimal sink->diagnose(loc, Diagnostics::illegalCharacterHex, String((unsigned char)c, 16)); } return TokenType::Invalid; } } Token Lexer::lexToken() { auto flags = this->tokenFlags; for(;;) { Token token; token.Position = loc; char const* textBegin = cursor; auto tokenType = lexTokenImpl(this); // The low-level lexer produces tokens for things we want // to ignore, such as white space, so we skip them here. switch(tokenType) { case TokenType::Invalid: flags = 0; continue; case TokenType::NewLine: flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; continue; case TokenType::WhiteSpace: case TokenType::LineComment: case TokenType::BlockComment: flags |= TokenFlag::AfterWhitespace; continue; // We don't want to skip the end-of-file token, but we *do* // want to make sure it has appropriate flags to make our life easier case TokenType::EndOfFile: flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; break; // We will also do some book-keeping around preprocessor directives here: // // If we see a `#` at the start of a line, then we are entering a // preprocessor directive. case TokenType::Pound: if((flags & TokenFlag::AtStartOfLine) != 0) lexerFlags |= kLexerFlag_InDirective; break; // // And if we saw an end-of-line during a directive, then we are // now leaving that directive. // case TokenType::EndOfDirective: lexerFlags &= ~kLexerFlag_InDirective; break; default: break; } token.Type = tokenType; char const* textEnd = cursor; // Note(tfoley): `StringBuilder::Append()` seems to crash when appending zero bytes if(textEnd != textBegin) { // HACK(tfoley): "scrubbing" token value here to remove escaped newlines... 
// // TODO: Only perform this work if we encountered an escaped newline // while lexing this token (e.g., keep a flag on the lexer), or // do it on-demand when the actual value of the token is needed. StringBuilder valueBuilder; auto tt = textBegin; while(tt != textEnd) { char c = *tt++; if(c == '\\') { char d = *tt; switch(d) { case '\r': case '\n': { tt++; char e = *tt; if((d ^ e) == ('\r' ^ '\n')) { tt++; } } continue; default: break; } } valueBuilder.Append(c); } token.Content = valueBuilder.ProduceString(); } token.flags = flags; this->tokenFlags = 0; return token; } } TokenList Lexer::lexAllTokens() { TokenList tokenList; for(;;) { Token token = lexToken(); tokenList.mTokens.Add(token); if(token.Type == TokenType::EndOfFile) return tokenList; } } #if 0 TokenList Lexer::Parse(const String & fileName, const String & str, DiagnosticSink * sink) { TokenList tokenList; tokenList.mTokens = TokenizeText(fileName, str, [&](TokenizeErrorType errType, CodePosition pos) { auto curChar = str[pos.Pos]; switch (errType) { case TokenizeErrorType::InvalidCharacter: // Check if inside the ASCII "printable" range if(curChar >= 0x20 && curChar <= 0x7E) { char buffer[] = { curChar, 0 }; sink->diagnose(pos, Diagnostics::illegalCharacterPrint, buffer); } else { // Fallback: print as hexadecimal sink->diagnose(pos, Diagnostics::illegalCharacterHex, String((unsigned char)curChar, 16)); } break; case TokenizeErrorType::InvalidEscapeSequence: sink->diagnose(pos, Diagnostics::illegalCharacterLiteral); break; default: break; } }); // Add an end-of-file token so that we can reference it in diagnostic messages tokenList.mTokens.Add(Token(TokenType::EndOfFile, "", 0, 0, 0, fileName, TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace)); return tokenList; } #endif }