diff options
Diffstat (limited to 'source/slang/lexer.cpp')
| -rw-r--r-- | source/slang/lexer.cpp | 1711 |
1 files changed, 854 insertions, 857 deletions
diff --git a/source/slang/lexer.cpp b/source/slang/lexer.cpp index 87b3eaf63..cb718b538 100644 --- a/source/slang/lexer.cpp +++ b/source/slang/lexer.cpp @@ -4,397 +4,402 @@ namespace Slang { - namespace Compiler + static Token GetEndOfFileToken() { - static Token GetEndOfFileToken() - { - return Token(TokenType::EndOfFile, "", 0, 0, 0, ""); - } + return Token(TokenType::EndOfFile, "", 0, 0, 0, ""); + } - Token* TokenList::begin() const - { - assert(mTokens.Count()); - return &mTokens[0]; - } + Token* TokenList::begin() const + { + assert(mTokens.Count()); + return &mTokens[0]; + } - Token* TokenList::end() const - { - assert(mTokens.Count()); - assert(mTokens[mTokens.Count()-1].Type == TokenType::EndOfFile); - return &mTokens[mTokens.Count() - 1]; - } + Token* TokenList::end() const + { + assert(mTokens.Count()); + assert(mTokens[mTokens.Count()-1].Type == TokenType::EndOfFile); + return &mTokens[mTokens.Count() - 1]; + } - TokenSpan::TokenSpan() - : mBegin(NULL) - , mEnd (NULL) - {} + TokenSpan::TokenSpan() + : mBegin(NULL) + , mEnd (NULL) + {} - TokenReader::TokenReader() - : mCursor(NULL) - , mEnd (NULL) - {} + TokenReader::TokenReader() + : mCursor(NULL) + , mEnd (NULL) + {} - Token TokenReader::PeekToken() const - { - if (!mCursor) - return GetEndOfFileToken(); + Token TokenReader::PeekToken() const + { + if (!mCursor) + return GetEndOfFileToken(); - Token token = *mCursor; - if (mCursor == mEnd) - token.Type = TokenType::EndOfFile; - return token; - } + Token token = *mCursor; + if (mCursor == mEnd) + token.Type = TokenType::EndOfFile; + return token; + } - TokenType TokenReader::PeekTokenType() const - { - if (mCursor == mEnd) - return TokenType::EndOfFile; - assert(mCursor); - return mCursor->Type; - } + TokenType TokenReader::PeekTokenType() const + { + if (mCursor == mEnd) + return TokenType::EndOfFile; + assert(mCursor); + return mCursor->Type; + } - CodePosition TokenReader::PeekLoc() const - { - if (!mCursor) - return CodePosition(); - assert(mCursor); - return mCursor->Position; - } + CodePosition TokenReader::PeekLoc() const + { + if (!mCursor) + return CodePosition(); + assert(mCursor); + return mCursor->Position; + } - Token TokenReader::AdvanceToken() - { - if (!mCursor) - return GetEndOfFileToken(); + Token TokenReader::AdvanceToken() + { + if (!mCursor) + return GetEndOfFileToken(); + + Token token = *mCursor; + if (mCursor == mEnd) + token.Type = TokenType::EndOfFile; + else + mCursor++; + return token; + } - Token token = *mCursor; - if (mCursor == mEnd) - token.Type = TokenType::EndOfFile; - else - mCursor++; - return token; - } + // Lexer - // Lexer + Lexer::Lexer( + String const& path, + String const& content, + DiagnosticSink* sink) + : path(path) + , content(content) + , sink(sink) + { + cursor = content.begin(); + end = content.end(); - Lexer::Lexer( - String const& path, - String const& content, - DiagnosticSink* sink) - : path(path) - , content(content) - , sink(sink) - { - cursor = content.begin(); - end = content.end(); + loc = CodePosition(1, 1, 0, path); + tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; + lexerFlags = 0; + } - loc = CodePosition(1, 1, 0, path); - tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; - lexerFlags = 0; - } + Lexer::~Lexer() + { + } - Lexer::~Lexer() - { - } + enum { kEOF = -1 }; - enum { kEOF = -1 }; + // Get the next input byte, without any handling of + // escaped newlines, non-ASCII code points, source locations, etc. + static int peekRaw(Lexer* lexer) + { + // If we are at the end of the input, return a designated end-of-file value + if(lexer->cursor == lexer->end) + return kEOF; - // Get the next input byte, without any handling of - // escaped newlines, non-ASCII code points, source locations, etc. - static int peekRaw(Lexer* lexer) - { - // If we are at the end of the input, return a designated end-of-file value - if(lexer->cursor == lexer->end) - return kEOF; + // Otherwise, just look at the next byte + return *lexer->cursor; + } - // Otherwise, just look at the next byte - return *lexer->cursor; - } + // Read one input byte without any special handling (similar to `peekRaw`) + static int advanceRaw(Lexer* lexer) + { + // The logic here is basically the same as for `peekRaw()`, + // escape we advance `cursor` if we aren't at the end. - // Read one input byte without any special handling (similar to `peekRaw`) - static int advanceRaw(Lexer* lexer) - { - // The logic here is basically the same as for `peekRaw()`, - // escape we advance `cursor` if we aren't at the end. + if (lexer->cursor == lexer->end) + return kEOF; - if (lexer->cursor == lexer->end) - return kEOF; + return *lexer->cursor++; + } - return *lexer->cursor++; - } + // When the cursor is already at the first byte of an end-of-line sequence, + // consume one or two bytes that compose the sequence. + // + // Basically, a newline is one of: + // + // "\n" + // "\r" + // "\r\n" + // "\n\r" + // + // We always look for the longest match possible. + // + static void handleNewLineInner(Lexer* lexer, int c) + { + assert(c == '\n' || c == '\r'); - // When the cursor is already at the first byte of an end-of-line sequence, - // consume one or two bytes that compose the sequence. - // - // Basically, a newline is one of: - // - // "\n" - // "\r" - // "\r\n" - // "\n\r" - // - // We always look for the longest match possible. - // - static void handleNewLineInner(Lexer* lexer, int c) + int d = peekRaw(lexer); + if( (c ^ d) == ('\n' ^ '\r') ) { - assert(c == '\n' || c == '\r'); + advanceRaw(lexer); + } + + lexer->loc.Line++; + lexer->loc.Col = 1; + } - int d = peekRaw(lexer); - if( (c ^ d) == ('\n' ^ '\r') ) + // Look ahead one code point, dealing with complications like + // escaped newlines. + static int peek(Lexer* lexer) + { + // Look at the next raw byte, and decide what to do + int c = peekRaw(lexer); + + if(c == '\\') + { + // We might have a backslash-escaped newline. + // Look at the next byte (if any) to see. + // + // Note(tfoley): We are assuming a null-terminated input here, + // so that we can safely look at the next byte without issue. + int d = lexer->cursor[1]; + switch (d) { - advanceRaw(lexer); - } + case '\r': case '\n': + { + // The newline was escaped, so return the code point after *that* - lexer->loc.Line++; - lexer->loc.Col = 1; + int e = lexer->cursor[2]; + if ((d ^ e) == ('\r' ^ '\n')) + return lexer->cursor[3]; + return e; + } + + default: + break; + } } + // TODO: handle UTF-8 encoding for non-ASCII code points here + + // Default case is to just hand along the byte we read as an ASCII code point. + return c; + } - // Look ahead one code point, dealing with complications like - // escaped newlines. - static int peek(Lexer* lexer) + // Get the next code point from the input, and advance the cursor. + static int advance(Lexer* lexer) + { + // We are going to loop, but only as a way of handling + // escaped line endings. + for (;;) { + // If we are at the end of the input, then the task is easy. + if (lexer->cursor == lexer->end) + return kEOF; + // Look at the next raw byte, and decide what to do - int c = peekRaw(lexer); + int c = *lexer->cursor++; - if(c == '\\') + if (c == '\\') { // We might have a backslash-escaped newline. // Look at the next byte (if any) to see. // // Note(tfoley): We are assuming a null-terminated input here, // so that we can safely look at the next byte without issue. - int d = lexer->cursor[1]; + int d = *lexer->cursor; switch (d) { case '\r': case '\n': - { - // The newline was escaped, so return the code point after *that* + // handle the end-of-line for our source location tracking + lexer->cursor++; + handleNewLineInner(lexer, d); - int e = lexer->cursor[2]; - if ((d ^ e) == ('\r' ^ '\n')) - return lexer->cursor[3]; - return e; - } + // Now try again, looking at the character after the + // escaped nmewline. + continue; default: break; } } - // TODO: handle UTF-8 encoding for non-ASCII code points here - - // Default case is to just hand along the byte we read as an ASCII code point. - return c; - } - - // Get the next code point from the input, and advance the cursor. - static int advance(Lexer* lexer) - { - // We are going to loop, but only as a way of handling - // escaped line endings. - for (;;) - { - // If we are at the end of the input, then the task is easy. - if (lexer->cursor == lexer->end) - return kEOF; - - // Look at the next raw byte, and decide what to do - int c = *lexer->cursor++; - if (c == '\\') - { - // We might have a backslash-escaped newline. - // Look at the next byte (if any) to see. - // - // Note(tfoley): We are assuming a null-terminated input here, - // so that we can safely look at the next byte without issue. - int d = *lexer->cursor; - switch (d) - { - case '\r': case '\n': - // handle the end-of-line for our source location tracking - lexer->cursor++; - handleNewLineInner(lexer, d); - - // Now try again, looking at the character after the - // escaped nmewline. - continue; + // TODO: Need to handle non-ASCII code points. - default: - break; - } - } + // Default case is to advance by one location + // and return the raw byte we saw. - // TODO: Need to handle non-ASCII code points. + lexer->loc.Col++; + lexer->loc.Pos++; - // Default case is to advance by one location - // and return the raw byte we saw. - - lexer->loc.Col++; - lexer->loc.Pos++; - - return c; - } + return c; } + } - static void handleNewLine(Lexer* lexer) - { - int c = advance(lexer); - handleNewLineInner(lexer, c); - } + static void handleNewLine(Lexer* lexer) + { + int c = advance(lexer); + handleNewLineInner(lexer, c); + } - static void lexLineComment(Lexer* lexer) + static void lexLineComment(Lexer* lexer) + { + for(;;) { - for(;;) + switch(peek(lexer)) { - switch(peek(lexer)) - { - case '\n': case '\r': case kEOF: - return; + case '\n': case '\r': case kEOF: + return; - default: - advance(lexer); - continue; - } + default: + advance(lexer); + continue; } } + } - static void lexBlockComment(Lexer* lexer) + static void lexBlockComment(Lexer* lexer) + { + for(;;) { - for(;;) + switch(peek(lexer)) { - switch(peek(lexer)) - { - case kEOF: - // TODO(tfoley) diagnostic! - return; + case kEOF: + // TODO(tfoley) diagnostic! + return; - case '\n': case '\r': - handleNewLine(lexer); - continue; + case '\n': case '\r': + handleNewLine(lexer); + continue; - case '*': + case '*': + advance(lexer); + switch( peek(lexer) ) + { + case '/': advance(lexer); - switch( peek(lexer) ) - { - case '/': - advance(lexer); - return; - - default: - continue; - } + return; default: - advance(lexer); continue; } + + default: + advance(lexer); + continue; } } + } - static void lexHorizontalSpace(Lexer* lexer) + static void lexHorizontalSpace(Lexer* lexer) + { + for(;;) { - for(;;) + switch(peek(lexer)) { - switch(peek(lexer)) - { - case ' ': case '\t': - advance(lexer); - continue; + case ' ': case '\t': + advance(lexer); + continue; - default: - return; - } + default: + return; } } + } - static void lexIdentifier(Lexer* lexer) + static void lexIdentifier(Lexer* lexer) + { + for(;;) { - for(;;) + int c = peek(lexer); + if(('a' <= c ) && (c <= 'z') + || ('A' <= c) && (c <= 'Z') + || ('0' <= c) && (c <= '9') + || (c == '_')) { - int c = peek(lexer); - if(('a' <= c ) && (c <= 'z') - || ('A' <= c) && (c <= 'Z') - || ('0' <= c) && (c <= '9') - || (c == '_')) - { - advance(lexer); - continue; - } - - return; + advance(lexer); + continue; } + + return; } + } - static void lexDigits(Lexer* lexer, int base) + static void lexDigits(Lexer* lexer, int base) + { + for(;;) { - for(;;) - { - int c = peek(lexer); - - int digitVal = 0; - switch(c) - { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - digitVal = c - '0'; - break; + int c = peek(lexer); - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - if(base <= 10) return; - digitVal = 10 + c - 'a'; - break; + int digitVal = 0; + switch(c) + { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + digitVal = c - '0'; + break; - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - if(base <= 10) return; - digitVal = 10 + c - 'A'; - break; + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + if(base <= 10) return; + digitVal = 10 + c - 'a'; + break; - default: - // Not more digits! - return; - } + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + if(base <= 10) return; + digitVal = 10 + c - 'A'; + break; - if(digitVal >= base) - { - char buffer[] = { (char) c, 0 }; - lexer->sink->diagnose(lexer->loc, Diagnostics::invalidDigitForBase, buffer, base); - } + default: + // Not more digits! + return; + } - advance(lexer); + if(digitVal >= base) + { + char buffer[] = { (char) c, 0 }; + lexer->sink->diagnose(lexer->loc, Diagnostics::invalidDigitForBase, buffer, base); } + + advance(lexer); } + } - static TokenType maybeLexNumberSuffix(Lexer* lexer, TokenType tokenType) + static TokenType maybeLexNumberSuffix(Lexer* lexer, TokenType tokenType) + { + // First check for suffixes that + // indicate a floating-point number + switch(peek(lexer)) { - // First check for suffixes that - // indicate a floating-point number - switch(peek(lexer)) - { - case 'f': case 'F': - advance(lexer); - return TokenType::DoubleLiterial; + case 'f': case 'F': + advance(lexer); + return TokenType::DoubleLiterial; - default: - break; - } + default: + break; + } - // Once we've ruled out floating-point - // suffixes, we can check for the inter cases + // Once we've ruled out floating-point + // suffixes, we can check for the inter cases - // TODO: allow integer suffixes in any order... + // TODO: allow integer suffixes in any order... - // Leading `u` or `U` for unsigned - switch(peek(lexer)) - { - default: - break; + // Leading `u` or `U` for unsigned + switch(peek(lexer)) + { + default: + break; - case 'u': case 'U': - advance(lexer); - break; - } + case 'u': case 'U': + advance(lexer); + break; + } + + // Optional `l`, `L`, `ll`, or `LL` + switch(peek(lexer)) + { + default: + break; - // Optional `l`, `L`, `ll`, or `LL` + case 'l': case 'L': + advance(lexer); switch(peek(lexer)) { default: @@ -402,720 +407,712 @@ namespace Slang case 'l': case 'L': advance(lexer); - switch(peek(lexer)) - { - default: - break; - - case 'l': case 'L': - advance(lexer); - break; - } break; } + break; + } + + return tokenType; + } - return tokenType; + static bool maybeLexNumberExponent(Lexer* lexer, int base) + { + switch( peek(lexer) ) + { + default: + return false; + + case 'e': case 'E': + if(base != 10) return false; + advance(lexer); + break; + + case 'p': case 'P': + if(base != 16) return false; + advance(lexer); + break; } - static bool maybeLexNumberExponent(Lexer* lexer, int base) + // we saw an exponent marker, so we must + switch( peek(lexer) ) { - switch( peek(lexer) ) - { - default: - return false; + case '+': case '-': + advance(lexer); + break; + } - case 'e': case 'E': - if(base != 10) return false; - advance(lexer); - break; + // TODO(tfoley): it would be an error to not see digits here... - case 'p': case 'P': - if(base != 16) return false; - advance(lexer); - break; - } + lexDigits(lexer, 10); - // we saw an exponent marker, so we must - switch( peek(lexer) ) - { - case '+': case '-': - advance(lexer); - break; - } + return true; + } - // TODO(tfoley): it would be an error to not see digits here... + static TokenType lexNumberAfterDecimalPoint(Lexer* lexer, int base) + { + lexDigits(lexer, base); + maybeLexNumberExponent(lexer, base); + + return maybeLexNumberSuffix(lexer, TokenType::DoubleLiterial); + } + + static TokenType lexNumber(Lexer* lexer, int base) + { + // TODO(tfoley): Need to consider whehter to allow any kind of digit separator character. - lexDigits(lexer, 10); + TokenType tokenType = TokenType::IntLiterial; - return true; - } + // At the start of things, we just concern ourselves with digits + lexDigits(lexer, base); - static TokenType lexNumberAfterDecimalPoint(Lexer* lexer, int base) + if( peek(lexer) == '.' ) { + tokenType = TokenType::DoubleLiterial; + + advance(lexer); lexDigits(lexer, base); - maybeLexNumberExponent(lexer, base); - - return maybeLexNumberSuffix(lexer, TokenType::DoubleLiterial); } - static TokenType lexNumber(Lexer* lexer, int base) + if( maybeLexNumberExponent(lexer, base)) { - // TODO(tfoley): Need to consider whehter to allow any kind of digit separator character. - - TokenType tokenType = TokenType::IntLiterial; + tokenType = TokenType::DoubleLiterial; + } - // At the start of things, we just concern ourselves with digits - lexDigits(lexer, base); + maybeLexNumberSuffix(lexer, tokenType); + return tokenType; + } - if( peek(lexer) == '.' ) + static void lexStringLiteralBody(Lexer* lexer, char quote) + { + for(;;) + { + int c = peek(lexer); + if(c == quote) { - tokenType = TokenType::DoubleLiterial; - advance(lexer); - lexDigits(lexer, base); + return; } - if( maybeLexNumberExponent(lexer, base)) + switch(c) { - tokenType = TokenType::DoubleLiterial; - } + case kEOF: + lexer->sink->diagnose(lexer->loc, Diagnostics::endOfFileInLiteral); + return; - maybeLexNumberSuffix(lexer, tokenType); - return tokenType; - } + case '\n': case '\r': + lexer->sink->diagnose(lexer->loc, Diagnostics::newlineInLiteral); + return; - static void lexStringLiteralBody(Lexer* lexer, char quote) - { - for(;;) - { - int c = peek(lexer); - if(c == quote) + case '\\': + // Need to handle various escape sequence cases + advance(lexer); + switch(peek(lexer)) { + case '\'': + case '\"': + case '\\': + case '?': + case 'a': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': advance(lexer); - return; - } - - switch(c) - { - case kEOF: - lexer->sink->diagnose(lexer->loc, Diagnostics::endOfFileInLiteral); - return; - - case '\n': case '\r': - lexer->sink->diagnose(lexer->loc, Diagnostics::newlineInLiteral); - return; + break; - case '\\': - // Need to handle various escape sequence cases + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + // octal escape: up to 3 characters advance(lexer); - switch(peek(lexer)) + for(int ii = 0; ii < 3; ++ii) { - case '\'': - case '\"': - case '\\': - case '?': - case 'a': - case 'b': - case 'f': - case 'n': - case 'r': - case 't': - case 'v': - advance(lexer); - break; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': - // octal escape: up to 3 characters - advance(lexer); - for(int ii = 0; ii < 3; ++ii) + int d = peek(lexer); + if(('0' <= d) && (d <= '7')) { - int d = peek(lexer); - if(('0' <= d) && (d <= '7')) - { - advance(lexer); - continue; - } - else - { - break; - } + advance(lexer); + continue; } - break; - - case 'x': - // hexadecimal escape: any number of characters - advance(lexer); - for(;;) + else { - int d = peek(lexer); - if(('0' <= d) && (d <= '9') - || ('a' <= d) && (d <= 'f') - || ('A' <= d) && (d <= 'F')) - { - advance(lexer); - continue; - } - else - { - break; - } + break; } - break; - - // TODO: Unicode escape sequences - } break; - default: + case 'x': + // hexadecimal escape: any number of characters advance(lexer); - continue; + for(;;) + { + int d = peek(lexer); + if(('0' <= d) && (d <= '9') + || ('a' <= d) && (d <= 'f') + || ('A' <= d) && (d <= 'F')) + { + advance(lexer); + continue; + } + else + { + break; + } + } + break; + + // TODO: Unicode escape sequences + } + break; + + default: + advance(lexer); + continue; } } + } - String getStringLiteralTokenValue(Token const& token) - { - assert(token.Type == TokenType::StringLiterial - || token.Type == TokenType::CharLiterial); + String getStringLiteralTokenValue(Token const& token) + { + assert(token.Type == TokenType::StringLiterial + || token.Type == TokenType::CharLiterial); - char const* cursor = token.Content.begin(); - char const* end = token.Content.end(); + char const* cursor = token.Content.begin(); + char const* end = token.Content.end(); - auto quote = *cursor++; - assert(quote == '\'' || quote == '"'); + auto quote = *cursor++; + assert(quote == '\'' || quote == '"'); - StringBuilder valueBuilder; - for(;;) - { - assert(cursor != end); + StringBuilder valueBuilder; + for(;;) + { + assert(cursor != end); - auto c = *cursor++; + auto c = *cursor++; - // If we see a closing quote, then we are at the end of the string literal - if(c == quote) - { - assert(cursor == end); - return valueBuilder.ProduceString(); - } + // If we see a closing quote, then we are at the end of the string literal + if(c == quote) + { + assert(cursor == end); + return valueBuilder.ProduceString(); + } - // Charcters that don't being escape sequences are easy; - // just append them to the buffer and move on. - if(c != '\\') - { - valueBuilder.Append(c); - continue; - } + // Charcters that don't being escape sequences are easy; + // just append them to the buffer and move on. + if(c != '\\') + { + valueBuilder.Append(c); + continue; + } - // Now we look at another character to figure out the kind of - // escape sequence we are dealing with: + // Now we look at another character to figure out the kind of + // escape sequence we are dealing with: - int d = *cursor++; + int d = *cursor++; - switch(d) + switch(d) + { + // Simple characters that just needed to be escaped + case '\'': + case '\"': + case '\\': + case '?': + valueBuilder.Append(d); + continue; + + // Traditional escape sequences for special characters + case 'a': valueBuilder.Append('\a'); continue; + case 'b': valueBuilder.Append('\b'); continue; + case 'f': valueBuilder.Append('\f'); continue; + case 'n': valueBuilder.Append('\n'); continue; + case 'r': valueBuilder.Append('\r'); continue; + case 't': valueBuilder.Append('\t'); continue; + case 'v': valueBuilder.Append('\v'); continue; + + // Octal escape: up to 3 characterws + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': { - // Simple characters that just needed to be escaped - case '\'': - case '\"': - case '\\': - case '?': - valueBuilder.Append(d); - continue; - - // Traditional escape sequences for special characters - case 'a': valueBuilder.Append('\a'); continue; - case 'b': valueBuilder.Append('\b'); continue; - case 'f': valueBuilder.Append('\f'); continue; - case 'n': valueBuilder.Append('\n'); continue; - case 'r': valueBuilder.Append('\r'); continue; - case 't': valueBuilder.Append('\t'); continue; - case 'v': valueBuilder.Append('\v'); continue; - - // Octal escape: up to 3 characterws - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': + cursor--; + int value = 0; + for(int ii = 0; ii < 3; ++ii) { - cursor--; - int value = 0; - for(int ii = 0; ii < 3; ++ii) + d = *cursor; + if(('0' <= d) && (d <= '7')) { - d = *cursor; - if(('0' <= d) && (d <= '7')) - { - value = value*8 + (d - '0'); - - cursor++; - continue; - } - else - { - break; - } - } + value = value*8 + (d - '0'); - // TODO: add support for appending an arbitrary code point? - valueBuilder.Append((char) value); + cursor++; + continue; + } + else + { + break; + } } - continue; - // Hexadecimal escape: any number of characters - case 'x': + // TODO: add support for appending an arbitrary code point? + valueBuilder.Append((char) value); + } + continue; + + // Hexadecimal escape: any number of characters + case 'x': + { + cursor--; + int value = 0; + for(;;) { - cursor--; - int value = 0; - for(;;) + d = *cursor++; + int digitValue = 0; + if(('0' <= d) && (d <= '9')) + { + digitValue = d - '0'; + } + else if( ('a' <= d) && (d <= 'f') ) + { + digitValue = d - 'a'; + } + else if( ('A' <= d) && (d <= 'F') ) { - d = *cursor++; - int digitValue = 0; - if(('0' <= d) && (d <= '9')) - { - digitValue = d - '0'; - } - else if( ('a' <= d) && (d <= 'f') ) - { - digitValue = d - 'a'; - } - else if( ('A' <= d) && (d <= 'F') ) - { - digitValue = d - 'A'; - } - else - { - cursor--; - break; - } - - value = value*16 + digitValue; + digitValue = d - 'A'; + } + else + { + cursor--; + break; } - // TODO: add support for appending an arbitrary code point? - valueBuilder.Append((char) value); + value = value*16 + digitValue; } - continue; - - // TODO: Unicode escape sequences + // TODO: add support for appending an arbitrary code point? + valueBuilder.Append((char) value); } + continue; + + // TODO: Unicode escape sequences + } } + } - String getFileNameTokenValue(Token const& token) - { - // A file name usually doesn't process escape sequences - // (this is import on Windows, where `\\` is a valid - // path separator cahracter). + String getFileNameTokenValue(Token const& token) + { + // A file name usually doesn't process escape sequences + // (this is import on Windows, where `\\` is a valid + // path separator cahracter). - // Just trim off the first and last characters to remove the quotes - // (whether they were `""` or `<>`. - return token.Content.SubString(1, token.Content.Length()-2); - } + // Just trim off the first and last characters to remove the quotes + // (whether they were `""` or `<>`. + return token.Content.SubString(1, token.Content.Length()-2); + } - static TokenType lexTokenImpl(Lexer* lexer) + static TokenType lexTokenImpl(Lexer* lexer) + { + switch(peek(lexer)) { + default: + break; + + case kEOF: + if((lexer->lexerFlags & kLexerFlag_InDirective) != 0) + return TokenType::EndOfDirective; + return TokenType::EndOfFile; + + case '\r': case '\n': + if((lexer->lexerFlags & kLexerFlag_InDirective) != 0) + return TokenType::EndOfDirective; + handleNewLine(lexer); + return TokenType::NewLine; + + case ' ': case '\t': + lexHorizontalSpace(lexer); + return TokenType::WhiteSpace; + + case '.': + advance(lexer); switch(peek(lexer)) { - default: - break; + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return lexNumberAfterDecimalPoint(lexer, 10); - case kEOF: - if((lexer->lexerFlags & kLexerFlag_InDirective) != 0) - return TokenType::EndOfDirective; - return TokenType::EndOfFile; + // TODO(tfoley): handle ellipsis (`...`) - case '\r': case '\n': - if((lexer->lexerFlags & kLexerFlag_InDirective) != 0) - return TokenType::EndOfDirective; - handleNewLine(lexer); - return TokenType::NewLine; + default: + return TokenType::Dot; + } - case ' ': case '\t': - lexHorizontalSpace(lexer); - return TokenType::WhiteSpace; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return lexNumber(lexer, 10); - case '.': + case '0': + { + auto loc = lexer->loc; advance(lexer); switch(peek(lexer)) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - return lexNumberAfterDecimalPoint(lexer, 10); - - // TODO(tfoley): handle ellipsis (`...`) - default: - return TokenType::Dot; - } + return TokenType::IntLiterial; - case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - return lexNumber(lexer, 10); - - case '0': - { - auto loc = lexer->loc; + case '.': advance(lexer); - switch(peek(lexer)) - { - default: - return TokenType::IntLiterial; - - case '.': - advance(lexer); - return lexNumberAfterDecimalPoint(lexer, 10); + return lexNumberAfterDecimalPoint(lexer, 10); - case 'x': case 'X': - advance(lexer); - return lexNumber(lexer, 16); + case 'x': case 'X': + advance(lexer); + return lexNumber(lexer, 16); - case 'b': case 'B': - advance(lexer); - return lexNumber(lexer, 2); + case 'b': case 'B': + advance(lexer); + return lexNumber(lexer, 2); - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - lexer->sink->diagnose(loc, Diagnostics::octalLiteral); - return lexNumber(lexer, 8); - } + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + lexer->sink->diagnose(loc, Diagnostics::octalLiteral); + return lexNumber(lexer, 8); } + } - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': case 'g': case 'h': case 'i': case 'j': - case 'k': case 'l': case 'm': case 'n': case 'o': - case 'p': case 'q': case 'r': case 's': case 't': - case 'u': case 'v': case 'w': case 'x': case 'y': - case 'z': - case 'A': case 'B': case 'C': case 'D': case 'E': - case 'F': case 'G': case 'H': case 'I': case 'J': - case 'K': case 'L': case 'M': case 'N': case 'O': - case 'P': case 'Q': case 'R': case 'S': case 'T': - case 'U': case 'V': case 'W': case 'X': case 'Y': - case 'Z': - case '_': - lexIdentifier(lexer); - return TokenType::Identifier; - - case '\"': - advance(lexer); - lexStringLiteralBody(lexer, '\"'); - return TokenType::StringLiterial; - - case '\'': - advance(lexer); - lexStringLiteralBody(lexer, '\''); - return TokenType::CharLiterial; - - case '+': - advance(lexer); - switch(peek(lexer)) - { - case '+': advance(lexer); return TokenType::OpInc; - case '=': advance(lexer); return TokenType::OpAddAssign; - default: - return TokenType::OpAdd; - } + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case '_': + lexIdentifier(lexer); + return TokenType::Identifier; + + case '\"': + advance(lexer); + lexStringLiteralBody(lexer, '\"'); + return TokenType::StringLiterial; + + case '\'': + advance(lexer); + lexStringLiteralBody(lexer, '\''); + return TokenType::CharLiterial; + + case '+': + advance(lexer); + switch(peek(lexer)) + { + case '+': advance(lexer); return TokenType::OpInc; + case '=': advance(lexer); return TokenType::OpAddAssign; + default: + return TokenType::OpAdd; + } - case '-': - advance(lexer); - switch(peek(lexer)) - { - case '-': advance(lexer); return TokenType::OpDec; - case '=': advance(lexer); return TokenType::OpSubAssign; - case '>': advance(lexer); return TokenType::RightArrow; - default: - return TokenType::OpSub; - } + case '-': + advance(lexer); + switch(peek(lexer)) + { + case '-': advance(lexer); return TokenType::OpDec; + case '=': advance(lexer); return TokenType::OpSubAssign; + case '>': advance(lexer); return TokenType::RightArrow; + default: + return TokenType::OpSub; + } - case '*': - advance(lexer); - switch(peek(lexer)) - { - case '=': advance(lexer); return TokenType::OpMulAssign; - default: - return TokenType::OpMul; - } + case '*': + advance(lexer); + switch(peek(lexer)) + { + case '=': advance(lexer); return TokenType::OpMulAssign; + default: + return TokenType::OpMul; + } - case '/': - advance(lexer); - switch(peek(lexer)) - { - case '=': advance(lexer); return TokenType::OpDivAssign; - case '/': advance(lexer); lexLineComment(lexer); return TokenType::LineComment; - case '*': advance(lexer); lexBlockComment(lexer); return TokenType::BlockComment; - default: - return TokenType::OpDiv; - } + case '/': + advance(lexer); + switch(peek(lexer)) + { + case '=': advance(lexer); return TokenType::OpDivAssign; + case '/': advance(lexer); lexLineComment(lexer); return TokenType::LineComment; + case '*': advance(lexer); lexBlockComment(lexer); return TokenType::BlockComment; + default: + return TokenType::OpDiv; + } - case '%': - advance(lexer); - switch(peek(lexer)) - { - case '=': advance(lexer); return TokenType::OpModAssign; - default: - return TokenType::OpMod; - } + case '%': + advance(lexer); + switch(peek(lexer)) + { + case '=': advance(lexer); return TokenType::OpModAssign; + default: + return TokenType::OpMod; + } - case '|': - advance(lexer); - switch(peek(lexer)) - { - case '|': advance(lexer); return TokenType::OpOr; - case '=': advance(lexer); return TokenType::OpOrAssign; - default: - return TokenType::OpBitOr; - } + case '|': + advance(lexer); + switch(peek(lexer)) + { + case '|': advance(lexer); return TokenType::OpOr; + case '=': advance(lexer); return TokenType::OpOrAssign; + default: + return TokenType::OpBitOr; + } - case '&': - advance(lexer); - switch(peek(lexer)) - { - case '&': advance(lexer); return TokenType::OpAnd; - case '=': advance(lexer); return TokenType::OpAndAssign; - default: - return TokenType::OpBitAnd; - } + case '&': + advance(lexer); + switch(peek(lexer)) + { + case '&': advance(lexer); return TokenType::OpAnd; + case '=': advance(lexer); return TokenType::OpAndAssign; + default: + return TokenType::OpBitAnd; + } - case '^': - advance(lexer); - switch(peek(lexer)) - { - case '=': advance(lexer); return TokenType::OpXorAssign; - default: - return TokenType::OpBitXor; - } + case '^': + advance(lexer); + switch(peek(lexer)) + { + case '=': advance(lexer); return TokenType::OpXorAssign; + default: + return TokenType::OpBitXor; + } + case '>': + advance(lexer); + switch(peek(lexer)) + { case '>': advance(lexer); switch(peek(lexer)) { - case '>': - advance(lexer); - switch(peek(lexer)) - { - case '=': advance(lexer); return TokenType::OpShrAssign; - default: return TokenType::OpRsh; - } - case '=': advance(lexer); return TokenType::OpGeq; - default: - return TokenType::OpGreater; + case '=': advance(lexer); return TokenType::OpShrAssign; + default: return TokenType::OpRsh; } + case '=': advance(lexer); return TokenType::OpGeq; + default: + return TokenType::OpGreater; + } + case '<': + advance(lexer); + switch(peek(lexer)) + { case '<': advance(lexer); switch(peek(lexer)) { - case '<': - advance(lexer); - switch(peek(lexer)) - { - case '=': advance(lexer); return TokenType::OpShlAssign; - default: return TokenType::OpLsh; - } - case '=': advance(lexer); return TokenType::OpLeq; - default: - return TokenType::OpLess; - } - - case '=': - advance(lexer); - switch(peek(lexer)) - { - case '=': advance(lexer); return TokenType::OpEql; - default: - return TokenType::OpAssign; + case '=': advance(lexer); return TokenType::OpShlAssign; + default: return TokenType::OpLsh; } + case '=': advance(lexer); return TokenType::OpLeq; + default: + return TokenType::OpLess; + } - case '!': - advance(lexer); - switch(peek(lexer)) - { - case '=': advance(lexer); return TokenType::OpNeq; - default: - return TokenType::OpNot; - } + case '=': + advance(lexer); + switch(peek(lexer)) + { + case '=': advance(lexer); return TokenType::OpEql; + default: + return TokenType::OpAssign; + } - case '#': - advance(lexer); - switch(peek(lexer)) - { - case '#': advance(lexer); return TokenType::PoundPound; - default: - return TokenType::Pound; - } + case '!': + advance(lexer); + switch(peek(lexer)) + { + case '=': advance(lexer); return TokenType::OpNeq; + default: + return TokenType::OpNot; + } - case '~': advance(lexer); return TokenType::OpBitNot; + case '#': + advance(lexer); + switch(peek(lexer)) + { + case '#': advance(lexer); return TokenType::PoundPound; + default: + return TokenType::Pound; + } - case ':': advance(lexer); return TokenType::Colon; - case ';': advance(lexer); return TokenType::Semicolon; - case ',': advance(lexer); return TokenType::Comma; + case '~': advance(lexer); return TokenType::OpBitNot; - case '{': advance(lexer); return TokenType::LBrace; - case '}': advance(lexer); return TokenType::RBrace; - case '[': advance(lexer); return TokenType::LBracket; - case ']': advance(lexer); return TokenType::RBracket; - case '(': advance(lexer); return TokenType::LParent; - case ')': advance(lexer); return TokenType::RParent; + case ':': advance(lexer); return TokenType::Colon; + case ';': advance(lexer); return TokenType::Semicolon; + case ',': advance(lexer); return TokenType::Comma; - case '?': advance(lexer); return TokenType::QuestionMark; - case '@': advance(lexer); return TokenType::At; - case '$': advance(lexer); return TokenType::Dollar; + case '{': advance(lexer); return TokenType::LBrace; + case '}': advance(lexer); return TokenType::RBrace; + case '[': advance(lexer); return TokenType::LBracket; + case ']': advance(lexer); return TokenType::RBracket; + case '(': advance(lexer); return TokenType::LParent; + case ')': advance(lexer); return TokenType::RParent; - } + case '?': advance(lexer); return TokenType::QuestionMark; + case '@': advance(lexer); return TokenType::At; + case '$': advance(lexer); return TokenType::Dollar; - // TODO(tfoley): If we ever wanted to support proper Unicode - // in identifiers, etc., then this would be the right place - // to perform a more expensive dispatch based on the actual - // code point (and not just the first byte). + } - { - // If none of the above cases matched, then we have an - // unexpected/invalid character. + // TODO(tfoley): If we ever wanted to support proper Unicode + // in identifiers, etc., then this would be the right place + // to perform a more expensive dispatch based on the actual + // code point (and not just the first byte). - auto loc = lexer->loc; - auto sink = lexer->sink; - int c = advance(lexer); - if(c >= 0x20 && c <= 0x7E) - { - char buffer[] = { (char) c, 0 }; - sink->diagnose(loc, Diagnostics::illegalCharacterPrint, buffer); - } - else - { - // Fallback: print as hexadecimal - sink->diagnose(loc, Diagnostics::illegalCharacterHex, String((unsigned char)c, 16)); - } + { + // If none of the above cases matched, then we have an + // unexpected/invalid character. - return TokenType::Invalid; + auto loc = lexer->loc; + auto sink = lexer->sink; + int c = advance(lexer); + if(c >= 0x20 && c <= 0x7E) + { + char buffer[] = { (char) c, 0 }; + sink->diagnose(loc, Diagnostics::illegalCharacterPrint, buffer); } - } - - Token Lexer::lexToken() - { - auto flags = this->tokenFlags; - for(;;) + else { - Token token; - token.Position = loc; - - char const* textBegin = cursor; + // Fallback: print as hexadecimal + sink->diagnose(loc, Diagnostics::illegalCharacterHex, String((unsigned char)c, 16)); + } - auto tokenType = lexTokenImpl(this); + return TokenType::Invalid; + } + } - // The low-level lexer produces tokens for things we want - // to ignore, such as white space, so we skip them here. - switch(tokenType) - { - case TokenType::Invalid: - flags = 0; - continue; + Token Lexer::lexToken() + { + auto flags = this->tokenFlags; + for(;;) + { + Token token; + token.Position = loc; - case TokenType::NewLine: - flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; - continue; + char const* textBegin = cursor; - case TokenType::WhiteSpace: - case TokenType::LineComment: - case TokenType::BlockComment: - flags |= TokenFlag::AfterWhitespace; - continue; + auto tokenType = lexTokenImpl(this); - // We don't want to skip the end-of-file token, but we *do* - // want to make sure it has appropriate flags to make our life easier - case TokenType::EndOfFile: - flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; - break; + // The low-level lexer produces tokens for things we want + // to ignore, such as white space, so we skip them here. + switch(tokenType) + { + case TokenType::Invalid: + flags = 0; + continue; + + case TokenType::NewLine: + flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; + continue; + + case TokenType::WhiteSpace: + case TokenType::LineComment: + case TokenType::BlockComment: + flags |= TokenFlag::AfterWhitespace; + continue; + + // We don't want to skip the end-of-file token, but we *do* + // want to make sure it has appropriate flags to make our life easier + case TokenType::EndOfFile: + flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; + break; - // We will also do some book-keeping around preprocessor directives here: - // - // If we see a `#` at the start of a line, then we are entering a - // preprocessor directive. - case TokenType::Pound: - if((flags & TokenFlag::AtStartOfLine) != 0) - lexerFlags |= kLexerFlag_InDirective; - break; - // - // And if we saw an end-of-line during a directive, then we are - // now leaving that directive. - // - case TokenType::EndOfDirective: - lexerFlags &= ~kLexerFlag_InDirective; - break; + // We will also do some book-keeping around preprocessor directives here: + // + // If we see a `#` at the start of a line, then we are entering a + // preprocessor directive. + case TokenType::Pound: + if((flags & TokenFlag::AtStartOfLine) != 0) + lexerFlags |= kLexerFlag_InDirective; + break; + // + // And if we saw an end-of-line during a directive, then we are + // now leaving that directive. + // + case TokenType::EndOfDirective: + lexerFlags &= ~kLexerFlag_InDirective; + break; - default: - break; - } + default: + break; + } - token.Type = tokenType; + token.Type = tokenType; - char const* textEnd = cursor; + char const* textEnd = cursor; - // Note(tfoley): `StringBuilder::Append()` seems to crash when appending zero bytes - if(textEnd != textBegin) - { - StringBuilder valueBuilder; - valueBuilder.Append(textBegin, int(textEnd - textBegin)); - token.Content = valueBuilder.ProduceString(); - } + // Note(tfoley): `StringBuilder::Append()` seems to crash when appending zero bytes + if(textEnd != textBegin) + { + StringBuilder valueBuilder; + valueBuilder.Append(textBegin, int(textEnd - textBegin)); + token.Content = valueBuilder.ProduceString(); + } - token.flags = flags; + token.flags = flags; - this->tokenFlags = 0; + this->tokenFlags = 0; - return token; - } + return token; } + } - TokenList Lexer::lexAllTokens() + TokenList Lexer::lexAllTokens() + { + TokenList tokenList; + for(;;) { - TokenList tokenList; - for(;;) - { - Token token = lexToken(); - tokenList.mTokens.Add(token); + Token token = lexToken(); + tokenList.mTokens.Add(token); - if(token.Type == TokenType::EndOfFile) - return tokenList; - } + if(token.Type == TokenType::EndOfFile) + return tokenList; } + } #if 0 - TokenList Lexer::Parse(const String & fileName, const String & str, DiagnosticSink * sink) + TokenList Lexer::Parse(const String & fileName, const String & str, DiagnosticSink * sink) + { + TokenList tokenList; + tokenList.mTokens = TokenizeText(fileName, str, [&](TokenizeErrorType errType, CodePosition pos) { - TokenList tokenList; - tokenList.mTokens = TokenizeText(fileName, str, [&](TokenizeErrorType errType, CodePosition pos) + auto curChar = str[pos.Pos]; + switch (errType) { - auto curChar = str[pos.Pos]; - switch (errType) + case TokenizeErrorType::InvalidCharacter: + // Check if inside the ASCII "printable" range + if(curChar >= 0x20 && curChar <= 0x7E) { - case TokenizeErrorType::InvalidCharacter: - // Check if inside the ASCII "printable" range - if(curChar >= 0x20 && curChar <= 0x7E) - { - char buffer[] = { curChar, 0 }; - sink->diagnose(pos, Diagnostics::illegalCharacterPrint, buffer); - } - else - { - // Fallback: print as hexadecimal - sink->diagnose(pos, Diagnostics::illegalCharacterHex, String((unsigned char)curChar, 16)); - } - break; - case TokenizeErrorType::InvalidEscapeSequence: - sink->diagnose(pos, Diagnostics::illegalCharacterLiteral); - break; - default: - break; + char buffer[] = { curChar, 0 }; + sink->diagnose(pos, Diagnostics::illegalCharacterPrint, buffer); + } + else + { + // Fallback: print as hexadecimal + sink->diagnose(pos, Diagnostics::illegalCharacterHex, String((unsigned char)curChar, 16)); } - }); + break; + case TokenizeErrorType::InvalidEscapeSequence: + sink->diagnose(pos, Diagnostics::illegalCharacterLiteral); + break; + default: + break; + } + }); - // Add an end-of-file token so that we can reference it in diagnostic messages - tokenList.mTokens.Add(Token(TokenType::EndOfFile, "", 0, 0, 0, fileName, TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace)); - return tokenList; - } -#endif + // Add an end-of-file token so that we can reference it in diagnostic messages + tokenList.mTokens.Add(Token(TokenType::EndOfFile, "", 0, 0, 0, fileName, TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace)); + return tokenList; } +#endif }
\ No newline at end of file |
