diff options
| author | Ellie Hermaszewska <ellieh@nvidia.com> | 2024-10-29 14:49:26 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-10-29 14:49:26 +0800 |
| commit | f65d756bff8d4c5cbc15bd0322a2ae8e6b896a21 (patch) | |
| tree | ea1d61342cd29368e19135000ec2948813096205 /source/compiler-core/slang-lexer.cpp | |
| parent | a729c15e9dce9f5116a38afc66329ab2ca4cea54 (diff) | |
format
* format
* Minor test fixes
* enable checking cpp format in ci
Diffstat (limited to 'source/compiler-core/slang-lexer.cpp')
| -rw-r--r-- | source/compiler-core/slang-lexer.cpp | 2484 |
1 files changed, 1271 insertions, 1213 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp index 366af9114..df95f5f1e 100644 --- a/source/compiler-core/slang-lexer.cpp +++ b/source/compiler-core/slang-lexer.cpp @@ -6,196 +6,197 @@ // #include "core/slang-char-encode.h" +#include "slang-core-diagnostics.h" #include "slang-name.h" #include "slang-source-loc.h" -#include "slang-core-diagnostics.h" namespace Slang { - Token TokenReader::getEndOfFileToken() - { - return Token(TokenType::EndOfFile, UnownedStringSlice::fromLiteral(""), SourceLoc()); - } +Token TokenReader::getEndOfFileToken() +{ + return Token(TokenType::EndOfFile, UnownedStringSlice::fromLiteral(""), SourceLoc()); +} - const Token* TokenList::begin() const - { - SLANG_ASSERT(m_tokens.getCount()); - return &m_tokens[0]; - } +const Token* TokenList::begin() const +{ + SLANG_ASSERT(m_tokens.getCount()); + return &m_tokens[0]; +} - const Token* TokenList::end() const - { - SLANG_ASSERT(m_tokens.getCount()); - SLANG_ASSERT(m_tokens[m_tokens.getCount() - 1].type == TokenType::EndOfFile); - return &m_tokens[m_tokens.getCount() - 1]; - } +const Token* TokenList::end() const +{ + SLANG_ASSERT(m_tokens.getCount()); + SLANG_ASSERT(m_tokens[m_tokens.getCount() - 1].type == TokenType::EndOfFile); + return &m_tokens[m_tokens.getCount() - 1]; +} - TokenSpan::TokenSpan() - : m_begin(nullptr) - , m_end (nullptr) - {} +TokenSpan::TokenSpan() + : m_begin(nullptr), m_end(nullptr) +{ +} - TokenReader::TokenReader() - : m_cursor(nullptr) - , m_end (nullptr) - { - _updateLookaheadToken(); - } +TokenReader::TokenReader() + : m_cursor(nullptr), m_end(nullptr) +{ + _updateLookaheadToken(); +} - Token& TokenReader::peekToken() - { - return m_nextToken; - } +Token& TokenReader::peekToken() +{ + return m_nextToken; +} - TokenType TokenReader::peekTokenType() const - { - return m_nextToken.type; - } +TokenType TokenReader::peekTokenType() const +{ + return m_nextToken.type; +} - SourceLoc TokenReader::peekLoc() const - { - return m_nextToken.loc; - } +SourceLoc TokenReader::peekLoc() const +{ + return m_nextToken.loc; +} - Token TokenReader::advanceToken() - { - Token result = m_nextToken; - if (m_cursor != m_end) - m_cursor++; - _updateLookaheadToken(); - return result; - } +Token TokenReader::advanceToken() +{ + Token result = m_nextToken; + if (m_cursor != m_end) + m_cursor++; + _updateLookaheadToken(); + return result; +} - void TokenReader::_updateLookaheadToken() - { - // We assume here that we can read a token from a non-null `m_cursor` - // *even* in the case where `m_cursor == m_end`, because the invariant - // for lists of tokens is that they should be terminated with and - // end-of-file token, so that there is always a token "one past the end." - // - m_nextToken = m_cursor ? *m_cursor : getEndOfFileToken(); +void TokenReader::_updateLookaheadToken() +{ + // We assume here that we can read a token from a non-null `m_cursor` + // *even* in the case where `m_cursor == m_end`, because the invariant + // for lists of tokens is that they should be terminated with and + // end-of-file token, so that there is always a token "one past the end." + // + m_nextToken = m_cursor ? *m_cursor : getEndOfFileToken(); - // If the token we read came from the end of the sub-sequence we are - // reading, then we will change the token type to an end-of-file token - // so that code that reads from the sequence and expects a terminating - // EOF will find it. - // - // TODO: We might eventually want a way to look at the actual token type - // and not just use EOF in all cases: e.g., when emitting diagnostic - // messages that include the token that is seen. - // - if(m_cursor == m_end) - m_nextToken.type = TokenType::EndOfFile; - } + // If the token we read came from the end of the sub-sequence we are + // reading, then we will change the token type to an end-of-file token + // so that code that reads from the sequence and expects a terminating + // EOF will find it. + // + // TODO: We might eventually want a way to look at the actual token type + // and not just use EOF in all cases: e.g., when emitting diagnostic + // messages that include the token that is seen. + // + if (m_cursor == m_end) + m_nextToken.type = TokenType::EndOfFile; +} - // Lexer +// Lexer - void Lexer::initialize( - SourceView* sourceView, - DiagnosticSink* sink, - NamePool* namePool, - MemoryArena* memoryArena) - { - m_sourceView = sourceView; - m_sink = sink; - m_namePool = namePool; - m_memoryArena = memoryArena; - - auto content = sourceView->getContent(); - - m_begin = content.begin(); - m_cursor = content.begin(); - m_end = content.end(); - - // Set the start location - m_startLoc = sourceView->getRange().begin; - - // The first token read from a translation unit should be considered to be at - // the start of a line, and *also* as coming after whitespace (conceptually - // both the end-of-file and beginning-of-file pseudo-tokens are whitespace). - // - m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; - m_lexerFlags = 0; - } +void Lexer::initialize( + SourceView* sourceView, + DiagnosticSink* sink, + NamePool* namePool, + MemoryArena* memoryArena) +{ + m_sourceView = sourceView; + m_sink = sink; + m_namePool = namePool; + m_memoryArena = memoryArena; - Lexer::~Lexer() - { - } + auto content = sourceView->getContent(); - enum { kEOF = -1 }; + m_begin = content.begin(); + m_cursor = content.begin(); + m_end = content.end(); - // Get the next input byte, without any handling of - // escaped newlines, non-ASCII code points, source locations, etc. - static int _peekRaw(Lexer* lexer) - { - // If we are at the end of the input, return a designated end-of-file value - if(lexer->m_cursor == lexer->m_end) - return kEOF; + // Set the start location + m_startLoc = sourceView->getRange().begin; - // Otherwise, just look at the next byte - return *lexer->m_cursor; - } + // The first token read from a translation unit should be considered to be at + // the start of a line, and *also* as coming after whitespace (conceptually + // both the end-of-file and beginning-of-file pseudo-tokens are whitespace). + // + m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; + m_lexerFlags = 0; +} - // Read one input byte without any special handling (similar to `peekRaw`) - static int _advanceRaw(Lexer* lexer) - { - // The logic here is basically the same as for `peekRaw()`, - // escape we advance `cursor` if we aren't at the end. +Lexer::~Lexer() {} - if (lexer->m_cursor == lexer->m_end) - return kEOF; +enum +{ + kEOF = -1 +}; - return *lexer->m_cursor++; - } +// Get the next input byte, without any handling of +// escaped newlines, non-ASCII code points, source locations, etc. +static int _peekRaw(Lexer* lexer) +{ + // If we are at the end of the input, return a designated end-of-file value + if (lexer->m_cursor == lexer->m_end) + return kEOF; - // When the cursor is already at the first byte of an end-of-line sequence, - // consume one or two bytes that compose the sequence. - // - // Basically, a newline is one of: - // - // "\n" - // "\r" - // "\r\n" - // "\n\r" - // - // We always look for the longest match possible. - // - static void _handleNewLineInner(Lexer* lexer, int c) - { - SLANG_ASSERT(c == '\n' || c == '\r'); + // Otherwise, just look at the next byte + return *lexer->m_cursor; +} - int d = _peekRaw(lexer); - if( (c ^ d) == ('\n' ^ '\r') ) - { - _advanceRaw(lexer); - } - } +// Read one input byte without any special handling (similar to `peekRaw`) +static int _advanceRaw(Lexer* lexer) +{ + // The logic here is basically the same as for `peekRaw()`, + // escape we advance `cursor` if we aren't at the end. - // Look ahead one code point, dealing with complications like - // escaped newlines. - static int _peek(Lexer* lexer, int offset = 0) + if (lexer->m_cursor == lexer->m_end) + return kEOF; + + return *lexer->m_cursor++; +} + +// When the cursor is already at the first byte of an end-of-line sequence, +// consume one or two bytes that compose the sequence. +// +// Basically, a newline is one of: +// +// "\n" +// "\r" +// "\r\n" +// "\n\r" +// +// We always look for the longest match possible. +// +static void _handleNewLineInner(Lexer* lexer, int c) +{ + SLANG_ASSERT(c == '\n' || c == '\r'); + + int d = _peekRaw(lexer); + if ((c ^ d) == ('\n' ^ '\r')) { - int pos = 0; - int c = kEOF; + _advanceRaw(lexer); + } +} - do - { - if (lexer->m_cursor + pos == lexer->m_end) - return kEOF; +// Look ahead one code point, dealing with complications like +// escaped newlines. +static int _peek(Lexer* lexer, int offset = 0) +{ + int pos = 0; + int c = kEOF; + + do + { + if (lexer->m_cursor + pos == lexer->m_end) + return kEOF; - c = lexer->m_cursor[pos++]; + c = lexer->m_cursor[pos++]; - while (c == '\\') + while (c == '\\') + { + // We might have a backslash-escaped newline. + // Look at the next byte (if any) to see. + // + // Note(tfoley): We are assuming a null-terminated input here, + // so that we can safely look at the next byte without issue. + int d = lexer->m_cursor[pos++]; + switch (d) { - // We might have a backslash-escaped newline. - // Look at the next byte (if any) to see. - // - // Note(tfoley): We are assuming a null-terminated input here, - // so that we can safely look at the next byte without issue. - int d = lexer->m_cursor[pos++]; - switch (d) - { - case '\r': case '\n': + case '\r': + case '\n': { // The newline was escaped, so return the code point after *that* int e = lexer->m_cursor[pos++]; @@ -205,1139 +206,1198 @@ namespace Slang c = e; continue; } - default: - break; - } - - // Only continue this while loop in the case where we consumed - // some newlines - break; - } - if (isUtf8LeadingByte((Byte)c)) - { - // Consume all unicode characters. - pos--; - c = getUnicodePointFromUTF8([&]() {return lexer->m_cursor[pos++]; }); + default: break; } - // Default case is to just hand along the byte we read as an ASCII code point. - } while (offset--); - return c; - } + // Only continue this while loop in the case where we consumed + // some newlines + break; + } + if (isUtf8LeadingByte((Byte)c)) + { + // Consume all unicode characters. + pos--; + c = getUnicodePointFromUTF8([&]() { return lexer->m_cursor[pos++]; }); + } + // Default case is to just hand along the byte we read as an ASCII code point. + } while (offset--); + + return c; +} - // Get the next code point from the input, and advance the cursor. - static int _advance(Lexer* lexer) +// Get the next code point from the input, and advance the cursor. +static int _advance(Lexer* lexer) +{ + // We are going to loop, but only as a way of handling + // escaped line endings. + for (;;) { - // We are going to loop, but only as a way of handling - // escaped line endings. - for (;;) - { - // If we are at the end of the input, then the task is easy. - if (lexer->m_cursor == lexer->m_end) - return kEOF; + // If we are at the end of the input, then the task is easy. + if (lexer->m_cursor == lexer->m_end) + return kEOF; - // Look at the next raw byte, and decide what to do - int c = *lexer->m_cursor++; + // Look at the next raw byte, and decide what to do + int c = *lexer->m_cursor++; - if (c == '\\') + if (c == '\\') + { + // We might have a backslash-escaped newline. + // Look at the next byte (if any) to see. + // + // Note(tfoley): We are assuming a null-terminated input here, + // so that we can safely look at the next byte without issue. + int d = *lexer->m_cursor; + switch (d) { - // We might have a backslash-escaped newline. - // Look at the next byte (if any) to see. - // - // Note(tfoley): We are assuming a null-terminated input here, - // so that we can safely look at the next byte without issue. - int d = *lexer->m_cursor; - switch (d) - { - case '\r': case '\n': - // handle the end-of-line for our source location tracking - lexer->m_cursor++; - _handleNewLineInner(lexer, d); - - lexer->m_tokenFlags |= TokenFlag::ScrubbingNeeded; + case '\r': + case '\n': + // handle the end-of-line for our source location tracking + lexer->m_cursor++; + _handleNewLineInner(lexer, d); - // Now try again, looking at the character after the - // escaped newline. - continue; + lexer->m_tokenFlags |= TokenFlag::ScrubbingNeeded; - default: - break; - } - } + // Now try again, looking at the character after the + // escaped newline. + continue; - // Consume all unicode characters. - if (isUtf8LeadingByte((Byte)c)) - { - lexer->m_cursor--; - c = getUnicodePointFromUTF8([&]() {return *lexer->m_cursor++; }); + default: break; } + } - // Default case is to return the raw byte we saw. - return c; + // Consume all unicode characters. + if (isUtf8LeadingByte((Byte)c)) + { + lexer->m_cursor--; + c = getUnicodePointFromUTF8([&]() { return *lexer->m_cursor++; }); } - } - static void _handleNewLine(Lexer* lexer) - { - int c = _advance(lexer); - _handleNewLineInner(lexer, c); + // Default case is to return the raw byte we saw. + return c; } +} + +static void _handleNewLine(Lexer* lexer) +{ + int c = _advance(lexer); + _handleNewLineInner(lexer, c); +} - static void _lexLineComment(Lexer* lexer) +static void _lexLineComment(Lexer* lexer) +{ + for (;;) { - for(;;) + switch (_peek(lexer)) { - switch(_peek(lexer)) - { - case '\n': case '\r': case kEOF: - return; + case '\n': + case '\r': + case kEOF: return; - default: - _advance(lexer); - continue; - } + default: _advance(lexer); continue; } } +} - static void _lexBlockComment(Lexer* lexer) +static void _lexBlockComment(Lexer* lexer) +{ + for (;;) { - for(;;) + switch (_peek(lexer)) { - switch(_peek(lexer)) - { - case kEOF: - // TODO(tfoley) diagnostic! - return; - - case '\n': case '\r': - _handleNewLine(lexer); - continue; + case kEOF: + // TODO(tfoley) diagnostic! + return; - case '*': - _advance(lexer); - switch( _peek(lexer) ) - { - case '/': - _advance(lexer); - return; + case '\n': + case '\r': _handleNewLine(lexer); continue; - default: - continue; - } + case '*': + _advance(lexer); + switch (_peek(lexer)) + { + case '/': _advance(lexer); return; - default: - _advance(lexer); - continue; + default: continue; } + + default: _advance(lexer); continue; } } +} - static void _lexHorizontalSpace(Lexer* lexer) +static void _lexHorizontalSpace(Lexer* lexer) +{ + for (;;) { - for(;;) + switch (_peek(lexer)) { - switch(_peek(lexer)) - { - case ' ': case '\t': - _advance(lexer); - continue; + case ' ': + case '\t': _advance(lexer); continue; - default: - return; - } + default: return; } } +} - static bool isNonAsciiCodePoint(unsigned int codePoint) - { - return codePoint != 0xFFFFFFFF && codePoint >= 0x80; - } +static bool isNonAsciiCodePoint(unsigned int codePoint) +{ + return codePoint != 0xFFFFFFFF && codePoint >= 0x80; +} - static void _lexIdentifier(Lexer* lexer) +static void _lexIdentifier(Lexer* lexer) +{ + for (;;) { - for(;;) + int c = _peek(lexer); + if (('a' <= c) && (c <= 'z') || ('A' <= c) && (c <= 'Z') || ('0' <= c) && (c <= '9') || + (c == '_') || isNonAsciiCodePoint((unsigned int)c)) { - int c = _peek(lexer); - if(('a' <= c ) && (c <= 'z') - || ('A' <= c) && (c <= 'Z') - || ('0' <= c) && (c <= '9') - || (c == '_') - || isNonAsciiCodePoint((unsigned int)c)) - { - _advance(lexer); - continue; - } - return; + _advance(lexer); + continue; } + return; } +} - static SourceLoc _getSourceLoc(Lexer* lexer) - { - return lexer->m_startLoc + (lexer->m_cursor - lexer->m_begin); - } +static SourceLoc _getSourceLoc(Lexer* lexer) +{ + return lexer->m_startLoc + (lexer->m_cursor - lexer->m_begin); +} - static void _lexDigits(Lexer* lexer, int base) +static void _lexDigits(Lexer* lexer, int base) +{ + for (;;) { - for(;;) - { - int c = _peek(lexer); - - int digitVal = 0; - switch(c) - { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - digitVal = c - '0'; - break; + int c = _peek(lexer); - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - if(base <= 10) return; - digitVal = 10 + c - 'a'; - break; - - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - if(base <= 10) return; - digitVal = 10 + c - 'A'; - break; - - default: - // Not more digits! + int digitVal = 0; + switch (c) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': digitVal = c - '0'; break; + + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + if (base <= 10) return; - } + digitVal = 10 + c - 'a'; + break; - if(digitVal >= base) - { - if (auto sink = lexer->getDiagnosticSink()) - { - char buffer[] = { (char) c, 0 }; - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::invalidDigitForBase, buffer, base); - } - } + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + if (base <= 10) + return; + digitVal = 10 + c - 'A'; + break; - _advance(lexer); + default: + // Not more digits! + return; } - } - static TokenType _maybeLexNumberSuffix(Lexer* lexer, TokenType tokenType) - { - // Be liberal in what we accept here, so that figuring out - // the semantics of a numeric suffix is left up to the parser - // and semantic checking logic. - // - for( ;;) + if (digitVal >= base) { - int c = _peek(lexer); - - // Accept any alphanumeric character, plus underscores. - if(('a' <= c ) && (c <= 'z') - || ('A' <= c) && (c <= 'Z') - || ('0' <= c) && (c <= '9') - || (c == '_')) + if (auto sink = lexer->getDiagnosticSink()) { - _advance(lexer); - continue; + char buffer[] = {(char)c, 0}; + sink->diagnose( + _getSourceLoc(lexer), + LexerDiagnostics::invalidDigitForBase, + buffer, + base); } - - // Stop at the first character that isn't - // alphanumeric. - return tokenType; } + + _advance(lexer); } +} - static bool _isNumberExponent(int c, int base) +static TokenType _maybeLexNumberSuffix(Lexer* lexer, TokenType tokenType) +{ + // Be liberal in what we accept here, so that figuring out + // the semantics of a numeric suffix is left up to the parser + // and semantic checking logic. + // + for (;;) { - switch( c ) - { - default: - return false; - - case 'e': case 'E': - if(base != 10) return false; - break; + int c = _peek(lexer); - case 'p': case 'P': - if(base != 16) return false; - break; + // Accept any alphanumeric character, plus underscores. + if (('a' <= c) && (c <= 'z') || ('A' <= c) && (c <= 'Z') || ('0' <= c) && (c <= '9') || + (c == '_')) + { + _advance(lexer); + continue; } - return true; + // Stop at the first character that isn't + // alphanumeric. + return tokenType; } +} - static bool _maybeLexNumberExponent(Lexer* lexer, int base) +static bool _isNumberExponent(int c, int base) +{ + switch (c) { - if (_peek(lexer) == '#') - { - // Special case #INF - const auto inf = toSlice("#INF"); - for (auto c : inf) - { - if (_peek(lexer) != c) - { - return false; - } - _advance(lexer); - } + default: return false; - return true; - } + case 'e': + case 'E': + if (base != 10) + return false; + break; - if(!_isNumberExponent(_peek(lexer), base)) + case 'p': + case 'P': + if (base != 16) return false; + break; + } - // we saw an exponent marker - _advance(lexer); + return true; +} - // Now start to read the exponent - switch( _peek(lexer) ) +static bool _maybeLexNumberExponent(Lexer* lexer, int base) +{ + if (_peek(lexer) == '#') + { + // Special case #INF + const auto inf = toSlice("#INF"); + for (auto c : inf) { - case '+': case '-': + if (_peek(lexer) != c) + { + return false; + } _advance(lexer); - break; } - // TODO(tfoley): it would be an error to not see digits here... - - _lexDigits(lexer, 10); - return true; } - static TokenType _lexNumberAfterDecimalPoint(Lexer* lexer, int base) - { - _lexDigits(lexer, base); - _maybeLexNumberExponent(lexer, base); + if (!_isNumberExponent(_peek(lexer), base)) + return false; - return _maybeLexNumberSuffix(lexer, TokenType::FloatingPointLiteral); - } + // we saw an exponent marker + _advance(lexer); - static TokenType _lexNumber(Lexer* lexer, int base) + // Now start to read the exponent + switch (_peek(lexer)) { - // TODO(tfoley): Need to consider whether to allow any kind of digit separator character. + case '+': + case '-': _advance(lexer); break; + } - TokenType tokenType = TokenType::IntegerLiteral; + // TODO(tfoley): it would be an error to not see digits here... - // At the start of things, we just concern ourselves with digits - _lexDigits(lexer, base); + _lexDigits(lexer, 10); - if( _peek(lexer) == '.' ) - { - switch (_peek(lexer, 1)) - { - // 123.xxxx or 123.rrrr - case 'x': - case 'r': - break; + return true; +} - default: - tokenType = TokenType::FloatingPointLiteral; +static TokenType _lexNumberAfterDecimalPoint(Lexer* lexer, int base) +{ + _lexDigits(lexer, base); + _maybeLexNumberExponent(lexer, base); - _advance(lexer); - _lexDigits(lexer, base); - } - } + return _maybeLexNumberSuffix(lexer, TokenType::FloatingPointLiteral); +} + +static TokenType _lexNumber(Lexer* lexer, int base) +{ + // TODO(tfoley): Need to consider whether to allow any kind of digit separator character. + + TokenType tokenType = TokenType::IntegerLiteral; - if( _maybeLexNumberExponent(lexer, base)) + // At the start of things, we just concern ourselves with digits + _lexDigits(lexer, base); + + if (_peek(lexer) == '.') + { + switch (_peek(lexer, 1)) { + // 123.xxxx or 123.rrrr + case 'x': + case 'r': break; + + default: tokenType = TokenType::FloatingPointLiteral; - } - _maybeLexNumberSuffix(lexer, tokenType); - return tokenType; + _advance(lexer); + _lexDigits(lexer, base); + } } - static int _maybeReadDigit(char const** ioCursor, int base) + if (_maybeLexNumberExponent(lexer, base)) { - auto& cursor = *ioCursor; + tokenType = TokenType::FloatingPointLiteral; + } - for(;;) - { - int c = *cursor; - switch(c) - { - default: - return -1; + _maybeLexNumberSuffix(lexer, tokenType); + return tokenType; +} - // TODO: need to decide on digit separator characters - case '_': - cursor++; - continue; +static int _maybeReadDigit(char const** ioCursor, int base) +{ + auto& cursor = *ioCursor; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - cursor++; - return c - '0'; + for (;;) + { + int c = *cursor; + switch (c) + { + default: return -1; - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - if(base > 10) - { - cursor++; - return 10 + c - 'a'; - } - return -1; + // TODO: need to decide on digit separator characters + case '_': cursor++; continue; - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - if(base > 10) - { - cursor++; - return 10 + c - 'A'; - } - return -1; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': cursor++; return c - '0'; + + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + if (base > 10) + { + cursor++; + return 10 + c - 'a'; + } + return -1; + + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + if (base > 10) + { + cursor++; + return 10 + c - 'A'; } + return -1; } } +} - static int _readOptionalBase(char const** ioCursor) +static int _readOptionalBase(char const** ioCursor) +{ + auto& cursor = *ioCursor; + if (*cursor == '0') { - auto& cursor = *ioCursor; - if( *cursor == '0' ) + cursor++; + switch (*cursor) { - cursor++; - switch(*cursor) - { - case 'x': case 'X': - cursor++; - return 16; + case 'x': + case 'X': cursor++; return 16; - case 'b': case 'B': - cursor++; - return 2; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - return 8; + case 'b': + case 'B': cursor++; return 2; - default: - return 10; - } + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': return 8; + + default: return 10; } - - return 10; } + return 10; +} - IntegerLiteralValue getIntegerLiteralValue(Token const& token, UnownedStringSlice* outSuffix) - { - IntegerLiteralValue value = 0; - - const UnownedStringSlice content = token.getContent(); - - char const* cursor = content.begin(); - char const* end = content.end(); +IntegerLiteralValue getIntegerLiteralValue(Token const& token, UnownedStringSlice* outSuffix) +{ + IntegerLiteralValue value = 0; - int base = _readOptionalBase(&cursor); + const UnownedStringSlice content = token.getContent(); - for( ;;) - { - int digit = _maybeReadDigit(&cursor, base); - if(digit < 0) - break; + char const* cursor = content.begin(); + char const* end = content.end(); - value = value*base + digit; - } + int base = _readOptionalBase(&cursor); - if(outSuffix) - { - *outSuffix = UnownedStringSlice(cursor, end); - } + for (;;) + { + int digit = _maybeReadDigit(&cursor, base); + if (digit < 0) + break; - return value; + value = value * base + digit; } - FloatingPointLiteralValue getFloatingPointLiteralValue(Token const& token, UnownedStringSlice* outSuffix) + if (outSuffix) { - FloatingPointLiteralValue value = 0; + *outSuffix = UnownedStringSlice(cursor, end); + } + + return value; +} + +FloatingPointLiteralValue getFloatingPointLiteralValue( + Token const& token, + UnownedStringSlice* outSuffix) +{ + FloatingPointLiteralValue value = 0; - const UnownedStringSlice content = token.getContent(); + const UnownedStringSlice content = token.getContent(); - char const* cursor = content.begin(); - char const* end = content.end(); + char const* cursor = content.begin(); + char const* end = content.end(); - int radix = _readOptionalBase(&cursor); + int radix = _readOptionalBase(&cursor); - bool seenDot = false; - FloatingPointLiteralValue divisor = 1; - for( ;;) + bool seenDot = false; + FloatingPointLiteralValue divisor = 1; + for (;;) + { + if (*cursor == '.') { - if(*cursor == '.') - { - cursor++; - seenDot = true; - continue; - } + cursor++; + seenDot = true; + continue; + } - int digit = _maybeReadDigit(&cursor, radix); - if(digit < 0) - break; + int digit = _maybeReadDigit(&cursor, radix); + if (digit < 0) + break; - value = value*radix + digit; + value = value * radix + digit; - if(seenDot) - { - divisor *= radix; - } + if (seenDot) + { + divisor *= radix; } + } - if (*cursor == '#') - { - // It must be INF - const auto inf = toSlice("#INF"); + if (*cursor == '#') + { + // It must be INF + const auto inf = toSlice("#INF"); - if (UnownedStringSlice(cursor, end).startsWith(inf)) + if (UnownedStringSlice(cursor, end).startsWith(inf)) + { + if (outSuffix) { - if(outSuffix) - { - *outSuffix = UnownedStringSlice(cursor + inf.getLength(), end); - } + *outSuffix = UnownedStringSlice(cursor + inf.getLength(), end); + } - value = INFINITY; + value = INFINITY; - return value; - } + return value; } + } + + // Now read optional exponent + if (_isNumberExponent(*cursor, radix)) + { + cursor++; - // Now read optional exponent - if(_isNumberExponent(*cursor, radix)) + bool exponentIsNegative = false; + switch (*cursor) { + default: break; + + case '-': + exponentIsNegative = true; cursor++; + break; - bool exponentIsNegative = false; - switch(*cursor) - { - default: - break; + case '+': cursor++; break; + } - case '-': - exponentIsNegative = true; - cursor++; - break; + int exponentRadix = 10; + int exponent = 0; - case '+': - cursor++; + for (;;) + { + int digit = _maybeReadDigit(&cursor, exponentRadix); + if (digit < 0) break; - } - - int exponentRadix = 10; - int exponent = 0; - - for(;;) - { - int digit = _maybeReadDigit(&cursor, exponentRadix); - if(digit < 0) - break; - - exponent = exponent*exponentRadix + digit; - } - - FloatingPointLiteralValue exponentBase = 10; - if(radix == 16) - { - exponentBase = 2; - } - FloatingPointLiteralValue exponentValue = pow(exponentBase, exponent); + exponent = exponent * exponentRadix + digit; + } - if( exponentIsNegative ) - { - divisor *= exponentValue; - } - else - { - value *= exponentValue; - } + FloatingPointLiteralValue exponentBase = 10; + if (radix == 16) + { + exponentBase = 2; } - value /= divisor; + FloatingPointLiteralValue exponentValue = pow(exponentBase, exponent); - if(outSuffix) + if (exponentIsNegative) { - *outSuffix = UnownedStringSlice(cursor, end); + divisor *= exponentValue; } + else + { + value *= exponentValue; + } + } - return value; + value /= divisor; + + if (outSuffix) + { + *outSuffix = UnownedStringSlice(cursor, end); } - static void _lexStringLiteralBody(Lexer* lexer, char quote) + return value; +} + +static void _lexStringLiteralBody(Lexer* lexer, char quote) +{ + for (;;) { - for(;;) + int c = _peek(lexer); + if (c == quote) + { + _advance(lexer); + return; + } + + switch (c) { - int c = _peek(lexer); - if(c == quote) + case kEOF: + if (auto sink = lexer->getDiagnosticSink()) { - _advance(lexer); - return; + sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral); } + return; - switch(c) + case '\n': + case '\r': + if (auto sink = lexer->getDiagnosticSink()) { - case kEOF: - if (auto sink = lexer->getDiagnosticSink()) - { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral); - } - return; + sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::newlineInLiteral); + } + return; - case '\n': case '\r': - if (auto sink = lexer->getDiagnosticSink()) + case '\\': + // Need to handle various escape sequence cases + _advance(lexer); + switch (_peek(lexer)) + { + case '\'': + case '\"': + case '\\': + case '?': + case 'a': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': _advance(lexer); break; + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + // octal escape: up to 3 characters + _advance(lexer); + for (int ii = 0; ii < 3; ++ii) { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::newlineInLiteral); + int d = _peek(lexer); + if (('0' <= d) && (d <= '7')) + { + _advance(lexer); + continue; + } + else + { + break; + } } - return; + break; - case '\\': - // Need to handle various escape sequence cases + case 'x': + // hexadecimal escape: any number of characters _advance(lexer); - switch(_peek(lexer)) + for (;;) { - case '\'': - case '\"': - case '\\': - case '?': - case 'a': - case 'b': - case 'f': - case 'n': - case 'r': - case 't': - case 'v': - _advance(lexer); - break; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': - // octal escape: up to 3 characters - _advance(lexer); - for(int ii = 0; ii < 3; ++ii) + int d = _peek(lexer); + if (('0' <= d) && (d <= '9') || ('a' <= d) && (d <= 'f') || + ('A' <= d) && (d <= 'F')) { - int d = _peek(lexer); - if(('0' <= d) && (d <= '7')) - { - _advance(lexer); - continue; - } - else - { - break; - } + _advance(lexer); + continue; } - break; - - case 'x': - // hexadecimal escape: any number of characters - _advance(lexer); - for(;;) + else { - int d = _peek(lexer); - if(('0' <= d) && (d <= '9') - || ('a' <= d) && (d <= 'f') - || ('A' <= d) && (d <= 'F')) - { - _advance(lexer); - continue; - } - else - { - break; - } + break; } - break; - - // TODO: Unicode escape sequences - } break; - default: - _advance(lexer); - continue; + // TODO: Unicode escape sequences } + break; + + default: _advance(lexer); continue; } } +} - static void _lexRawStringLiteralBody(Lexer* lexer) +static void _lexRawStringLiteralBody(Lexer* lexer) +{ + const char* start = lexer->m_cursor; + const char* endOfDelimiter = nullptr; + for (;;) { - const char* start = lexer->m_cursor; - const char* endOfDelimiter = nullptr; - for (;;) + int c = _peek(lexer); + if (c == '(' && endOfDelimiter == nullptr) + endOfDelimiter = lexer->m_cursor; + if (c == '\"') { - int c = _peek(lexer); - if (c == '(' && endOfDelimiter == nullptr) - endOfDelimiter = lexer->m_cursor; - if (c == '\"') + if (!endOfDelimiter) { - if (!endOfDelimiter) + if (auto sink = lexer->getDiagnosticSink()) { - if (auto sink = lexer->getDiagnosticSink()) - { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::quoteCannotBeDelimiter); - } + sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::quoteCannotBeDelimiter); } - else + } + else + { + auto testStart = lexer->m_cursor - (endOfDelimiter - start); + if (testStart > endOfDelimiter) { - auto testStart = lexer->m_cursor - (endOfDelimiter - start); - if (testStart > endOfDelimiter) + auto testDelimiter = UnownedStringSlice(testStart, lexer->m_cursor); + auto delimiter = UnownedStringSlice(start, endOfDelimiter); + if (*(testStart - 1) == ')' && testDelimiter == delimiter) { - auto testDelimiter = UnownedStringSlice(testStart, lexer->m_cursor); - auto delimiter = UnownedStringSlice(start, endOfDelimiter); - if (*(testStart - 1) == ')' && testDelimiter == delimiter) - { - _advance(lexer); - return; - } + _advance(lexer); + return; } } } + } - switch (c) + switch (c) + { + case kEOF: + if (auto sink = lexer->getDiagnosticSink()) { - case kEOF: - if (auto sink = lexer->getDiagnosticSink()) - { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral); - } - return; - default: - _advance(lexer); - continue; + sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral); } + return; + default: _advance(lexer); continue; } } +} - UnownedStringSlice getRawStringLiteralTokenValue(Token const& token) - { - auto content = token.getContent(); - if (content.getLength() <= 5) - return UnownedStringSlice(); - auto start = content.begin() + 2; - auto delimEnd = start; - while (delimEnd < content.end() && *delimEnd != '(') - delimEnd++; - auto delimLength = delimEnd - start; - auto contentEnd = content.end() - delimLength - 2; - auto contentBegin = start + delimLength + 1; - if (contentEnd <= contentBegin) - return UnownedStringSlice(); - return UnownedStringSlice(contentBegin, contentEnd); - } +UnownedStringSlice getRawStringLiteralTokenValue(Token const& token) +{ + auto content = token.getContent(); + if (content.getLength() <= 5) + return UnownedStringSlice(); + auto start = content.begin() + 2; + auto delimEnd = start; + while (delimEnd < content.end() && *delimEnd != '(') + delimEnd++; + auto delimLength = delimEnd - start; + auto contentEnd = content.end() - delimLength - 2; + auto contentBegin = start + delimLength + 1; + if (contentEnd <= contentBegin) + return UnownedStringSlice(); + return UnownedStringSlice(contentBegin, contentEnd); +} - String getStringLiteralTokenValue(Token const& token) - { - SLANG_ASSERT(token.type == TokenType::StringLiteral - || token.type == TokenType::CharLiteral); +String getStringLiteralTokenValue(Token const& token) +{ + SLANG_ASSERT(token.type == TokenType::StringLiteral || token.type == TokenType::CharLiteral); - if (token.getContent().startsWith("R")) - return getRawStringLiteralTokenValue(token); + if (token.getContent().startsWith("R")) + return getRawStringLiteralTokenValue(token); - const UnownedStringSlice content = token.getContent(); + const UnownedStringSlice content = token.getContent(); - char const* cursor = content.begin(); - char const* end = content.end(); - SLANG_UNREFERENCED_VARIABLE(end); + char const* cursor = content.begin(); + char const* end = content.end(); + SLANG_UNREFERENCED_VARIABLE(end); - auto quote = *cursor++; - SLANG_ASSERT(quote == '\'' || quote == '"'); + auto quote = *cursor++; + SLANG_ASSERT(quote == '\'' || quote == '"'); - StringBuilder valueBuilder; - for(;;) - { - SLANG_ASSERT(cursor != end); + StringBuilder valueBuilder; + for (;;) + { + SLANG_ASSERT(cursor != end); - auto c = *cursor++; + auto c = *cursor++; - // If we see a closing quote, then we are at the end of the string literal - if(c == quote) - { - SLANG_ASSERT(cursor == end); - return valueBuilder.produceString(); - } + // If we see a closing quote, then we are at the end of the string literal + if (c == quote) + { + SLANG_ASSERT(cursor == end); + return valueBuilder.produceString(); + } - // Characters that don't being escape sequences are easy; - // just append them to the buffer and move on. - if(c != '\\') - { - valueBuilder.append(c); - continue; - } + // Characters that don't being escape sequences are easy; + // just append them to the buffer and move on. + if (c != '\\') + { + valueBuilder.append(c); + continue; + } - // Now we look at another character to figure out the kind of - // escape sequence we are dealing with: + // Now we look at another character to figure out the kind of + // escape sequence we are dealing with: - char d = *cursor++; + char d = *cursor++; - switch(d) + switch (d) + { + // Simple characters that just needed to be escaped + case '\'': + case '\"': + case '\\': + case '?': valueBuilder.append(d); continue; + + // Traditional escape sequences for special characters + case 'a': valueBuilder.append('\a'); continue; + case 'b': valueBuilder.append('\b'); continue; + case 'f': valueBuilder.append('\f'); continue; + case 'n': valueBuilder.append('\n'); continue; + case 'r': valueBuilder.append('\r'); continue; + case 't': valueBuilder.append('\t'); continue; + case 'v': valueBuilder.append('\v'); continue; + + // Octal escape: up to 3 characters + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': { - // Simple characters that just needed to be escaped - case '\'': - case '\"': - case '\\': - case '?': - valueBuilder.append(d); - continue; - - // Traditional escape sequences for special characters - case 'a': valueBuilder.append('\a'); continue; - case 'b': valueBuilder.append('\b'); continue; - case 'f': valueBuilder.append('\f'); continue; - case 'n': valueBuilder.append('\n'); continue; - case 'r': valueBuilder.append('\r'); continue; - case 't': valueBuilder.append('\t'); continue; - case 'v': valueBuilder.append('\v'); continue; - - // Octal escape: up to 3 characters - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': + cursor--; + int value = 0; + for (int ii = 0; ii < 3; ++ii) { - cursor--; - int value = 0; - for(int ii = 0; ii < 3; ++ii) + d = *cursor; + if (('0' <= d) && (d <= '7')) { - d = *cursor; - if(('0' <= d) && (d <= '7')) - { - value = value*8 + (d - '0'); + value = value * 8 + (d - '0'); - cursor++; - continue; - } - else - { - break; - } + cursor++; + continue; + } + else + { + break; } - - // TODO: add support for appending an arbitrary code point? - valueBuilder.append((char) value); } - continue; - // Hexadecimal escape: any number of characters - case 'x': + // TODO: add support for appending an arbitrary code point? + valueBuilder.append((char)value); + } + continue; + + // Hexadecimal escape: any number of characters + case 'x': + { + int value = 0; + for (;;) { - int value = 0; - for(;;) + d = *cursor++; + int digitValue = 0; + if (('0' <= d) && (d <= '9')) { - d = *cursor++; - int digitValue = 0; - if(('0' <= d) && (d <= '9')) - { - digitValue = d - '0'; - } - else if( ('a' <= d) && (d <= 'f') ) - { - digitValue = d - 'a'; - } - else if( ('A' <= d) && (d <= 'F') ) - { - digitValue = d - 'A'; - } - else - { - cursor--; - break; - } - - value = value*16 + digitValue; + digitValue = d - '0'; + } + else if (('a' <= d) && (d <= 'f')) + { + digitValue = d - 'a'; + } + else if (('A' <= d) && (d <= 'F')) + { + digitValue = d - 'A'; + } + else + { + cursor--; + break; } - // TODO: add support for appending an arbitrary code point? - valueBuilder.append((char) value); + value = value * 16 + digitValue; } - continue; - - // TODO: Unicode escape sequences + // TODO: add support for appending an arbitrary code point? + valueBuilder.append((char)value); } + continue; + + // TODO: Unicode escape sequences } } +} - String getFileNameTokenValue(Token const& token) - { - const UnownedStringSlice content = token.getContent(); +String getFileNameTokenValue(Token const& token) +{ + const UnownedStringSlice content = token.getContent(); - // A file name usually doesn't process escape sequences - // (this is import on Windows, where `\\` is a valid - // path separator character). + // A file name usually doesn't process escape sequences + // (this is import on Windows, where `\\` is a valid + // path separator character). - // Just trim off the first and last characters to remove the quotes - // (whether they were `""` or `<>`. - return String(content.begin() + 1, content.end() - 1); - } + // Just trim off the first and last characters to remove the quotes + // (whether they were `""` or `<>`. + return String(content.begin() + 1, content.end() - 1); +} - static TokenType _lexTokenImpl(Lexer* lexer) +static TokenType _lexTokenImpl(Lexer* lexer) +{ + int nextCodePoint = _peek(lexer); + switch (nextCodePoint) { - int nextCodePoint = _peek(lexer); - switch(nextCodePoint) - { - default: - break; + default: break; - case kEOF: - return TokenType::EndOfFile; + case kEOF: return TokenType::EndOfFile; + + case '\r': + case '\n': _handleNewLine(lexer); return TokenType::NewLine; - case '\r': case '\n': - _handleNewLine(lexer); - return TokenType::NewLine; + case ' ': + case '\t': _lexHorizontalSpace(lexer); return TokenType::WhiteSpace; - case ' ': case '\t': - _lexHorizontalSpace(lexer); - return TokenType::WhiteSpace; + case '.': + _advance(lexer); + switch (_peek(lexer)) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': return _lexNumberAfterDecimalPoint(lexer, 10); case '.': + // Note: consuming the second `.` here means that + // we cannot back up and return a `.` token by itself + // any more. We thus end up having distinct tokens for + // `.`, `..`, and `...` even though the `..` case is + // not part of HLSL. + // _advance(lexer); - switch(_peek(lexer)) + switch (_peek(lexer)) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - return _lexNumberAfterDecimalPoint(lexer, 10); - - case '.': - // Note: consuming the second `.` here means that - // we cannot back up and return a `.` token by itself - // any more. We thus end up having distinct tokens for - // `.`, `..`, and `...` even though the `..` case is - // not part of HLSL. - // - _advance(lexer); - switch(_peek(lexer)) - { - case '.': - _advance(lexer); - return TokenType::Ellipsis; + case '.': _advance(lexer); return TokenType::Ellipsis; - default: - return TokenType::DotDot; - } - - default: - return TokenType::Dot; + default: return TokenType::DotDot; } - case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - return _lexNumber(lexer, 10); + default: return TokenType::Dot; + } - case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': return _lexNumber(lexer, 10); + + case '0': + { + auto loc = _getSourceLoc(lexer); + _advance(lexer); + switch (_peek(lexer)) { - auto loc = _getSourceLoc(lexer); - _advance(lexer); - switch(_peek(lexer)) - { - default: - return _maybeLexNumberSuffix(lexer, TokenType::IntegerLiteral); - - case '.': - switch (_peek(lexer, 1)) - { - // 0.xxxx or 0.rrrr - case 'x': - case 'r': - return _maybeLexNumberSuffix(lexer, TokenType::IntegerLiteral); - default: - _advance(lexer); - return _lexNumberAfterDecimalPoint(lexer, 10); - } - - case 'x': case 'X': - _advance(lexer); - return _lexNumber(lexer, 16); + default: return _maybeLexNumberSuffix(lexer, TokenType::IntegerLiteral); - case 'b': case 'B': - _advance(lexer); - return _lexNumber(lexer, 2); - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - if (auto sink = lexer->getDiagnosticSink()) - { - sink->diagnose(loc, LexerDiagnostics::octalLiteral); - } - return _lexNumber(lexer, 8); + case '.': + switch (_peek(lexer, 1)) + { + // 0.xxxx or 0.rrrr + case 'x': + case 'r': return _maybeLexNumberSuffix(lexer, TokenType::IntegerLiteral); + default: _advance(lexer); return _lexNumberAfterDecimalPoint(lexer, 10); } - } - case 'a': case 'b': case 'c': case 'd': case 'e': - case 'f': case 'g': case 'h': case 'i': case 'j': - case 'k': case 'l': case 'm': case 'n': case 'o': - case 'p': case 'q': case 'r': case 's': case 't': - case 'u': case 'v': case 'w': case 'x': case 'y': - case 'z': - case 'A': case 'B': case 'C': case 'D': case 'E': - case 'F': case 'G': case 'H': case 'I': case 'J': - case 'K': case 'L': case 'M': case 'N': case 'O': - case 'P': case 'Q': case 'S': case 'T': - case 'U': case 'V': case 'W': case 'X': case 'Y': - case 'Z': - case '_': - _lexIdentifier(lexer); - return TokenType::Identifier; - case 'R': - _advance(lexer); - switch (_peek(lexer)) - { - default: - _lexIdentifier(lexer); - return TokenType::Identifier; - case '\"': - _advance(lexer); - _lexRawStringLiteralBody(lexer); - return TokenType::StringLiteral; + case 'x': + case 'X': _advance(lexer); return _lexNumber(lexer, 16); + + case 'b': + case 'B': _advance(lexer); return _lexNumber(lexer, 2); + + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (auto sink = lexer->getDiagnosticSink()) + { + sink->diagnose(loc, LexerDiagnostics::octalLiteral); + } + return _lexNumber(lexer, 8); } + } + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'g': + case 'h': + case 'i': + case 'j': + case 'k': + case 'l': + case 'm': + case 'n': + case 'o': + case 'p': + case 'q': + case 'r': + case 's': + case 't': + case 'u': + case 'v': + case 'w': + case 'x': + case 'y': + case 'z': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + case 'G': + case 'H': + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'O': + case 'P': + case 'Q': + case 'S': + case 'T': + case 'U': + case 'V': + case 'W': + case 'X': + case 'Y': + case 'Z': + case '_': _lexIdentifier(lexer); return TokenType::Identifier; + case 'R': + _advance(lexer); + switch (_peek(lexer)) + { + default: _lexIdentifier(lexer); return TokenType::Identifier; case '\"': _advance(lexer); - _lexStringLiteralBody(lexer, '\"'); + _lexRawStringLiteralBody(lexer); return TokenType::StringLiteral; + } - case '\'': - _advance(lexer); - _lexStringLiteralBody(lexer, '\''); - return TokenType::CharLiteral; + case '\"': + _advance(lexer); + _lexStringLiteralBody(lexer, '\"'); + return TokenType::StringLiteral; + case '\'': + _advance(lexer); + _lexStringLiteralBody(lexer, '\''); + return TokenType::CharLiteral; - case '+': - _advance(lexer); - switch(_peek(lexer)) - { - case '+': _advance(lexer); return TokenType::OpInc; - case '=': _advance(lexer); return TokenType::OpAddAssign; - default: - return TokenType::OpAdd; - } - case '-': - _advance(lexer); - switch(_peek(lexer)) - { - case '-': _advance(lexer); return TokenType::OpDec; - case '=': _advance(lexer); return TokenType::OpSubAssign; - case '>': _advance(lexer); return TokenType::RightArrow; - default: - return TokenType::OpSub; - } + case '+': + _advance(lexer); + switch (_peek(lexer)) + { + case '+': _advance(lexer); return TokenType::OpInc; + case '=': _advance(lexer); return TokenType::OpAddAssign; + default: return TokenType::OpAdd; + } - case '*': - _advance(lexer); - switch(_peek(lexer)) - { - case '=': _advance(lexer); return TokenType::OpMulAssign; - default: - return TokenType::OpMul; - } + case '-': + _advance(lexer); + switch (_peek(lexer)) + { + case '-': _advance(lexer); return TokenType::OpDec; + case '=': _advance(lexer); return TokenType::OpSubAssign; + case '>': _advance(lexer); return TokenType::RightArrow; + default: return TokenType::OpSub; + } + + case '*': + _advance(lexer); + switch (_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpMulAssign; + default: return TokenType::OpMul; + } + case '/': + _advance(lexer); + switch (_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpDivAssign; case '/': _advance(lexer); - switch(_peek(lexer)) - { - case '=': _advance(lexer); return TokenType::OpDivAssign; - case '/': _advance(lexer); _lexLineComment(lexer); return TokenType::LineComment; - case '*': _advance(lexer); _lexBlockComment(lexer); return TokenType::BlockComment; - default: - return TokenType::OpDiv; - } - - case '%': + _lexLineComment(lexer); + return TokenType::LineComment; + case '*': _advance(lexer); - switch(_peek(lexer)) - { - case '=': _advance(lexer); return TokenType::OpModAssign; - default: - return TokenType::OpMod; - } + _lexBlockComment(lexer); + return TokenType::BlockComment; + default: return TokenType::OpDiv; + } - case '|': - _advance(lexer); - switch(_peek(lexer)) - { - case '|': _advance(lexer); return TokenType::OpOr; - case '=': _advance(lexer); return TokenType::OpOrAssign; - default: - return TokenType::OpBitOr; - } + case '%': + _advance(lexer); + switch (_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpModAssign; + default: return TokenType::OpMod; + } - case '&': - _advance(lexer); - switch(_peek(lexer)) - { - case '&': _advance(lexer); return TokenType::OpAnd; - case '=': _advance(lexer); return TokenType::OpAndAssign; - default: - return TokenType::OpBitAnd; - } + case '|': + _advance(lexer); + switch (_peek(lexer)) + { + case '|': _advance(lexer); return TokenType::OpOr; + case '=': _advance(lexer); return TokenType::OpOrAssign; + default: return TokenType::OpBitOr; + } - case '^': - _advance(lexer); - switch(_peek(lexer)) - { - case '=': _advance(lexer); return TokenType::OpXorAssign; - default: - return TokenType::OpBitXor; - } + case '&': + _advance(lexer); + switch (_peek(lexer)) + { + case '&': _advance(lexer); return TokenType::OpAnd; + case '=': _advance(lexer); return TokenType::OpAndAssign; + default: return TokenType::OpBitAnd; + } + case '^': + _advance(lexer); + switch (_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpXorAssign; + default: return TokenType::OpBitXor; + } + + case '>': + _advance(lexer); + switch (_peek(lexer)) + { case '>': _advance(lexer); - switch(_peek(lexer)) + switch (_peek(lexer)) { - case '>': - _advance(lexer); - switch(_peek(lexer)) - { - case '=': _advance(lexer); return TokenType::OpShrAssign; - default: return TokenType::OpRsh; - } - case '=': _advance(lexer); return TokenType::OpGeq; - default: - return TokenType::OpGreater; + case '=': _advance(lexer); return TokenType::OpShrAssign; + default: return TokenType::OpRsh; } + case '=': _advance(lexer); return TokenType::OpGeq; + default: return TokenType::OpGreater; + } + case '<': + _advance(lexer); + switch (_peek(lexer)) + { case '<': _advance(lexer); - switch(_peek(lexer)) + switch (_peek(lexer)) { - case '<': - _advance(lexer); - switch(_peek(lexer)) - { - case '=': _advance(lexer); return TokenType::OpShlAssign; - default: return TokenType::OpLsh; - } - case '=': _advance(lexer); return TokenType::OpLeq; - default: - return TokenType::OpLess; + case '=': _advance(lexer); return TokenType::OpShlAssign; + default: return TokenType::OpLsh; } + case '=': _advance(lexer); return TokenType::OpLeq; + default: return TokenType::OpLess; + } - case '=': - _advance(lexer); - switch(_peek(lexer)) - { - case '=': _advance(lexer); return TokenType::OpEql; - default: - return TokenType::OpAssign; - } + case '=': + _advance(lexer); + switch (_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpEql; + default: return TokenType::OpAssign; + } - case '!': - _advance(lexer); - switch(_peek(lexer)) - { - case '=': _advance(lexer); return TokenType::OpNeq; - default: - return TokenType::OpNot; - } + case '!': + _advance(lexer); + switch (_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpNeq; + default: return TokenType::OpNot; + } - case '#': - _advance(lexer); - switch(_peek(lexer)) - { - case '#': _advance(lexer); return TokenType::PoundPound; + case '#': + _advance(lexer); + switch (_peek(lexer)) + { + case '#': _advance(lexer); return TokenType::PoundPound; - case '?': _advance(lexer); return TokenType::CompletionRequest; + case '?': _advance(lexer); return TokenType::CompletionRequest; - default: - return TokenType::Pound; - } + default: return TokenType::Pound; + } - case '~': _advance(lexer); return TokenType::OpBitNot; + case '~': _advance(lexer); return TokenType::OpBitNot; - case ':': + case ':': { _advance(lexer); if (_peek(lexer) == ':') @@ -1347,151 +1407,154 @@ namespace Slang } return TokenType::Colon; } - case ';': _advance(lexer); return TokenType::Semicolon; - case ',': _advance(lexer); return TokenType::Comma; - - case '{': _advance(lexer); return TokenType::LBrace; - case '}': _advance(lexer); return TokenType::RBrace; - case '[': _advance(lexer); return TokenType::LBracket; - case ']': _advance(lexer); return TokenType::RBracket; - case '(': _advance(lexer); return TokenType::LParent; - case ')': _advance(lexer); return TokenType::RParent; - - case '?': _advance(lexer); return TokenType::QuestionMark; - case '@': _advance(lexer); return TokenType::At; - case '$': + case ';': _advance(lexer); return TokenType::Semicolon; + case ',': _advance(lexer); return TokenType::Comma; + + case '{': _advance(lexer); return TokenType::LBrace; + case '}': _advance(lexer); return TokenType::RBrace; + case '[': _advance(lexer); return TokenType::LBracket; + case ']': _advance(lexer); return TokenType::RBracket; + case '(': _advance(lexer); return TokenType::LParent; + case ')': _advance(lexer); return TokenType::RParent; + + case '?': _advance(lexer); return TokenType::QuestionMark; + case '@': _advance(lexer); return TokenType::At; + case '$': { _advance(lexer); - if(_peek(lexer) == '$') + if (_peek(lexer) == '$') { _advance(lexer); return TokenType::DollarDollar; } return TokenType::Dollar; } + } - } - - // We treat all unicode characters as a part of an identifier. - if (isNonAsciiCodePoint(nextCodePoint)) - { - _lexIdentifier(lexer); - return TokenType::Identifier; - } + // We treat all unicode characters as a part of an identifier. + if (isNonAsciiCodePoint(nextCodePoint)) + { + _lexIdentifier(lexer); + return TokenType::Identifier; + } - { - // If none of the above cases matched, then we have an - // unexpected/invalid character. + { + // If none of the above cases matched, then we have an + // unexpected/invalid character. - auto loc = _getSourceLoc(lexer); - int c = _advance(lexer); + auto loc = _getSourceLoc(lexer); + int c = _advance(lexer); - if (auto sink = lexer->getDiagnosticSink()) + if (auto sink = lexer->getDiagnosticSink()) + { + if (c >= 0x20 && c <= 0x7E) { - if(c >= 0x20 && c <= 0x7E) - { - char buffer[] = { (char) c, 0 }; - sink->diagnose(loc, LexerDiagnostics::illegalCharacterPrint, buffer); - } - else if(c == kEOF) - { - sink->diagnose(loc, LexerDiagnostics::unexpectedEndOfInput); - } - else - { - // Fallback: print as hexadecimal - sink->diagnose(loc, LexerDiagnostics::illegalCharacterHex, String((unsigned char)c, 16)); - } + char buffer[] = {(char)c, 0}; + sink->diagnose(loc, LexerDiagnostics::illegalCharacterPrint, buffer); + } + else if (c == kEOF) + { + sink->diagnose(loc, LexerDiagnostics::unexpectedEndOfInput); + } + else + { + // Fallback: print as hexadecimal + sink->diagnose( + loc, + LexerDiagnostics::illegalCharacterHex, + String((unsigned char)c, 16)); } - - return TokenType::Invalid; } + + return TokenType::Invalid; } +} - Token Lexer::lexToken() +Token Lexer::lexToken() +{ + for (;;) { - for(;;) - { - Token token; - token.loc = _getSourceLoc(this); + Token token; + token.loc = _getSourceLoc(this); - char const* textBegin = m_cursor; + char const* textBegin = m_cursor; - auto tokenType = _lexTokenImpl(this); + auto tokenType = _lexTokenImpl(this); - // The flags on the token we just lexed will be based - // on the current state of the lexer. - // - auto tokenFlags = m_tokenFlags; - // - // Depending on what kind of token we just lexed, the - // flags that will be used for the *next* token might - // need to be updated. - // - switch(tokenType) + // The flags on the token we just lexed will be based + // on the current state of the lexer. + // + auto tokenFlags = m_tokenFlags; + // + // Depending on what kind of token we just lexed, the + // flags that will be used for the *next* token might + // need to be updated. + // + switch (tokenType) + { + case TokenType::NewLine: { - case TokenType::NewLine: - { - // If we just reached the end of a line, then the next token - // should count as being at the start of a line, and also after - // whitespace. - // - m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; - break; - } + // If we just reached the end of a line, then the next token + // should count as being at the start of a line, and also after + // whitespace. + // + m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; + break; + } - case TokenType::WhiteSpace: - case TokenType::BlockComment: - case TokenType::LineComment: - { - // True horizontal whitespace and comments both count as whitespace. - // - // Note that a line comment does not include the terminating newline, - // we do not need to set `AtStartOfLine` here. - // - m_tokenFlags |= TokenFlag::AfterWhitespace; - break; - } - - default: - { - // If we read some token other then the above cases, then we are - // neither after whitespace nor at the start of a line. - // - m_tokenFlags = 0; - break; - } + case TokenType::WhiteSpace: + case TokenType::BlockComment: + case TokenType::LineComment: + { + // True horizontal whitespace and comments both count as whitespace. + // + // Note that a line comment does not include the terminating newline, + // we do not need to set `AtStartOfLine` here. + // + m_tokenFlags |= TokenFlag::AfterWhitespace; + break; + } + + default: + { + // If we read some token other then the above cases, then we are + // neither after whitespace nor at the start of a line. + // + m_tokenFlags = 0; + break; } + } - token.type = tokenType; - token.flags = tokenFlags; + token.type = tokenType; + token.flags = tokenFlags; - char const* textEnd = m_cursor; + char const* textEnd = m_cursor; - // Note(tfoley): `StringBuilder::Append()` seems to crash when appending zero bytes - if(textEnd != textBegin) + // Note(tfoley): `StringBuilder::Append()` seems to crash when appending zero bytes + if (textEnd != textBegin) + { + // "scrubbing" token value here to remove escaped newlines... + // + // Only perform this work if we encountered an escaped newline + // while lexing this token (e.g., keep a flag on the lexer), or + // do it on-demand when the actual value of the token is needed. + if (tokenFlags & TokenFlag::ScrubbingNeeded) { - // "scrubbing" token value here to remove escaped newlines... - // - // Only perform this work if we encountered an escaped newline - // while lexing this token (e.g., keep a flag on the lexer), or - // do it on-demand when the actual value of the token is needed. - if (tokenFlags & TokenFlag::ScrubbingNeeded) - { - // Allocate space that will always be more than enough for stripped contents - char* startDst = (char*)m_memoryArena->allocateUnaligned(textEnd - textBegin); - char* dst = startDst; + // Allocate space that will always be more than enough for stripped contents + char* startDst = (char*)m_memoryArena->allocateUnaligned(textEnd - textBegin); + char* dst = startDst; - auto tt = textBegin; - while (tt != textEnd) + auto tt = textBegin; + while (tt != textEnd) + { + char c = *tt++; + if (c == '\\') { - char c = *tt++; - if (c == '\\') + char d = *tt; + switch (d) { - char d = *tt; - switch (d) - { - case '\r': case '\n': + case '\r': + case '\n': { tt++; char e = *tt; @@ -1502,116 +1565,111 @@ namespace Slang } continue; - default: - break; - } + default: break; } - *dst++ = c; } - token.setContent(UnownedStringSlice(startDst, dst)); - } - else - { - token.setContent(UnownedStringSlice(textBegin, textEnd)); + *dst++ = c; } + token.setContent(UnownedStringSlice(startDst, dst)); } - - if (m_namePool) + else { - if (tokenType == TokenType::Identifier || tokenType == TokenType::CompletionRequest) - { - token.setName(m_namePool->getName(token.getContent())); - } + token.setContent(UnownedStringSlice(textBegin, textEnd)); } - - return token; } - } - TokenList Lexer::lexAllSemanticTokens() - { - TokenList tokenList; - for(;;) + if (m_namePool) { - Token token = lexToken(); - - // We are only interested intokens that are semantically - // significant, so we will skip over forms of whitespace - // and comments. - // - switch( token.type ) + if (tokenType == TokenType::Identifier || tokenType == TokenType::CompletionRequest) { - default: - break; - - case TokenType::WhiteSpace: - case TokenType::BlockComment: - case TokenType::LineComment: - case TokenType::NewLine: - continue; + token.setName(m_namePool->getName(token.getContent())); } - - tokenList.add(token); - if(token.type == TokenType::EndOfFile) - return tokenList; } + + return token; } +} - TokenList Lexer::lexAllMarkupTokens() +TokenList Lexer::lexAllSemanticTokens() +{ + TokenList tokenList; + for (;;) { - TokenList tokenList; - for(;;) - { - Token token = lexToken(); - switch( token.type ) - { - default: - break; + Token token = lexToken(); - case TokenType::WhiteSpace: - case TokenType::NewLine: - continue; - } + // We are only interested intokens that are semantically + // significant, so we will skip over forms of whitespace + // and comments. + // + switch (token.type) + { + default: break; - tokenList.add(token); - if(token.type == TokenType::EndOfFile) - return tokenList; + case TokenType::WhiteSpace: + case TokenType::BlockComment: + case TokenType::LineComment: + case TokenType::NewLine: continue; } + + tokenList.add(token); + if (token.type == TokenType::EndOfFile) + return tokenList; } +} - /* static */UnownedStringSlice Lexer::sourceLocationLexer(const UnownedStringSlice& in) +TokenList Lexer::lexAllMarkupTokens() +{ + TokenList tokenList; + for (;;) { - Lexer lexer; + Token token = lexToken(); + switch (token.type) + { + default: break; - SourceManager sourceManager; - sourceManager.initialize(nullptr, nullptr); + case TokenType::WhiteSpace: + case TokenType::NewLine: continue; + } - auto sourceFile = sourceManager.createSourceFileWithString(PathInfo::makeUnknown(), in); - auto sourceView = sourceManager.createSourceView(sourceFile, nullptr, SourceLoc::fromRaw(0)); + tokenList.add(token); + if (token.type == TokenType::EndOfFile) + return tokenList; + } +} - DiagnosticSink sink(&sourceManager, nullptr); +/* static */ UnownedStringSlice Lexer::sourceLocationLexer(const UnownedStringSlice& in) +{ + Lexer lexer; - MemoryArena arena; + SourceManager sourceManager; + sourceManager.initialize(nullptr, nullptr); - RootNamePool rootNamePool; - NamePool namePool; - namePool.setRootNamePool(&rootNamePool); + auto sourceFile = sourceManager.createSourceFileWithString(PathInfo::makeUnknown(), in); + auto sourceView = sourceManager.createSourceView(sourceFile, nullptr, SourceLoc::fromRaw(0)); - lexer.initialize(sourceView, &sink, &namePool, &arena); + DiagnosticSink sink(&sourceManager, nullptr); - Token tok = lexer.lexToken(); + MemoryArena arena; - if (tok.type == TokenType::Invalid) - { - return UnownedStringSlice(); - } + RootNamePool rootNamePool; + NamePool namePool; + namePool.setRootNamePool(&rootNamePool); - const int offset = sourceView->getRange().getOffset(tok.loc); + lexer.initialize(sourceView, &sink, &namePool, &arena); - SLANG_ASSERT(offset >= 0 && offset <= in.getLength()); - SLANG_ASSERT(Index(offset + tok.charsCount) <= in.getLength()); + Token tok = lexer.lexToken(); - return UnownedStringSlice(in.begin() + offset, in.begin() + offset + tok.charsCount); + if (tok.type == TokenType::Invalid) + { + return UnownedStringSlice(); } + const int offset = sourceView->getRange().getOffset(tok.loc); + + SLANG_ASSERT(offset >= 0 && offset <= in.getLength()); + SLANG_ASSERT(Index(offset + tok.charsCount) <= in.getLength()); + + return UnownedStringSlice(in.begin() + offset, in.begin() + offset + tok.charsCount); } + +} // namespace Slang |
