From fa31d21ba92669a521a7768467246918e3947e02 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Thu, 1 Apr 2021 13:39:11 -0400 Subject: Added compiler-core project (#1775) * #include an absolute path didn't work - because paths were taken to always be relative. * Split out compiler-core initially with just slang-source-loc.cpp * More lexer, name, token to compiler-core. * Split Lexer and Core diagnostics. * Move slang-file-system to core. * Add slang-file-system to core. * More DownstreamCompiler into compiler-core * Fix typo. * Add compiler-core to bootstrap proj. * Small fixes to premake * For linux try with compiler-core * Remove compiler-core from examples. * Added NameConventionUtil to compiler-core * Add global function to CharUtil to *hopefully* avoid linking issue. * Hack to make linkage of CharUtil work on linux. --- source/compiler-core/slang-lexer.cpp | 1400 ++++++++++++++++++++++++++++++++++ 1 file changed, 1400 insertions(+) create mode 100644 source/compiler-core/slang-lexer.cpp (limited to 'source/compiler-core/slang-lexer.cpp') diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp new file mode 100644 index 000000000..83b8e0eec --- /dev/null +++ b/source/compiler-core/slang-lexer.cpp @@ -0,0 +1,1400 @@ +// slang-lexer.cpp +#include "slang-lexer.h" + +// This file implements the lexer/scanner, which is responsible for taking a raw stream of +// input bytes and turning it into semantically useful tokens. +// + +#include "slang-name.h" +#include "slang-source-loc.h" + +#include "slang-core-diagnostics.h" + +namespace Slang +{ + Token TokenReader::getEndOfFileToken() + { + return Token(TokenType::EndOfFile, UnownedStringSlice::fromLiteral(""), SourceLoc()); + } + + const Token* TokenList::begin() const + { + SLANG_ASSERT(m_tokens.getCount()); + return &m_tokens[0]; + } + + const Token* TokenList::end() const + { + SLANG_ASSERT(m_tokens.getCount()); + SLANG_ASSERT(m_tokens[m_tokens.getCount() - 1].type == TokenType::EndOfFile); + return &m_tokens[m_tokens.getCount() - 1]; + } + + TokenSpan::TokenSpan() + : m_begin(nullptr) + , m_end (nullptr) + {} + + TokenReader::TokenReader() + : m_cursor(nullptr) + , m_end (nullptr) + {} + + + Token& TokenReader::peekToken() + { + return m_nextToken; + } + + TokenType TokenReader::peekTokenType() const + { + return m_nextToken.type; + } + + SourceLoc TokenReader::peekLoc() const + { + return m_nextToken.loc; + } + + Token TokenReader::advanceToken() + { + if (!m_cursor) + return getEndOfFileToken(); + + Token token = m_nextToken; + if (m_cursor < m_end) + { + m_cursor++; + m_nextToken = *m_cursor; + } + else + m_nextToken.type = TokenType::EndOfFile; + return token; + } + + // Lexer + + void Lexer::initialize( + SourceView* sourceView, + DiagnosticSink* sink, + NamePool* namePool, + MemoryArena* memoryArena, + OptionFlags optionFlags) + { + m_sourceView = sourceView; + m_sink = sink; + m_namePool = namePool; + m_memoryArena = memoryArena; + + auto content = sourceView->getContent(); + + m_begin = content.begin(); + m_cursor = content.begin(); + m_end = content.end(); + + // Set the start location + m_startLoc = sourceView->getRange().begin; + + m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; + m_lexerFlags = 0; + m_optionFlags = optionFlags; + } + + Lexer::~Lexer() + { + } + + enum { kEOF = -1 }; + + // Get the next input byte, without any handling of + // escaped newlines, non-ASCII code points, source locations, etc. + static int _peekRaw(Lexer* lexer) + { + // If we are at the end of the input, return a designated end-of-file value + if(lexer->m_cursor == lexer->m_end) + return kEOF; + + // Otherwise, just look at the next byte + return *lexer->m_cursor; + } + + // Read one input byte without any special handling (similar to `peekRaw`) + static int _advanceRaw(Lexer* lexer) + { + // The logic here is basically the same as for `peekRaw()`, + // escape we advance `cursor` if we aren't at the end. + + if (lexer->m_cursor == lexer->m_end) + return kEOF; + + return *lexer->m_cursor++; + } + + // When the cursor is already at the first byte of an end-of-line sequence, + // consume one or two bytes that compose the sequence. + // + // Basically, a newline is one of: + // + // "\n" + // "\r" + // "\r\n" + // "\n\r" + // + // We always look for the longest match possible. + // + static void _handleNewLineInner(Lexer* lexer, int c) + { + SLANG_ASSERT(c == '\n' || c == '\r'); + + int d = _peekRaw(lexer); + if( (c ^ d) == ('\n' ^ '\r') ) + { + _advanceRaw(lexer); + } + } + + // Look ahead one code point, dealing with complications like + // escaped newlines. + static int _peek(Lexer* lexer) + { + // Look at the next raw byte, and decide what to do + int c = _peekRaw(lexer); + + if(c == '\\') + { + // We might have a backslash-escaped newline. + // Look at the next byte (if any) to see. + // + // Note(tfoley): We are assuming a null-terminated input here, + // so that we can safely look at the next byte without issue. + int d = lexer->m_cursor[1]; + switch (d) + { + case '\r': case '\n': + { + // The newline was escaped, so return the code point after *that* + + int e = lexer->m_cursor[2]; + if ((d ^ e) == ('\r' ^ '\n')) + return lexer->m_cursor[3]; + return e; + } + + default: + break; + } + } + // TODO: handle UTF-8 encoding for non-ASCII code points here + + // Default case is to just hand along the byte we read as an ASCII code point. + return c; + } + + // Get the next code point from the input, and advance the cursor. + static int _advance(Lexer* lexer) + { + // We are going to loop, but only as a way of handling + // escaped line endings. + for (;;) + { + // If we are at the end of the input, then the task is easy. + if (lexer->m_cursor == lexer->m_end) + return kEOF; + + // Look at the next raw byte, and decide what to do + int c = *lexer->m_cursor++; + + if (c == '\\') + { + // We might have a backslash-escaped newline. + // Look at the next byte (if any) to see. + // + // Note(tfoley): We are assuming a null-terminated input here, + // so that we can safely look at the next byte without issue. + int d = *lexer->m_cursor; + switch (d) + { + case '\r': case '\n': + // handle the end-of-line for our source location tracking + lexer->m_cursor++; + _handleNewLineInner(lexer, d); + + lexer->m_tokenFlags |= TokenFlag::ScrubbingNeeded; + + // Now try again, looking at the character after the + // escaped newline. + continue; + + default: + break; + } + } + + // TODO: Need to handle non-ASCII code points. + + // Default case is to return the raw byte we saw. + return c; + } + } + + static void _handleNewLine(Lexer* lexer) + { + int c = _advance(lexer); + _handleNewLineInner(lexer, c); + } + + static void _lexLineComment(Lexer* lexer) + { + for(;;) + { + switch(_peek(lexer)) + { + case '\n': case '\r': case kEOF: + return; + + default: + _advance(lexer); + continue; + } + } + } + + static void _lexBlockComment(Lexer* lexer) + { + for(;;) + { + switch(_peek(lexer)) + { + case kEOF: + // TODO(tfoley) diagnostic! + return; + + case '\n': case '\r': + _handleNewLine(lexer); + continue; + + case '*': + _advance(lexer); + switch( _peek(lexer) ) + { + case '/': + _advance(lexer); + return; + + default: + continue; + } + + default: + _advance(lexer); + continue; + } + } + } + + static void _lexHorizontalSpace(Lexer* lexer) + { + for(;;) + { + switch(_peek(lexer)) + { + case ' ': case '\t': + _advance(lexer); + continue; + + default: + return; + } + } + } + + static void _lexIdentifier(Lexer* lexer) + { + for(;;) + { + int c = _peek(lexer); + if(('a' <= c ) && (c <= 'z') + || ('A' <= c) && (c <= 'Z') + || ('0' <= c) && (c <= '9') + || (c == '_')) + { + _advance(lexer); + continue; + } + + return; + } + } + + static SourceLoc _getSourceLoc(Lexer* lexer) + { + return lexer->m_startLoc + (lexer->m_cursor - lexer->m_begin); + } + + static void _lexDigits(Lexer* lexer, int base) + { + for(;;) + { + int c = _peek(lexer); + + int digitVal = 0; + switch(c) + { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + digitVal = c - '0'; + break; + + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + if(base <= 10) return; + digitVal = 10 + c - 'a'; + break; + + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + if(base <= 10) return; + digitVal = 10 + c - 'A'; + break; + + default: + // Not more digits! + return; + } + + if(digitVal >= base) + { + char buffer[] = { (char) c, 0 }; + lexer->m_sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::invalidDigitForBase, buffer, base); + } + + _advance(lexer); + } + } + + static TokenType _maybeLexNumberSuffix(Lexer* lexer, TokenType tokenType) + { + // Be liberal in what we accept here, so that figuring out + // the semantics of a numeric suffix is left up to the parser + // and semantic checking logic. + // + for( ;;) + { + int c = _peek(lexer); + + // Accept any alphanumeric character, plus underscores. + if(('a' <= c ) && (c <= 'z') + || ('A' <= c) && (c <= 'Z') + || ('0' <= c) && (c <= '9') + || (c == '_')) + { + _advance(lexer); + continue; + } + + // Stop at the first character that isn't + // alphanumeric. + return tokenType; + } + } + + static bool _isNumberExponent(int c, int base) + { + switch( c ) + { + default: + return false; + + case 'e': case 'E': + if(base != 10) return false; + break; + + case 'p': case 'P': + if(base != 16) return false; + break; + } + + return true; + } + + static bool _maybeLexNumberExponent(Lexer* lexer, int base) + { + if(!_isNumberExponent(_peek(lexer), base)) + return false; + + // we saw an exponent marker + _advance(lexer); + + // Now start to read the exponent + switch( _peek(lexer) ) + { + case '+': case '-': + _advance(lexer); + break; + } + + // TODO(tfoley): it would be an error to not see digits here... + + _lexDigits(lexer, 10); + + return true; + } + + static TokenType _lexNumberAfterDecimalPoint(Lexer* lexer, int base) + { + _lexDigits(lexer, base); + _maybeLexNumberExponent(lexer, base); + + return _maybeLexNumberSuffix(lexer, TokenType::FloatingPointLiteral); + } + + static TokenType _lexNumber(Lexer* lexer, int base) + { + // TODO(tfoley): Need to consider whehter to allow any kind of digit separator character. + + TokenType tokenType = TokenType::IntegerLiteral; + + // At the start of things, we just concern ourselves with digits + _lexDigits(lexer, base); + + if( _peek(lexer) == '.' ) + { + tokenType = TokenType::FloatingPointLiteral; + + _advance(lexer); + _lexDigits(lexer, base); + } + + if( _maybeLexNumberExponent(lexer, base)) + { + tokenType = TokenType::FloatingPointLiteral; + } + + _maybeLexNumberSuffix(lexer, tokenType); + return tokenType; + } + + static int _maybeReadDigit(char const** ioCursor, int base) + { + auto& cursor = *ioCursor; + + for(;;) + { + int c = *cursor; + switch(c) + { + default: + return -1; + + // TODO: need to decide on digit separator characters + case '_': + cursor++; + continue; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + cursor++; + return c - '0'; + + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + if(base > 10) + { + cursor++; + return 10 + c - 'a'; + } + return -1; + + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + if(base > 10) + { + cursor++; + return 10 + c - 'A'; + } + return -1; + } + } + } + + static int _readOptionalBase(char const** ioCursor) + { + auto& cursor = *ioCursor; + if( *cursor == '0' ) + { + cursor++; + switch(*cursor) + { + case 'x': case 'X': + cursor++; + return 16; + + case 'b': case 'B': + cursor++; + return 2; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return 8; + + default: + return 10; + } + } + + return 10; + } + + + + IntegerLiteralValue getIntegerLiteralValue(Token const& token, UnownedStringSlice* outSuffix) + { + IntegerLiteralValue value = 0; + + const UnownedStringSlice content = token.getContent(); + + char const* cursor = content.begin(); + char const* end = content.end(); + + int base = _readOptionalBase(&cursor); + + for( ;;) + { + int digit = _maybeReadDigit(&cursor, base); + if(digit < 0) + break; + + value = value*base + digit; + } + + if(outSuffix) + { + *outSuffix = UnownedStringSlice(cursor, end); + } + + return value; + } + + FloatingPointLiteralValue getFloatingPointLiteralValue(Token const& token, UnownedStringSlice* outSuffix) + { + FloatingPointLiteralValue value = 0; + + const UnownedStringSlice content = token.getContent(); + + char const* cursor = content.begin(); + char const* end = content.end(); + + int radix = _readOptionalBase(&cursor); + + bool seenDot = false; + FloatingPointLiteralValue divisor = 1; + for( ;;) + { + if(*cursor == '.') + { + cursor++; + seenDot = true; + continue; + } + + int digit = _maybeReadDigit(&cursor, radix); + if(digit < 0) + break; + + value = value*radix + digit; + + if(seenDot) + { + divisor *= radix; + } + } + + // Now read optional exponent + if(_isNumberExponent(*cursor, radix)) + { + cursor++; + + bool exponentIsNegative = false; + switch(*cursor) + { + default: + break; + + case '-': + exponentIsNegative = true; + cursor++; + break; + + case '+': + cursor++; + break; + } + + int exponentRadix = 10; + int exponent = 0; + + for(;;) + { + int digit = _maybeReadDigit(&cursor, exponentRadix); + if(digit < 0) + break; + + exponent = exponent*exponentRadix + digit; + } + + FloatingPointLiteralValue exponentBase = 10; + if(radix == 16) + { + exponentBase = 2; + } + + FloatingPointLiteralValue exponentValue = pow(exponentBase, exponent); + + if( exponentIsNegative ) + { + divisor *= exponentValue; + } + else + { + value *= exponentValue; + } + } + + value /= divisor; + + if(outSuffix) + { + *outSuffix = UnownedStringSlice(cursor, end); + } + + return value; + } + + static void _lexStringLiteralBody(Lexer* lexer, char quote) + { + for(;;) + { + int c = _peek(lexer); + if(c == quote) + { + _advance(lexer); + return; + } + + switch(c) + { + case kEOF: + lexer->m_sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral); + return; + + case '\n': case '\r': + lexer->m_sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::newlineInLiteral); + return; + + case '\\': + // Need to handle various escape sequence cases + _advance(lexer); + switch(_peek(lexer)) + { + case '\'': + case '\"': + case '\\': + case '?': + case 'a': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': + _advance(lexer); + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + // octal escape: up to 3 characters + _advance(lexer); + for(int ii = 0; ii < 3; ++ii) + { + int d = _peek(lexer); + if(('0' <= d) && (d <= '7')) + { + _advance(lexer); + continue; + } + else + { + break; + } + } + break; + + case 'x': + // hexadecimal escape: any number of characters + _advance(lexer); + for(;;) + { + int d = _peek(lexer); + if(('0' <= d) && (d <= '9') + || ('a' <= d) && (d <= 'f') + || ('A' <= d) && (d <= 'F')) + { + _advance(lexer); + continue; + } + else + { + break; + } + } + break; + + // TODO: Unicode escape sequences + + } + break; + + default: + _advance(lexer); + continue; + } + } + } + + String getStringLiteralTokenValue(Token const& token) + { + SLANG_ASSERT(token.type == TokenType::StringLiteral + || token.type == TokenType::CharLiteral); + + const UnownedStringSlice content = token.getContent(); + + char const* cursor = content.begin(); + char const* end = content.end(); + SLANG_UNREFERENCED_VARIABLE(end); + + auto quote = *cursor++; + SLANG_ASSERT(quote == '\'' || quote == '"'); + + StringBuilder valueBuilder; + for(;;) + { + SLANG_ASSERT(cursor != end); + + auto c = *cursor++; + + // If we see a closing quote, then we are at the end of the string literal + if(c == quote) + { + SLANG_ASSERT(cursor == end); + return valueBuilder.ProduceString(); + } + + // Characters that don't being escape sequences are easy; + // just append them to the buffer and move on. + if(c != '\\') + { + valueBuilder.Append(c); + continue; + } + + // Now we look at another character to figure out the kind of + // escape sequence we are dealing with: + + char d = *cursor++; + + switch(d) + { + // Simple characters that just needed to be escaped + case '\'': + case '\"': + case '\\': + case '?': + valueBuilder.Append(d); + continue; + + // Traditional escape sequences for special characters + case 'a': valueBuilder.Append('\a'); continue; + case 'b': valueBuilder.Append('\b'); continue; + case 'f': valueBuilder.Append('\f'); continue; + case 'n': valueBuilder.Append('\n'); continue; + case 'r': valueBuilder.Append('\r'); continue; + case 't': valueBuilder.Append('\t'); continue; + case 'v': valueBuilder.Append('\v'); continue; + + // Octal escape: up to 3 characterws + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + { + cursor--; + int value = 0; + for(int ii = 0; ii < 3; ++ii) + { + d = *cursor; + if(('0' <= d) && (d <= '7')) + { + value = value*8 + (d - '0'); + + cursor++; + continue; + } + else + { + break; + } + } + + // TODO: add support for appending an arbitrary code point? + valueBuilder.Append((char) value); + } + continue; + + // Hexadecimal escape: any number of characters + case 'x': + { + cursor--; + int value = 0; + for(;;) + { + d = *cursor++; + int digitValue = 0; + if(('0' <= d) && (d <= '9')) + { + digitValue = d - '0'; + } + else if( ('a' <= d) && (d <= 'f') ) + { + digitValue = d - 'a'; + } + else if( ('A' <= d) && (d <= 'F') ) + { + digitValue = d - 'A'; + } + else + { + cursor--; + break; + } + + value = value*16 + digitValue; + } + + // TODO: add support for appending an arbitrary code point? + valueBuilder.Append((char) value); + } + continue; + + // TODO: Unicode escape sequences + + } + } + } + + String getFileNameTokenValue(Token const& token) + { + const UnownedStringSlice content = token.getContent(); + + // A file name usually doesn't process escape sequences + // (this is import on Windows, where `\\` is a valid + // path separator character). + + // Just trim off the first and last characters to remove the quotes + // (whether they were `""` or `<>`. + return String(content.begin() + 1, content.end() - 1); + } + + + + static TokenType _lexTokenImpl(Lexer* lexer, LexerFlags effectiveFlags) + { + if(effectiveFlags & kLexerFlag_ExpectDirectiveMessage) + { + for(;;) + { + switch(_peek(lexer)) + { + default: + _advance(lexer); + continue; + + case kEOF: case '\r': case '\n': + break; + } + break; + } + return TokenType::DirectiveMessage; + } + + switch(_peek(lexer)) + { + default: + break; + + case kEOF: + if((effectiveFlags & kLexerFlag_InDirective) != 0) + return TokenType::EndOfDirective; + return TokenType::EndOfFile; + + case '\r': case '\n': + if((effectiveFlags & kLexerFlag_InDirective) != 0) + return TokenType::EndOfDirective; + _handleNewLine(lexer); + return TokenType::NewLine; + + case ' ': case '\t': + _lexHorizontalSpace(lexer); + return TokenType::WhiteSpace; + + case '.': + _advance(lexer); + switch(_peek(lexer)) + { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return _lexNumberAfterDecimalPoint(lexer, 10); + + // TODO(tfoley): handle ellipsis (`...`) + + default: + return TokenType::Dot; + } + + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return _lexNumber(lexer, 10); + + case '0': + { + auto loc = _getSourceLoc(lexer); + _advance(lexer); + switch(_peek(lexer)) + { + default: + return _maybeLexNumberSuffix(lexer, TokenType::IntegerLiteral); + + case '.': + _advance(lexer); + return _lexNumberAfterDecimalPoint(lexer, 10); + + case 'x': case 'X': + _advance(lexer); + return _lexNumber(lexer, 16); + + case 'b': case 'B': + _advance(lexer); + return _lexNumber(lexer, 2); + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + lexer->m_sink->diagnose(loc, LexerDiagnostics::octalLiteral); + return _lexNumber(lexer, 8); + } + } + + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case '_': + _lexIdentifier(lexer); + return TokenType::Identifier; + + case '\"': + _advance(lexer); + _lexStringLiteralBody(lexer, '\"'); + return TokenType::StringLiteral; + + case '\'': + _advance(lexer); + _lexStringLiteralBody(lexer, '\''); + return TokenType::CharLiteral; + + case '+': + _advance(lexer); + switch(_peek(lexer)) + { + case '+': _advance(lexer); return TokenType::OpInc; + case '=': _advance(lexer); return TokenType::OpAddAssign; + default: + return TokenType::OpAdd; + } + + case '-': + _advance(lexer); + switch(_peek(lexer)) + { + case '-': _advance(lexer); return TokenType::OpDec; + case '=': _advance(lexer); return TokenType::OpSubAssign; + case '>': _advance(lexer); return TokenType::RightArrow; + default: + return TokenType::OpSub; + } + + case '*': + _advance(lexer); + switch(_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpMulAssign; + default: + return TokenType::OpMul; + } + + case '/': + _advance(lexer); + switch(_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpDivAssign; + case '/': _advance(lexer); _lexLineComment(lexer); return TokenType::LineComment; + case '*': _advance(lexer); _lexBlockComment(lexer); return TokenType::BlockComment; + default: + return TokenType::OpDiv; + } + + case '%': + _advance(lexer); + switch(_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpModAssign; + default: + return TokenType::OpMod; + } + + case '|': + _advance(lexer); + switch(_peek(lexer)) + { + case '|': _advance(lexer); return TokenType::OpOr; + case '=': _advance(lexer); return TokenType::OpOrAssign; + default: + return TokenType::OpBitOr; + } + + case '&': + _advance(lexer); + switch(_peek(lexer)) + { + case '&': _advance(lexer); return TokenType::OpAnd; + case '=': _advance(lexer); return TokenType::OpAndAssign; + default: + return TokenType::OpBitAnd; + } + + case '^': + _advance(lexer); + switch(_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpXorAssign; + default: + return TokenType::OpBitXor; + } + + case '>': + _advance(lexer); + switch(_peek(lexer)) + { + case '>': + _advance(lexer); + switch(_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpShrAssign; + default: return TokenType::OpRsh; + } + case '=': _advance(lexer); return TokenType::OpGeq; + default: + return TokenType::OpGreater; + } + + case '<': + _advance(lexer); + switch(_peek(lexer)) + { + case '<': + _advance(lexer); + switch(_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpShlAssign; + default: return TokenType::OpLsh; + } + case '=': _advance(lexer); return TokenType::OpLeq; + default: + return TokenType::OpLess; + } + + case '=': + _advance(lexer); + switch(_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpEql; + default: + return TokenType::OpAssign; + } + + case '!': + _advance(lexer); + switch(_peek(lexer)) + { + case '=': _advance(lexer); return TokenType::OpNeq; + default: + return TokenType::OpNot; + } + + case '#': + _advance(lexer); + switch(_peek(lexer)) + { + case '#': _advance(lexer); return TokenType::PoundPound; + default: + return TokenType::Pound; + } + + case '~': _advance(lexer); return TokenType::OpBitNot; + + case ':': + { + _advance(lexer); + if (_peek(lexer) == ':') + { + _advance(lexer); + return TokenType::Scope; + } + return TokenType::Colon; + } + case ';': _advance(lexer); return TokenType::Semicolon; + case ',': _advance(lexer); return TokenType::Comma; + + case '{': _advance(lexer); return TokenType::LBrace; + case '}': _advance(lexer); return TokenType::RBrace; + case '[': _advance(lexer); return TokenType::LBracket; + case ']': _advance(lexer); return TokenType::RBracket; + case '(': _advance(lexer); return TokenType::LParent; + case ')': _advance(lexer); return TokenType::RParent; + + case '?': _advance(lexer); return TokenType::QuestionMark; + case '@': _advance(lexer); return TokenType::At; + case '$': _advance(lexer); return TokenType::Dollar; + + } + + // TODO(tfoley): If we ever wanted to support proper Unicode + // in identifiers, etc., then this would be the right place + // to perform a more expensive dispatch based on the actual + // code point (and not just the first byte). + + { + // If none of the above cases matched, then we have an + // unexpected/invalid character. + + auto loc = _getSourceLoc(lexer); + int c = _advance(lexer); + if(!(effectiveFlags & kLexerFlag_IgnoreInvalid)) + { + auto sink = lexer->m_sink; + if(c >= 0x20 && c <= 0x7E) + { + char buffer[] = { (char) c, 0 }; + sink->diagnose(loc, LexerDiagnostics::illegalCharacterPrint, buffer); + } + else + { + // Fallback: print as hexadecimal + sink->diagnose(loc, LexerDiagnostics::illegalCharacterHex, String((unsigned char)c, 16)); + } + } + + return TokenType::Invalid; + } + } + + Token Lexer::lexToken(LexerFlags extraFlags) + { + auto& flags = m_tokenFlags; + for(;;) + { + Token token; + token.loc = _getSourceLoc(this); + + char const* textBegin = m_cursor; + + auto tokenType = _lexTokenImpl(this, m_lexerFlags | extraFlags); + + // The low-level lexer produces tokens for things we want + // to ignore, such as white space, so we skip them here. + switch(tokenType) + { + case TokenType::Invalid: + flags = 0; + continue; + + case TokenType::NewLine: + flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; + continue; + + case TokenType::WhiteSpace: + { + flags |= TokenFlag::AfterWhitespace; + continue; + } + case TokenType::BlockComment: + case TokenType::LineComment: + { + flags |= TokenFlag::AfterWhitespace; + if (m_optionFlags & OptionFlag::TokenizeComments) + { + // We don't break here, and use the normal token adding logic + // because we want the behavior to be identical (in terms of flags etc) + // as if TokenizeComments is not enabled + char const* textEnd = m_cursor; + + token.type = tokenType; + token.flags = m_tokenFlags; + token.setContent(UnownedStringSlice(textBegin, textEnd)); + + return token; + } + + continue; + } + + // We don't want to skip the end-of-file token, but we *do* + // want to make sure it has appropriate flags to make our life easier + case TokenType::EndOfFile: + flags |= TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace; + break; + + // We will also do some book-keeping around preprocessor directives here: + // + // If we see a `#` at the start of a line, then we are entering a + // preprocessor directive. + case TokenType::Pound: + if((flags & TokenFlag::AtStartOfLine) != 0) + m_lexerFlags |= kLexerFlag_InDirective; + break; + // + // And if we saw an end-of-line during a directive, then we are + // now leaving that directive. + // + case TokenType::EndOfDirective: + m_lexerFlags &= ~kLexerFlag_InDirective; + break; + + default: + break; + } + + token.type = tokenType; + + char const* textEnd = m_cursor; + + // Note(tfoley): `StringBuilder::Append()` seems to crash when appending zero bytes + if(textEnd != textBegin) + { + // "scrubbing" token value here to remove escaped newlines... + // + // Only perform this work if we encountered an escaped newline + // while lexing this token (e.g., keep a flag on the lexer), or + // do it on-demand when the actual value of the token is needed. + if (m_tokenFlags & TokenFlag::ScrubbingNeeded) + { + // Allocate space that will always be more than enough for stripped contents + char* startDst = (char*)m_memoryArena->allocateUnaligned(textEnd - textBegin); + char* dst = startDst; + + auto tt = textBegin; + while (tt != textEnd) + { + char c = *tt++; + if (c == '\\') + { + char d = *tt; + switch (d) + { + case '\r': case '\n': + { + tt++; + char e = *tt; + if ((d ^ e) == ('\r' ^ '\n')) + { + tt++; + } + } + continue; + + default: + break; + } + } + *dst++ = c; + } + token.setContent(UnownedStringSlice(startDst, dst)); + } + else + { + token.setContent(UnownedStringSlice(textBegin, textEnd)); + } + } + + token.flags = flags; + + m_tokenFlags = 0; + + if (tokenType == TokenType::Identifier) + { + token.setName(m_namePool->getName(token.getContent())); + } + + return token; + } + } + + TokenList Lexer::lexAllTokens() + { + TokenList tokenList; + for(;;) + { + Token token = lexToken(); + tokenList.add(token); + + if(token.type == TokenType::EndOfFile) + return tokenList; + } + } + + /* static */UnownedStringSlice Lexer::sourceLocationLexer(const UnownedStringSlice& in) + { + Lexer lexer; + + SourceManager sourceManager; + sourceManager.initialize(nullptr, nullptr); + + auto sourceFile = sourceManager.createSourceFileWithString(PathInfo::makeUnknown(), in); + auto sourceView = sourceManager.createSourceView(sourceFile, nullptr, SourceLoc::fromRaw(0)); + + DiagnosticSink sink(&sourceManager, nullptr); + + MemoryArena arena; + + RootNamePool rootNamePool; + NamePool namePool; + namePool.setRootNamePool(&rootNamePool); + + lexer.initialize(sourceView, &sink, &namePool, &arena); + + Token tok = lexer.lexToken(); + + if (tok.type == TokenType::Invalid) + { + return UnownedStringSlice(); + } + + const int offset = sourceView->getRange().getOffset(tok.loc); + + SLANG_ASSERT(offset >= 0 && offset <= in.getLength()); + SLANG_ASSERT(Index(offset + tok.charsCount) <= in.getLength()); + + return UnownedStringSlice(in.begin() + offset, in.begin() + offset + tok.charsCount); + } + +} -- cgit v1.2.3