summaryrefslogtreecommitdiffstats
path: root/source/slang/lexer.cpp
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2019-05-31 17:20:37 -0400
committerGitHub <noreply@github.com>2019-05-31 17:20:37 -0400
commit6cbc3929a54d37bd23cb5efa8e3320ba02f78b2f (patch)
tree5a23cb47782e9e2a77762c90dd35da1005eba8d0 /source/slang/lexer.cpp
parentb81ff3ef968d1cc4e954b31a1812b3c391d17b02 (diff)
Use slang- prefix on slang compiler and core source (#973)
* Prefixing source files in source/slang with slang- * Prefix source in source/slang with slang- prefix. * Rename core source files with slang- prefix. * Update project files. * Fix problems from automatic merge.
Diffstat (limited to 'source/slang/lexer.cpp')
-rw-r--r--source/slang/lexer.cpp1334
1 files changed, 0 insertions, 1334 deletions
diff --git a/source/slang/lexer.cpp b/source/slang/lexer.cpp
deleted file mode 100644
index 8cb9fa5ee..000000000
--- a/source/slang/lexer.cpp
+++ /dev/null
@@ -1,1334 +0,0 @@
-// lexer.cpp
-#include "lexer.h"
-
-// This file implements the lexer/scanner, which is responsible for taking a raw stream of
-// input bytes and turning it into semantically useful tokens.
-//
-
-#include "compiler.h"
-#include "source-loc.h"
-
-#include <assert.h>
-
-namespace Slang
-{
- // Builds a token of type EndOfFile with empty text and a default-constructed location.
- Token TokenReader::GetEndOfFileToken()
- {
-     auto emptyText = UnownedStringSlice::fromLiteral("");
-     return Token(TokenType::EndOfFile, emptyText, SourceLoc());
- }
-
- // Pointer to the first token of the list; the list is asserted to be non-empty.
- Token* TokenList::begin() const
- {
-     SLANG_ASSERT(mTokens.getCount() != 0);
-     return &mTokens[0];
- }
-
- // Pointer to the last token of the list, which is asserted to be the
- // end-of-file token (so the "end" pointer is itself dereferenceable).
- Token* TokenList::end() const
- {
-     SLANG_ASSERT(mTokens.getCount() != 0);
-
-     const auto lastIndex = mTokens.getCount() - 1;
-     SLANG_ASSERT(mTokens[lastIndex].type == TokenType::EndOfFile);
-     return &mTokens[lastIndex];
- }
-
- // A default-constructed span has both ends null.
- TokenSpan::TokenSpan()
-     : mBegin(nullptr)
-     , mEnd(nullptr)
- {
- }
-
- // A default-constructed reader has no cursor; `AdvanceToken` checks for that.
- TokenReader::TokenReader()
-     : mCursor(nullptr)
-     , mEnd(nullptr)
- {
- }
-
-
- // Gives mutable access to the lookahead token without consuming it.
- Token& TokenReader::PeekToken()
- {
-     Token& lookahead = nextToken;
-     return lookahead;
- }
-
- // Type of the lookahead token; nothing is consumed.
- TokenType TokenReader::PeekTokenType() const
- {
-     const Token& lookahead = nextToken;
-     return lookahead.type;
- }
-
- // Source location of the lookahead token; nothing is consumed.
- SourceLoc TokenReader::PeekLoc() const
- {
-     const Token& lookahead = nextToken;
-     return lookahead.loc;
- }
-
- // Consumes and returns the current lookahead token, then refills the
- // lookahead from the token after the cursor.
- Token TokenReader::AdvanceToken()
- {
-     // A reader without a cursor can only ever produce end-of-file.
-     if (mCursor == nullptr)
-         return GetEndOfFileToken();
-
-     Token consumed = nextToken;
-     if (mCursor >= mEnd)
-     {
-         // Already at the end: only the type of the lookahead is changed.
-         nextToken.type = TokenType::EndOfFile;
-     }
-     else
-     {
-         ++mCursor;
-         nextToken = *mCursor;
-     }
-     return consumed;
- }
-
- // Lexer
-
- // Binds the lexer to a source view and its collaborators, and resets all
- // scanning state so that lexing starts at the first byte of the view.
- void Lexer::initialize(
-     SourceView* inSourceView,
-     DiagnosticSink* inSink,
-     NamePool* inNamePool,
-     MemoryArena* inMemoryArena)
- {
-     // Keep hold of everything the caller handed in.
-     sourceView = inSourceView;
-     sink = inSink;
-     namePool = inNamePool;
-     memoryArena = inMemoryArena;
-
-     // The scan window is the content of the view; the cursor starts at its front.
-     auto text = inSourceView->getContent();
-     begin = text.begin();
-     end = text.end();
-     cursor = begin;
-
-     // Token locations are computed as offsets from this starting location.
-     startLoc = inSourceView->getRange().begin;
-
-     // The first token counts as being at the start of a line and after whitespace.
-     lexerFlags = 0;
-     tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
- }
-
- // The destructor performs no explicit cleanup.
- Lexer::~Lexer() = default;
-
- enum { kEOF = -1 }; // sentinel returned by the peek/advance helpers when the cursor has reached the end of the input
-
- // Looks at the next input byte without consuming it. No escaped-newline
- // handling, no non-ASCII decoding and no source-location bookkeeping happens here.
- // Returns kEOF when the cursor is at the end of the input.
- static int peekRaw(Lexer* lexer)
- {
-     const bool atEnd = (lexer->cursor == lexer->end);
-     return atEnd ? kEOF : *lexer->cursor;
- }
-
- // Consumes and returns one input byte with no special handling. This mirrors
- // `peekRaw()`, except that the cursor moves forward when not at the end.
- // Returns kEOF (and leaves the cursor alone) at the end of the input.
- static int advanceRaw(Lexer* lexer)
- {
-     if (lexer->cursor != lexer->end)
-     {
-         const int byte = *lexer->cursor;
-         lexer->cursor++;
-         return byte;
-     }
-     return kEOF;
- }
-
- // Finishes consuming an end-of-line sequence whose first byte `c` has already
- // been read. The recognized sequences are:
- //
- //      "\n"   "\r"   "\r\n"   "\n\r"
- //
- // so if the next raw byte is the *other* line-ending character it belongs to
- // the same sequence and is consumed too (longest match wins).
- static void handleNewLineInner(Lexer* lexer, int c)
- {
-     SLANG_ASSERT(c == '\n' || c == '\r');
-
-     const int following = peekRaw(lexer);
-
-     // XOR-ing the two bytes equals '\n' ^ '\r' exactly when one is '\n' and the other '\r'.
-     const bool isOtherHalf = ((c ^ following) == ('\n' ^ '\r'));
-     if (!isOtherHalf)
-         return;
-
-     advanceRaw(lexer);
- }
-
- // Look ahead one code point without consuming anything, skipping over any
- // backslash-escaped newlines in the same way `advance` does.
- //
- // Unlike a plain byte read, this never looks past `lexer->end`, and it keeps
- // skipping when several escaped newlines follow one another, so that the value
- // returned here is always the value the next call to `advance` will return.
- //
- // Returns kEOF when nothing but escaped newlines (or nothing at all) remains.
- static int peek(Lexer* lexer)
- {
-     auto cursor = lexer->cursor;
-     auto const end = lexer->end;
-
-     for (;;)
-     {
-         if (cursor == end)
-             return kEOF;
-
-         int c = *cursor;
-         if (c != '\\')
-         {
-             // TODO: handle UTF-8 encoding for non-ASCII code points here
-
-             // Default case is to just hand along the byte we read as an ASCII code point.
-             return c;
-         }
-
-         // We might have a backslash-escaped newline; look at the byte after
-         // the backslash (if there is one) to find out.
-         auto next = cursor + 1;
-         if (next == end)
-             return c;
-
-         int d = *next;
-         if (d != '\r' && d != '\n')
-             return c;
-
-         // The newline was escaped. Step over it, including the second byte of
-         // a two-byte "\r\n" / "\n\r" sequence, and look at what comes after.
-         next++;
-         if (next != end && ((d ^ *next) == ('\r' ^ '\n')))
-             next++;
-
-         cursor = next;
-     }
- }
-
- // Get the next code point from the input, and advance the cursor.
- //
- // Backslash-escaped newlines are consumed silently (and the token is flagged
- // as needing scrubbing); the code point after them is returned instead.
- // Returns kEOF once the end of the input is reached.
- static int advance(Lexer* lexer)
- {
-     // We are going to loop, but only as a way of handling
-     // escaped line endings.
-     for (;;)
-     {
-         // If we are at the end of the input, then the task is easy.
-         if (lexer->cursor == lexer->end)
-             return kEOF;
-
-         // Consume the next raw byte, and decide what to do
-         int c = *lexer->cursor++;
-         if (c != '\\')
-         {
-             // TODO: Need to handle non-ASCII code points.
-
-             // Default case is to return the raw byte we saw.
-             return c;
-         }
-
-         // A backslash that is the very last byte cannot start an escaped
-         // newline; checking here avoids reading past the end of the input.
-         if (lexer->cursor == lexer->end)
-             return c;
-
-         // We might have a backslash-escaped newline.
-         int d = *lexer->cursor;
-         if (d != '\r' && d != '\n')
-             return c;
-
-         // Consume the end-of-line sequence that was escaped.
-         lexer->cursor++;
-         handleNewLineInner(lexer, d);
-
-         lexer->tokenFlags |= TokenFlag::ScrubbingNeeded;
-
-         // Now try again, looking at the character after the escaped newline.
-     }
- }
-
- // Consumes a complete end-of-line sequence; the cursor is expected to be on its first byte.
- static void handleNewLine(Lexer* lexer)
- {
-     const int firstByte = advance(lexer);
-     handleNewLineInner(lexer, firstByte);
- }
-
- // Consumes the rest of a line comment. The terminating newline (or the end of
- // the input) is left for the caller to deal with.
- static void lexLineComment(Lexer* lexer)
- {
-     for (int c = peek(lexer); c != '\n' && c != '\r' && c != kEOF; c = peek(lexer))
-     {
-         advance(lexer);
-     }
- }
-
- // Consumes the body of a block comment up to and including the closing "*/".
- // Reaching the end of the input first simply stops the scan.
- static void lexBlockComment(Lexer* lexer)
- {
-     for (;;)
-     {
-         const int c = peek(lexer);
-
-         if (c == kEOF)
-         {
-             // TODO(tfoley) diagnostic!
-             return;
-         }
-
-         if (c == '\n' || c == '\r')
-         {
-             handleNewLine(lexer);
-             continue;
-         }
-
-         // Any other character is consumed; a '*' directly followed by '/'
-         // closes the comment.
-         advance(lexer);
-         if (c == '*' && peek(lexer) == '/')
-         {
-             advance(lexer);
-             return;
-         }
-     }
- }
-
- // Consumes a run of spaces and tabs, stopping at the first other character.
- static void lexHorizontalSpace(Lexer* lexer)
- {
-     while (true)
-     {
-         const int c = peek(lexer);
-         if (c != ' ' && c != '\t')
-             return;
-
-         advance(lexer);
-     }
- }
-
- // Consumes identifier characters: ASCII letters, ASCII digits and underscores.
- static void lexIdentifier(Lexer* lexer)
- {
-     for (;;)
-     {
-         const int c = peek(lexer);
-
-         const bool isLower = (c >= 'a') && (c <= 'z');
-         const bool isUpper = (c >= 'A') && (c <= 'Z');
-         const bool isDigit = (c >= '0') && (c <= '9');
-         if (!(isLower || isUpper || isDigit || c == '_'))
-             return;
-
-         advance(lexer);
-     }
- }
-
- // Source location of the cursor: the start location plus the cursor's byte offset from `begin`.
- static SourceLoc getSourceLoc(Lexer* lexer)
- {
-     const auto byteOffset = lexer->cursor - lexer->begin;
-     return lexer->startLoc + byteOffset;
- }
-
- // Consumes a run of digits for a literal in the given base.
- //
- // Decimal digits are always consumed, and hex letters are consumed only when
- // `base` is greater than 10. A consumed digit whose value does not fit the
- // base (e.g. '9' in an octal literal) is diagnosed but still consumed.
- static void lexDigits(Lexer* lexer, int base)
- {
-     for (;;)
-     {
-         const int c = peek(lexer);
-
-         int digitVal;
-         if (c >= '0' && c <= '9')
-         {
-             digitVal = c - '0';
-         }
-         else if (c >= 'a' && c <= 'f')
-         {
-             if (base <= 10) return;
-             digitVal = 10 + (c - 'a');
-         }
-         else if (c >= 'A' && c <= 'F')
-         {
-             if (base <= 10) return;
-             digitVal = 10 + (c - 'A');
-         }
-         else
-         {
-             // No more digits!
-             return;
-         }
-
-         if (digitVal >= base)
-         {
-             char buffer[] = { (char) c, 0 };
-             lexer->sink->diagnose(getSourceLoc(lexer), Diagnostics::invalidDigitForBase, buffer, base);
-         }
-
-         advance(lexer);
-     }
- }
-
- // Consumes an optional suffix after a numeric literal and returns `tokenType`
- // unchanged.
- //
- // The lexer is deliberately liberal: any run of ASCII letters, digits and
- // underscores is accepted, and working out what the suffix means is left to
- // the parser and semantic checking.
- static TokenType maybeLexNumberSuffix(Lexer* lexer, TokenType tokenType)
- {
-     for (;;)
-     {
-         const int c = peek(lexer);
-
-         const bool isLower = (c >= 'a') && (c <= 'z');
-         const bool isUpper = (c >= 'A') && (c <= 'Z');
-         const bool isDigit = (c >= '0') && (c <= '9');
-
-         // Stop at the first character that cannot be part of a suffix.
-         if (!(isLower || isUpper || isDigit || c == '_'))
-             return tokenType;
-
-         advance(lexer);
-     }
- }
-
- // Tells whether `c` introduces an exponent for a literal in the given base:
- // 'e'/'E' for base 10, 'p'/'P' for base 16, nothing otherwise.
- static bool isNumberExponent(int c, int base)
- {
-     if (c == 'e' || c == 'E')
-         return base == 10;
-
-     if (c == 'p' || c == 'P')
-         return base == 16;
-
-     return false;
- }
-
- // Consumes an exponent (marker, optional sign, decimal digits) if one starts
- // at the cursor. Returns whether an exponent marker was seen.
- static bool maybeLexNumberExponent(Lexer* lexer, int base)
- {
-     const bool sawMarker = isNumberExponent(peek(lexer), base);
-     if (sawMarker)
-     {
-         // Step over the exponent marker itself.
-         advance(lexer);
-
-         // An optional sign may precede the exponent digits.
-         const int sign = peek(lexer);
-         if (sign == '+' || sign == '-')
-             advance(lexer);
-
-         // TODO(tfoley): it would be an error to not see digits here...
-
-         // The exponent is always written in decimal.
-         lexDigits(lexer, 10);
-     }
-     return sawMarker;
- }
-
- // Lexes what follows the decimal point of a literal: fraction digits, an
- // optional exponent and an optional suffix. The result is always a
- // floating-point literal.
- static TokenType lexNumberAfterDecimalPoint(Lexer* lexer, int base)
- {
-     lexDigits(lexer, base);
-     (void) maybeLexNumberExponent(lexer, base);
-
-     const TokenType literalType = TokenType::FloatingPointLiteral;
-     return maybeLexNumberSuffix(lexer, literalType);
- }
-
- // Lexes a numeric literal in the given base: digits, an optional fraction, an
- // optional exponent and an optional suffix. The literal is floating-point if
- // it has a decimal point or an exponent, and an integer otherwise.
- static TokenType lexNumber(Lexer* lexer, int base)
- {
-     // TODO(tfoley): Need to consider whether to allow any kind of digit separator character.
-
-     // Whole-number part.
-     lexDigits(lexer, base);
-
-     bool isFloatingPoint = false;
-
-     // Optional fraction.
-     if (peek(lexer) == '.')
-     {
-         isFloatingPoint = true;
-         advance(lexer);
-         lexDigits(lexer, base);
-     }
-
-     // Optional exponent.
-     if (maybeLexNumberExponent(lexer, base))
-         isFloatingPoint = true;
-
-     const TokenType tokenType = isFloatingPoint
-         ? TokenType::FloatingPointLiteral
-         : TokenType::IntegerLiteral;
-
-     maybeLexNumberSuffix(lexer, tokenType);
-     return tokenType;
- }
-
- // Reads one digit at `*ioCursor` and returns its value, or -1 when the next
- // character is not a digit.
- //
- // Underscores in front of the digit are skipped (and stay skipped even when
- // -1 is returned). Hex letters only count as digits when `base` is above 10.
- // Decimal digits are accepted whatever the base is.
- static int maybeReadDigit(char const** ioCursor, int base)
- {
-     char const*& cursor = *ioCursor;
-
-     // TODO: need to decide on digit separator characters
-     while (*cursor == '_')
-         cursor++;
-
-     const int c = *cursor;
-     const bool lettersAllowed = (base > 10);
-
-     int value = -1;
-     if (c >= '0' && c <= '9')
-         value = c - '0';
-     else if (lettersAllowed && c >= 'a' && c <= 'f')
-         value = 10 + (c - 'a');
-     else if (lettersAllowed && c >= 'A' && c <= 'F')
-         value = 10 + (c - 'A');
-
-     // Only a recognized digit is consumed.
-     if (value >= 0)
-         cursor++;
-
-     return value;
- }
-
- // Works out the base of a numeric literal from its prefix and steps the
- // cursor over that prefix.
- //
- //      "0x" / "0X"            -> 16 (both characters consumed)
- //      "0b" / "0B"            -> 2  (both characters consumed)
- //      "0" followed by digit  -> 8  (only the "0" consumed)
- //      "0" followed by other  -> 10 (only the "0" consumed)
- //      anything else          -> 10 (nothing consumed)
- static int readOptionalBase(char const** ioCursor)
- {
-     char const*& cursor = *ioCursor;
-
-     if (*cursor != '0')
-         return 10;
-
-     cursor++;
-     const char marker = *cursor;
-
-     if (marker == 'x' || marker == 'X')
-     {
-         cursor++;
-         return 16;
-     }
-     if (marker == 'b' || marker == 'B')
-     {
-         cursor++;
-         return 2;
-     }
-     if (marker >= '0' && marker <= '9')
-         return 8;
-
-     return 10;
- }
-
-
-
- // Computes the value of an integer literal token.
- //
- // The base is taken from the literal's prefix, digits are accumulated until
- // the first non-digit, and whatever text is left over is handed back through
- // `outSuffix` when the caller passes a non-null pointer.
- IntegerLiteralValue getIntegerLiteralValue(Token const& token, UnownedStringSlice* outSuffix)
- {
-     char const* cursor = token.Content.begin();
-     char const* const end = token.Content.end();
-
-     const int base = readOptionalBase(&cursor);
-
-     IntegerLiteralValue value = 0;
-     for (int digit = maybeReadDigit(&cursor, base);
-          digit >= 0;
-          digit = maybeReadDigit(&cursor, base))
-     {
-         value = value*base + digit;
-     }
-
-     if (outSuffix != nullptr)
-         *outSuffix = UnownedStringSlice(cursor, end);
-
-     return value;
- }
-
- // Computes the value of a floating-point literal token.
- //
- // The mantissa digits (before and after the '.') are accumulated as one
- // number, together with a divisor that accounts for the digits after the
- // point. An optional exponent then scales either the value (positive
- // exponent) or the divisor (negative exponent). The text left over after the
- // number is handed back through `outSuffix` when that pointer is non-null.
- FloatingPointLiteralValue getFloatingPointLiteralValue(Token const& token, UnownedStringSlice* outSuffix)
- {
-     char const* cursor = token.Content.begin();
-     char const* const end = token.Content.end();
-
-     const int radix = readOptionalBase(&cursor);
-
-     // Mantissa: digits with any '.' characters in between.
-     FloatingPointLiteralValue value = 0;
-     FloatingPointLiteralValue divisor = 1;
-     bool seenDot = false;
-     for (;;)
-     {
-         if (*cursor == '.')
-         {
-             seenDot = true;
-             cursor++;
-             continue;
-         }
-
-         const int digit = maybeReadDigit(&cursor, radix);
-         if (digit < 0)
-             break;
-
-         value = value*radix + digit;
-
-         // Every digit after the point shifts the final result down by one place.
-         if (seenDot)
-             divisor *= radix;
-     }
-
-     // Now read optional exponent
-     if (isNumberExponent(*cursor, radix))
-     {
-         cursor++;
-
-         bool exponentIsNegative = false;
-         if (*cursor == '-')
-         {
-             exponentIsNegative = true;
-             cursor++;
-         }
-         else if (*cursor == '+')
-         {
-             cursor++;
-         }
-
-         // The exponent digits are decimal, whatever the radix of the mantissa.
-         const int exponentRadix = 10;
-         int exponent = 0;
-         for (;;)
-         {
-             const int digit = maybeReadDigit(&cursor, exponentRadix);
-             if (digit < 0)
-                 break;
-
-             exponent = exponent*exponentRadix + digit;
-         }
-
-         // Hexadecimal literals scale by powers of two, all others by powers of ten.
-         const FloatingPointLiteralValue exponentBase = (radix == 16) ? 2 : 10;
-         const FloatingPointLiteralValue exponentValue = pow(exponentBase, exponent);
-
-         if (exponentIsNegative)
-             divisor *= exponentValue;
-         else
-             value *= exponentValue;
-     }
-
-     value /= divisor;
-
-     if (outSuffix != nullptr)
-         *outSuffix = UnownedStringSlice(cursor, end);
-
-     return value;
- }
-
- // Consumes the body of a string or character literal, up to and including
- // the closing `quote`. The opening quote must already have been consumed.
- //
- // Reaching the end of the input or a newline before the closing quote is
- // diagnosed, and lexing of the literal stops there without consuming it.
- static void lexStringLiteralBody(Lexer* lexer, char quote)
- {
-     for (;;)
-     {
-         const int c = peek(lexer);
-
-         if (c == quote)
-         {
-             advance(lexer);
-             return;
-         }
-
-         if (c == kEOF)
-         {
-             lexer->sink->diagnose(getSourceLoc(lexer), Diagnostics::endOfFileInLiteral);
-             return;
-         }
-
-         if (c == '\n' || c == '\r')
-         {
-             lexer->sink->diagnose(getSourceLoc(lexer), Diagnostics::newlineInLiteral);
-             return;
-         }
-
-         // Consume the character; anything but a backslash needs no more work.
-         advance(lexer);
-         if (c != '\\')
-             continue;
-
-         // Need to handle various escape sequence cases
-         const int escape = peek(lexer);
-         switch (escape)
-         {
-         case '\'': case '\"': case '\\': case '?':
-         case 'a': case 'b': case 'f': case 'n':
-         case 'r': case 't': case 'v':
-             // Single-character escapes.
-             advance(lexer);
-             break;
-
-         case '0': case '1': case '2': case '3':
-         case '4': case '5': case '6': case '7':
-             // Octal escape: the leading digit, then up to three more octal digits.
-             advance(lexer);
-             for (int remaining = 3; remaining > 0; --remaining)
-             {
-                 const int d = peek(lexer);
-                 if (d < '0' || d > '7')
-                     break;
-                 advance(lexer);
-             }
-             break;
-
-         case 'x':
-             // Hexadecimal escape: any number of hex digits.
-             advance(lexer);
-             for (;;)
-             {
-                 const int d = peek(lexer);
-                 const bool isHexDigit =
-                        ((d >= '0') && (d <= '9'))
-                     || ((d >= 'a') && (d <= 'f'))
-                     || ((d >= 'A') && (d <= 'F'));
-                 if (!isHexDigit)
-                     break;
-                 advance(lexer);
-             }
-             break;
-
-         // TODO: Unicode escape sequences
-
-         default:
-             // Anything else is left for the next iteration of the outer loop.
-             break;
-         }
-     }
- }
-
- // Decodes the text of a string or character literal token into the string
- // value it denotes: the surrounding quotes are dropped and escape sequences
- // are replaced by the characters they stand for.
- //
- // The token is expected to be well formed (opening quote, body, matching
- // closing quote as its last character); this is only checked by assertions.
- String getStringLiteralTokenValue(Token const& token)
- {
-     SLANG_ASSERT(token.type == TokenType::StringLiteral
-         || token.type == TokenType::CharLiteral);
-
-     char const* cursor = token.Content.begin();
-     char const* end = token.Content.end();
-     SLANG_UNREFERENCED_VARIABLE(end);
-
-     auto quote = *cursor++;
-     SLANG_ASSERT(quote == '\'' || quote == '"');
-
-     StringBuilder valueBuilder;
-     for(;;)
-     {
-         SLANG_ASSERT(cursor != end);
-
-         auto c = *cursor++;
-
-         // If we see a closing quote, then we are at the end of the string literal
-         if(c == quote)
-         {
-             SLANG_ASSERT(cursor == end);
-             return valueBuilder.ProduceString();
-         }
-
-         // Characters that don't begin escape sequences are easy;
-         // just append them to the buffer and move on.
-         if(c != '\\')
-         {
-             valueBuilder.Append(c);
-             continue;
-         }
-
-         // Now we look at another character to figure out the kind of
-         // escape sequence we are dealing with:
-
-         char d = *cursor++;
-
-         switch(d)
-         {
-         // Simple characters that just needed to be escaped
-         case '\'':
-         case '\"':
-         case '\\':
-         case '?':
-             valueBuilder.Append(d);
-             continue;
-
-         // Traditional escape sequences for special characters
-         case 'a': valueBuilder.Append('\a'); continue;
-         case 'b': valueBuilder.Append('\b'); continue;
-         case 'f': valueBuilder.Append('\f'); continue;
-         case 'n': valueBuilder.Append('\n'); continue;
-         case 'r': valueBuilder.Append('\r'); continue;
-         case 't': valueBuilder.Append('\t'); continue;
-         case 'v': valueBuilder.Append('\v'); continue;
-
-         // Octal escape: up to 3 characters
-         case '0': case '1': case '2': case '3': case '4':
-         case '5': case '6': case '7':
-             {
-                 // The first digit was already consumed as `d`; step back so
-                 // that the loop below sees all of the digits.
-                 cursor--;
-                 int value = 0;
-                 for(int ii = 0; ii < 3; ++ii)
-                 {
-                     d = *cursor;
-                     if(('0' <= d) && (d <= '7'))
-                     {
-                         value = value*8 + (d - '0');
-
-                         cursor++;
-                         continue;
-                     }
-                     else
-                     {
-                         break;
-                     }
-                 }
-
-                 // TODO: add support for appending an arbitrary code point?
-                 valueBuilder.Append((char) value);
-             }
-             continue;
-
-         // Hexadecimal escape: any number of characters
-         case 'x':
-             {
-                 // The cursor is already past the 'x', so the digits start
-                 // right here (stepping back would make the loop see the 'x'
-                 // and stop before reading a single digit).
-                 int value = 0;
-                 for(;;)
-                 {
-                     d = *cursor;
-                     int digitValue = 0;
-                     if(('0' <= d) && (d <= '9'))
-                     {
-                         digitValue = d - '0';
-                     }
-                     else if( ('a' <= d) && (d <= 'f') )
-                     {
-                         digitValue = 10 + (d - 'a');
-                     }
-                     else if( ('A' <= d) && (d <= 'F') )
-                     {
-                         digitValue = 10 + (d - 'A');
-                     }
-                     else
-                     {
-                         // Not a hex digit: leave it for the main loop.
-                         break;
-                     }
-                     cursor++;
-
-                     // Only the low byte is appended below, so keeping just
-                     // that byte avoids overflow on long digit sequences
-                     // without changing the result.
-                     value = ((value << 4) | digitValue) & 0xFF;
-                 }
-
-                 // TODO: add support for appending an arbitrary code point?
-                 valueBuilder.Append((char) value);
-             }
-             continue;
-
-         // TODO: Unicode escape sequences
-
-         // Note: any other character after a backslash matches no case, so
-         // both the backslash and that character are dropped from the value.
-         }
-     }
- }
-
- // Extracts the file name from a file-name token.
- //
- // A file name usually doesn't process escape sequences (this is important on
- // Windows, where `\\` is a valid path separator character), so all that is
- // done is to trim the first and last characters, which are the delimiters
- // (whether they were `""` or `<>`).
- String getFileNameTokenValue(Token const& token)
- {
-     char const* const afterOpen = token.Content.begin() + 1;
-     char const* const beforeClose = token.Content.end() - 1;
-     return String(afterOpen, beforeClose);
- }
-
- // Scans exactly one raw token starting at the lexer's cursor and returns its type.
- // Whitespace, newlines, comments and invalid characters are returned as token types of their own here.
- static TokenType lexTokenImpl(Lexer* lexer, LexerFlags effectiveFlags)
- {
- if(effectiveFlags & kLexerFlag_ExpectDirectiveMessage) // take everything up to the end of the line as one DirectiveMessage token
- {
- for(;;)
- {
- switch(peek(lexer))
- {
- default:
- advance(lexer);
- continue;
-
- case kEOF: case '\r': case '\n':
- break;
- }
- break; // only reached from the end-of-line/end-of-file case above
- }
- return TokenType::DirectiveMessage;
- }
- // Otherwise dispatch on the next code point; every recognized case returns from inside the switch.
- switch(peek(lexer))
- {
- default: // not the start of any recognized token; handled after the switch
- break;
-
- case kEOF:
- if((effectiveFlags & kLexerFlag_InDirective) != 0)
- return TokenType::EndOfDirective;
- return TokenType::EndOfFile;
-
- case '\r': case '\n':
- if((effectiveFlags & kLexerFlag_InDirective) != 0)
- return TokenType::EndOfDirective; // the newline itself is not consumed on this path
- handleNewLine(lexer);
- return TokenType::NewLine;
-
- case ' ': case '\t':
- lexHorizontalSpace(lexer);
- return TokenType::WhiteSpace;
- // A '.' directly followed by a decimal digit starts a floating-point literal.
- case '.':
- advance(lexer);
- switch(peek(lexer))
- {
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- return lexNumberAfterDecimalPoint(lexer, 10);
-
- // TODO(tfoley): handle ellipsis (`...`)
-
- default:
- return TokenType::Dot;
- }
-
- case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- return lexNumber(lexer, 10);
- // A leading '0' may introduce a hexadecimal, binary or octal literal, or a fraction.
- case '0':
- {
- auto loc = getSourceLoc(lexer); // location of the leading '0', used for the octal diagnostic below
- advance(lexer);
- switch(peek(lexer))
- {
- default: // a lone `0`, possibly followed by a suffix
- return maybeLexNumberSuffix(lexer, TokenType::IntegerLiteral);
-
- case '.':
- advance(lexer);
- return lexNumberAfterDecimalPoint(lexer, 10);
-
- case 'x': case 'X':
- advance(lexer);
- return lexNumber(lexer, 16);
-
- case 'b': case 'B':
- advance(lexer);
- return lexNumber(lexer, 2);
-
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- lexer->sink->diagnose(loc, Diagnostics::octalLiteral);
- return lexNumber(lexer, 8);
- }
- }
- // Identifiers start with an ASCII letter or an underscore.
- case 'a': case 'b': case 'c': case 'd': case 'e':
- case 'f': case 'g': case 'h': case 'i': case 'j':
- case 'k': case 'l': case 'm': case 'n': case 'o':
- case 'p': case 'q': case 'r': case 's': case 't':
- case 'u': case 'v': case 'w': case 'x': case 'y':
- case 'z':
- case 'A': case 'B': case 'C': case 'D': case 'E':
- case 'F': case 'G': case 'H': case 'I': case 'J':
- case 'K': case 'L': case 'M': case 'N': case 'O':
- case 'P': case 'Q': case 'R': case 'S': case 'T':
- case 'U': case 'V': case 'W': case 'X': case 'Y':
- case 'Z':
- case '_':
- lexIdentifier(lexer);
- return TokenType::Identifier;
- // String and character literals: consume the opening quote, then the body up to the matching quote.
- case '\"':
- advance(lexer);
- lexStringLiteralBody(lexer, '\"');
- return TokenType::StringLiteral;
-
- case '\'':
- advance(lexer);
- lexStringLiteralBody(lexer, '\'');
- return TokenType::CharLiteral;
- // Operators and punctuation: consume the first character, then look for a longer match.
- case '+':
- advance(lexer);
- switch(peek(lexer))
- {
- case '+': advance(lexer); return TokenType::OpInc;
- case '=': advance(lexer); return TokenType::OpAddAssign;
- default:
- return TokenType::OpAdd;
- }
-
- case '-':
- advance(lexer);
- switch(peek(lexer))
- {
- case '-': advance(lexer); return TokenType::OpDec;
- case '=': advance(lexer); return TokenType::OpSubAssign;
- case '>': advance(lexer); return TokenType::RightArrow;
- default:
- return TokenType::OpSub;
- }
-
- case '*':
- advance(lexer);
- switch(peek(lexer))
- {
- case '=': advance(lexer); return TokenType::OpMulAssign;
- default:
- return TokenType::OpMul;
- }
- // '/' may also start a line comment or a block comment.
- case '/':
- advance(lexer);
- switch(peek(lexer))
- {
- case '=': advance(lexer); return TokenType::OpDivAssign;
- case '/': advance(lexer); lexLineComment(lexer); return TokenType::LineComment;
- case '*': advance(lexer); lexBlockComment(lexer); return TokenType::BlockComment;
- default:
- return TokenType::OpDiv;
- }
-
- case '%':
- advance(lexer);
- switch(peek(lexer))
- {
- case '=': advance(lexer); return TokenType::OpModAssign;
- default:
- return TokenType::OpMod;
- }
-
- case '|':
- advance(lexer);
- switch(peek(lexer))
- {
- case '|': advance(lexer); return TokenType::OpOr;
- case '=': advance(lexer); return TokenType::OpOrAssign;
- default:
- return TokenType::OpBitOr;
- }
-
- case '&':
- advance(lexer);
- switch(peek(lexer))
- {
- case '&': advance(lexer); return TokenType::OpAnd;
- case '=': advance(lexer); return TokenType::OpAndAssign;
- default:
- return TokenType::OpBitAnd;
- }
-
- case '^':
- advance(lexer);
- switch(peek(lexer))
- {
- case '=': advance(lexer); return TokenType::OpXorAssign;
- default:
- return TokenType::OpBitXor;
- }
- // '>' and '<' have two-character (shift, comparison) and three-character (shift-assign) forms.
- case '>':
- advance(lexer);
- switch(peek(lexer))
- {
- case '>':
- advance(lexer);
- switch(peek(lexer))
- {
- case '=': advance(lexer); return TokenType::OpShrAssign;
- default: return TokenType::OpRsh;
- }
- case '=': advance(lexer); return TokenType::OpGeq;
- default:
- return TokenType::OpGreater;
- }
-
- case '<':
- advance(lexer);
- switch(peek(lexer))
- {
- case '<':
- advance(lexer);
- switch(peek(lexer))
- {
- case '=': advance(lexer); return TokenType::OpShlAssign;
- default: return TokenType::OpLsh;
- }
- case '=': advance(lexer); return TokenType::OpLeq;
- default:
- return TokenType::OpLess;
- }
-
- case '=':
- advance(lexer);
- switch(peek(lexer))
- {
- case '=': advance(lexer); return TokenType::OpEql;
- default:
- return TokenType::OpAssign;
- }
-
- case '!':
- advance(lexer);
- switch(peek(lexer))
- {
- case '=': advance(lexer); return TokenType::OpNeq;
- default:
- return TokenType::OpNot;
- }
-
- case '#':
- advance(lexer);
- switch(peek(lexer))
- {
- case '#': advance(lexer); return TokenType::PoundPound;
- default:
- return TokenType::Pound;
- }
-
- case '~': advance(lexer); return TokenType::OpBitNot;
-
- case ':':
- {
- advance(lexer);
- if (peek(lexer) == ':')
- {
- advance(lexer);
- return TokenType::Scope;
- }
- return TokenType::Colon;
- }
- case ';': advance(lexer); return TokenType::Semicolon;
- case ',': advance(lexer); return TokenType::Comma;
-
- case '{': advance(lexer); return TokenType::LBrace;
- case '}': advance(lexer); return TokenType::RBrace;
- case '[': advance(lexer); return TokenType::LBracket;
- case ']': advance(lexer); return TokenType::RBracket;
- case '(': advance(lexer); return TokenType::LParent;
- case ')': advance(lexer); return TokenType::RParent;
-
- case '?': advance(lexer); return TokenType::QuestionMark;
- case '@': advance(lexer); return TokenType::At;
- case '$': advance(lexer); return TokenType::Dollar;
-
- }
-
- // TODO(tfoley): If we ever wanted to support proper Unicode
- // in identifiers, etc., then this would be the right place
- // to perform a more expensive dispatch based on the actual
- // code point (and not just the first byte).
-
- {
- // If none of the above cases matched, then we have an
- // unexpected/invalid character.
-
- auto loc = getSourceLoc(lexer);
- int c = advance(lexer); // consume the offending character
- if(!(effectiveFlags & kLexerFlag_IgnoreInvalid))
- {
- auto sink = lexer->sink;
- if(c >= 0x20 && c <= 0x7E) // printable ASCII: show the character itself
- {
- char buffer[] = { (char) c, 0 };
- sink->diagnose(loc, Diagnostics::illegalCharacterPrint, buffer);
- }
- else
- {
- // Fallback: print as hexadecimal
- sink->diagnose(loc, Diagnostics::illegalCharacterHex, String((unsigned char)c, 16));
- }
- }
-
- return TokenType::Invalid;
- }
- }
- // Returns the next token that is not skipped; `extraFlags` are combined with the lexer's own flags for each low-level scan made by this call.
- Token Lexer::lexToken(LexerFlags extraFlags)
- {
- auto& flags = this->tokenFlags; // alias: the updates below accumulate in `tokenFlags` across skipped tokens
- for(;;)
- {
- Token token;
- token.loc = getSourceLoc(this);
-
- char const* textBegin = cursor;
-
- auto tokenType = lexTokenImpl(this, this->lexerFlags | extraFlags);
-
- // The low-level lexer produces tokens for things we want
- // to ignore, such as white space, so we skip them here.
- switch(tokenType)
- {
- case TokenType::Invalid:
- flags = 0;
- continue;
-
- case TokenType::NewLine:
- flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
- continue;
-
- case TokenType::WhiteSpace:
- case TokenType::LineComment:
- case TokenType::BlockComment:
- flags |= TokenFlag::AfterWhitespace;
- continue;
-
- // We don't want to skip the end-of-file token, but we *do*
- // want to make sure it has appropriate flags to make our life easier
- case TokenType::EndOfFile:
- flags |= TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
- break;
-
- // We will also do some book-keeping around preprocessor directives here:
- //
- // If we see a `#` at the start of a line, then we are entering a
- // preprocessor directive.
- case TokenType::Pound:
- if((flags & TokenFlag::AtStartOfLine) != 0)
- lexerFlags |= kLexerFlag_InDirective;
- break;
- //
- // And if we saw an end-of-line during a directive, then we are
- // now leaving that directive.
- //
- case TokenType::EndOfDirective:
- lexerFlags &= ~kLexerFlag_InDirective;
- break;
-
- default:
- break;
- }
- // Only token types that were not skipped above reach this point.
- token.type = tokenType;
-
- char const* textEnd = cursor;
-
- // Note(tfoley): `StringBuilder::Append()` seems to crash when appending zero bytes
- if(textEnd != textBegin) // tokens that consumed no input keep their default `Content`
- {
- // "scrubbing" token value here to remove escaped newlines...
- //
- // Only perform this work if we encountered an escaped newline
- // while lexing this token (e.g., keep a flag on the lexer), or
- // do it on-demand when the actual value of the token is needed.
- if (tokenFlags & TokenFlag::ScrubbingNeeded)
- {
- // Allocate space that will always be more than enough for stripped contents
- char* startDst = (char*)memoryArena->allocateUnaligned(textEnd - textBegin);
- char* dst = startDst;
-
- auto tt = textBegin;
- while (tt != textEnd)
- {
- char c = *tt++;
- if (c == '\\')
- {
- char d = *tt; // NOTE(review): can read the byte at textEnd; presumably safe because the input is null-terminated — confirm
- switch (d)
- {
- case '\r': case '\n':
- {
- tt++;
- char e = *tt;
- if ((d ^ e) == ('\r' ^ '\n')) // second byte of a two-byte line ending
- {
- tt++;
- }
- }
- continue; // neither the backslash nor the line ending is copied
-
- default:
- break;
- }
- }
- *dst++ = c;
- }
- token.Content = UnownedStringSlice(startDst, dst);
- }
- else
- {
- token.Content = UnownedStringSlice(textBegin, textEnd); // no scrubbing needed: refer to the source text directly
- }
- }
-
- token.flags = flags;
- // The accumulated flags now belong to this token; start from zero for the next one.
- this->tokenFlags = 0;
-
- if (tokenType == TokenType::Identifier)
- {
- token.ptrValue = this->namePool->getName(token.Content);
- }
-
- return token;
- }
- }
-
- // Lexes the rest of the input into a token list. The end-of-file token is
- // added to the list as its last element.
- TokenList Lexer::lexAllTokens()
- {
-     TokenList tokenList;
-
-     bool reachedEnd = false;
-     while (!reachedEnd)
-     {
-         Token token = lexToken();
-         tokenList.mTokens.add(token);
-         reachedEnd = (token.type == TokenType::EndOfFile);
-     }
-
-     return tokenList;
- }
-}