summaryrefslogtreecommitdiff
path: root/source/compiler-core
diff options
context:
space:
mode:
Diffstat (limited to 'source/compiler-core')
-rw-r--r-- source/compiler-core/slang-lexer.cpp 256
-rw-r--r-- source/compiler-core/slang-lexer.h 54
-rw-r--r-- source/compiler-core/slang-token-defs.h 4
-rw-r--r-- source/compiler-core/slang-token.h 5
4 files changed, 165 insertions, 154 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp
index ab60edd97..653c43dba 100644
--- a/source/compiler-core/slang-lexer.cpp
+++ b/source/compiler-core/slang-lexer.cpp
@@ -38,8 +38,9 @@ namespace Slang
TokenReader::TokenReader()
: m_cursor(nullptr)
, m_end (nullptr)
- {}
-
+ {
+ _updateLookaheadToken();
+ }
Token& TokenReader::peekToken()
{
@@ -58,18 +59,33 @@ namespace Slang
Token TokenReader::advanceToken()
{
- if (!m_cursor)
- return getEndOfFileToken();
-
- Token token = m_nextToken;
- if (m_cursor < m_end)
- {
+ Token result = m_nextToken;
+ if (m_cursor != m_end)
m_cursor++;
- m_nextToken = *m_cursor;
- }
- else
+ _updateLookaheadToken();
+ return result;
+ }
+
+ void TokenReader::_updateLookaheadToken()
+ {
+ // We assume here that we can read a token from a non-null `m_cursor`
+ // *even* in the case where `m_cursor == m_end`, because the invariant
+ // for lists of tokens is that they should be terminated with an
+ // end-of-file token, so that there is always a token "one past the end."
+ //
+ m_nextToken = m_cursor ? *m_cursor : getEndOfFileToken();
+
+ // If the token we read came from the end of the sub-sequence we are
+ // reading, then we will change the token type to an end-of-file token
+ // so that code that reads from the sequence and expects a terminating
+ // EOF will find it.
+ //
+ // TODO: We might eventually want a way to look at the actual token type
+ // and not just use EOF in all cases: e.g., when emitting diagnostic
+ // messages that include the token that is seen.
+ //
+ if(m_cursor == m_end)
m_nextToken.type = TokenType::EndOfFile;
- return token;
}
// Lexer
@@ -78,8 +94,7 @@ namespace Slang
SourceView* sourceView,
DiagnosticSink* sink,
NamePool* namePool,
- MemoryArena* memoryArena,
- OptionFlags optionFlags)
+ MemoryArena* memoryArena)
{
m_sourceView = sourceView;
m_sink = sink;
@@ -95,9 +110,12 @@ namespace Slang
// Set the start location
m_startLoc = sourceView->getRange().begin;
+ // The first token read from a translation unit should be considered to be at
+ // the start of a line, and *also* as coming after whitespace (conceptually
+ // both the end-of-file and beginning-of-file pseudo-tokens are whitespace).
+ //
m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
m_lexerFlags = 0;
- m_optionFlags = optionFlags;
}
Lexer::~Lexer()
@@ -331,7 +349,7 @@ namespace Slang
return lexer->m_startLoc + (lexer->m_cursor - lexer->m_begin);
}
- static void _lexDigits(Lexer* lexer, int base, LexerFlags flags)
+ static void _lexDigits(Lexer* lexer, int base)
{
for(;;)
{
@@ -362,7 +380,7 @@ namespace Slang
if(digitVal >= base)
{
- if (auto sink = lexer->getDiagnosticSink(flags))
+ if (auto sink = lexer->getDiagnosticSink())
{
char buffer[] = { (char) c, 0 };
sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::invalidDigitForBase, buffer, base);
@@ -418,7 +436,7 @@ namespace Slang
return true;
}
- static bool _maybeLexNumberExponent(Lexer* lexer, int base, LexerFlags flags)
+ static bool _maybeLexNumberExponent(Lexer* lexer, int base)
{
if(!_isNumberExponent(_peek(lexer), base))
return false;
@@ -436,37 +454,37 @@ namespace Slang
// TODO(tfoley): it would be an error to not see digits here...
- _lexDigits(lexer, 10, flags);
+ _lexDigits(lexer, 10);
return true;
}
- static TokenType _lexNumberAfterDecimalPoint(Lexer* lexer, int base, LexerFlags flags)
+ static TokenType _lexNumberAfterDecimalPoint(Lexer* lexer, int base)
{
- _lexDigits(lexer, base, flags);
- _maybeLexNumberExponent(lexer, base, flags);
+ _lexDigits(lexer, base);
+ _maybeLexNumberExponent(lexer, base);
return _maybeLexNumberSuffix(lexer, TokenType::FloatingPointLiteral);
}
- static TokenType _lexNumber(Lexer* lexer, int base, LexerFlags flags)
+ static TokenType _lexNumber(Lexer* lexer, int base)
{
// TODO(tfoley): Need to consider whether to allow any kind of digit separator character.
TokenType tokenType = TokenType::IntegerLiteral;
// At the start of things, we just concern ourselves with digits
- _lexDigits(lexer, base, flags);
+ _lexDigits(lexer, base);
if( _peek(lexer) == '.' )
{
tokenType = TokenType::FloatingPointLiteral;
_advance(lexer);
- _lexDigits(lexer, base, flags);
+ _lexDigits(lexer, base);
}
- if( _maybeLexNumberExponent(lexer, base, flags))
+ if( _maybeLexNumberExponent(lexer, base))
{
tokenType = TokenType::FloatingPointLiteral;
}
@@ -669,7 +687,7 @@ namespace Slang
return value;
}
- static void _lexStringLiteralBody(Lexer* lexer, char quote, LexerFlags flags)
+ static void _lexStringLiteralBody(Lexer* lexer, char quote)
{
for(;;)
{
@@ -683,14 +701,14 @@ namespace Slang
switch(c)
{
case kEOF:
- if (auto sink = lexer->getDiagnosticSink(flags))
+ if (auto sink = lexer->getDiagnosticSink())
{
sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral);
}
return;
case '\n': case '\r':
- if (auto sink = lexer->getDiagnosticSink(flags))
+ if (auto sink = lexer->getDiagnosticSink())
{
sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::newlineInLiteral);
}
@@ -907,41 +925,17 @@ namespace Slang
return String(content.begin() + 1, content.end() - 1);
}
-
-
- static TokenType _lexTokenImpl(Lexer* lexer, LexerFlags effectiveFlags)
+ static TokenType _lexTokenImpl(Lexer* lexer)
{
- if(effectiveFlags & kLexerFlag_ExpectDirectiveMessage)
- {
- for(;;)
- {
- switch(_peek(lexer))
- {
- default:
- _advance(lexer);
- continue;
-
- case kEOF: case '\r': case '\n':
- break;
- }
- break;
- }
- return TokenType::DirectiveMessage;
- }
-
switch(_peek(lexer))
{
default:
break;
case kEOF:
- if((effectiveFlags & kLexerFlag_InDirective) != 0)
- return TokenType::EndOfDirective;
return TokenType::EndOfFile;
case '\r': case '\n':
- if((effectiveFlags & kLexerFlag_InDirective) != 0)
- return TokenType::EndOfDirective;
_handleNewLine(lexer);
return TokenType::NewLine;
@@ -955,7 +949,7 @@ namespace Slang
{
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- return _lexNumberAfterDecimalPoint(lexer, 10, effectiveFlags);
+ return _lexNumberAfterDecimalPoint(lexer, 10);
// TODO(tfoley): handle ellipsis (`...`)
@@ -965,7 +959,7 @@ namespace Slang
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- return _lexNumber(lexer, 10, effectiveFlags);
+ return _lexNumber(lexer, 10);
case '0':
{
@@ -978,23 +972,23 @@ namespace Slang
case '.':
_advance(lexer);
- return _lexNumberAfterDecimalPoint(lexer, 10, effectiveFlags);
+ return _lexNumberAfterDecimalPoint(lexer, 10);
case 'x': case 'X':
_advance(lexer);
- return _lexNumber(lexer, 16, effectiveFlags);
+ return _lexNumber(lexer, 16);
case 'b': case 'B':
_advance(lexer);
- return _lexNumber(lexer, 2, effectiveFlags);
+ return _lexNumber(lexer, 2);
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- if (auto sink = lexer->getDiagnosticSink(effectiveFlags))
+ if (auto sink = lexer->getDiagnosticSink())
{
sink->diagnose(loc, LexerDiagnostics::octalLiteral);
}
- return _lexNumber(lexer, 8, effectiveFlags);
+ return _lexNumber(lexer, 8);
}
}
@@ -1016,12 +1010,12 @@ namespace Slang
case '\"':
_advance(lexer);
- _lexStringLiteralBody(lexer, '\"', effectiveFlags);
+ _lexStringLiteralBody(lexer, '\"');
return TokenType::StringLiteral;
case '\'':
_advance(lexer);
- _lexStringLiteralBody(lexer, '\'', effectiveFlags);
+ _lexStringLiteralBody(lexer, '\'');
return TokenType::CharLiteral;
case '+':
@@ -1202,7 +1196,7 @@ namespace Slang
auto loc = _getSourceLoc(lexer);
int c = _advance(lexer);
- if (auto sink = lexer->getDiagnosticSink(effectiveFlags))
+ if (auto sink = lexer->getDiagnosticSink())
{
if(c >= 0x20 && c <= 0x7E)
{
@@ -1220,9 +1214,8 @@ namespace Slang
}
}
- Token Lexer::lexToken(LexerFlags extraFlags)
+ Token Lexer::lexToken()
{
- auto& flags = m_tokenFlags;
for(;;)
{
Token token;
@@ -1230,73 +1223,54 @@ namespace Slang
char const* textBegin = m_cursor;
- auto tokenType = _lexTokenImpl(this, m_lexerFlags | extraFlags);
+ auto tokenType = _lexTokenImpl(this);
- // The low-level lexer produces tokens for things we want
- // to ignore, such as white space, so we skip them here.
+ // The flags on the token we just lexed will be based
+ // on the current state of the lexer.
+ //
+ auto tokenFlags = m_tokenFlags;
+ //
+ // Depending on what kind of token we just lexed, the
+ // flags that will be used for the *next* token might
+ // need to be updated.
+ //
switch(tokenType)
{
- case TokenType::Invalid:
- flags = 0;
- continue;
-
case TokenType::NewLine:
- flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
- continue;
+ {
+ // If we just reached the end of a line, then the next token
+ // should count as being at the start of a line, and also after
+ // whitespace.
+ //
+ m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
+ break;
+ }
case TokenType::WhiteSpace:
- {
- flags |= TokenFlag::AfterWhitespace;
- continue;
- }
case TokenType::BlockComment:
case TokenType::LineComment:
- {
- flags |= TokenFlag::AfterWhitespace;
- if (m_optionFlags & OptionFlag::TokenizeComments)
{
- // We don't break here, and use the normal token adding logic
- // because we want the behavior to be identical (in terms of flags etc)
- // as if TokenizeComments is not enabled
- char const* textEnd = m_cursor;
-
- token.type = tokenType;
- token.flags = m_tokenFlags;
- token.setContent(UnownedStringSlice(textBegin, textEnd));
-
- return token;
+ // True horizontal whitespace and comments both count as whitespace.
+ //
+ // Note that a line comment does not include the terminating newline,
+ // so we do not need to set `AtStartOfLine` here.
+ //
+ m_tokenFlags |= TokenFlag::AfterWhitespace;
+ break;
}
-
- continue;
- }
- // We don't want to skip the end-of-file token, but we *do*
- // want to make sure it has appropriate flags to make our life easier
- case TokenType::EndOfFile:
- flags |= TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
- break;
-
- // We will also do some book-keeping around preprocessor directives here:
- //
- // If we see a `#` at the start of a line, then we are entering a
- // preprocessor directive.
- case TokenType::Pound:
- if((flags & TokenFlag::AtStartOfLine) != 0)
- m_lexerFlags |= kLexerFlag_InDirective;
- break;
- //
- // And if we saw an end-of-line during a directive, then we are
- // now leaving that directive.
- //
- case TokenType::EndOfDirective:
- m_lexerFlags &= ~kLexerFlag_InDirective;
- break;
-
default:
- break;
+ {
+ // If we read some token other than the above cases, then we are
+ // neither after whitespace nor at the start of a line.
+ //
+ m_tokenFlags = 0;
+ break;
+ }
}
token.type = tokenType;
+ token.flags = tokenFlags;
char const* textEnd = m_cursor;
@@ -1308,7 +1282,7 @@ namespace Slang
// Only perform this work if we encountered an escaped newline
// while lexing this token (e.g., keep a flag on the lexer), or
// do it on-demand when the actual value of the token is needed.
- if (m_tokenFlags & TokenFlag::ScrubbingNeeded)
+ if (tokenFlags & TokenFlag::ScrubbingNeeded)
{
// Allocate space that will always be more than enough for stripped contents
char* startDst = (char*)m_memoryArena->allocateUnaligned(textEnd - textBegin);
@@ -1348,10 +1322,6 @@ namespace Slang
}
}
- token.flags = flags;
-
- m_tokenFlags = 0;
-
if (tokenType == TokenType::Identifier)
{
token.setName(m_namePool->getName(token.getContent()));
@@ -1361,14 +1331,52 @@ namespace Slang
}
}
- TokenList Lexer::lexAllTokens()
+ TokenList Lexer::lexAllSemanticTokens()
{
TokenList tokenList;
for(;;)
{
Token token = lexToken();
+
+ // We are only interested in tokens that are semantically
+ // significant, so we will skip over forms of whitespace
+ // and comments.
+ //
+ switch( token.type )
+ {
+ default:
+ break;
+
+ case TokenType::WhiteSpace:
+ case TokenType::BlockComment:
+ case TokenType::LineComment:
+ case TokenType::NewLine:
+ continue;
+ }
+
tokenList.add(token);
+ if(token.type == TokenType::EndOfFile)
+ return tokenList;
+ }
+ }
+
+ TokenList Lexer::lexAllMarkupTokens()
+ {
+ TokenList tokenList;
+ for(;;)
+ {
+ Token token = lexToken();
+ switch( token.type )
+ {
+ default:
+ break;
+ case TokenType::WhiteSpace:
+ case TokenType::NewLine:
+ continue;
+ }
+
+ tokenList.add(token);
if(token.type == TokenType::EndOfFile)
return tokenList;
}
diff --git a/source/compiler-core/slang-lexer.h b/source/compiler-core/slang-lexer.h
index 3c8d4ca37..23458d396 100644
--- a/source/compiler-core/slang-lexer.h
+++ b/source/compiler-core/slang-lexer.h
@@ -45,13 +45,21 @@ namespace Slang
explicit TokenReader(TokenSpan const& tokens)
: m_cursor(tokens.begin())
, m_end (tokens.end ())
- , m_nextToken(tokens.begin() ? *tokens.begin() : getEndOfFileToken())
- {}
+ {
+ _updateLookaheadToken();
+ }
explicit TokenReader(TokenList const& tokens)
: m_cursor(tokens.begin())
, m_end (tokens.end ())
- , m_nextToken(tokens.begin() ? *tokens.begin() : getEndOfFileToken())
- {}
+ {
+ _updateLookaheadToken();
+ }
+ explicit TokenReader(Token const* begin, Token const* end)
+ : m_cursor(begin)
+ , m_end (end)
+ {
+ _updateLookaheadToken();
+ }
struct ParsingCursor
{
Token nextToken;
@@ -85,34 +93,25 @@ namespace Slang
const Token* m_cursor;
const Token* m_end;
static Token getEndOfFileToken();
+
+ private:
+ /// Update the lookahead token in `m_nextToken` to reflect the cursor state
+ void _updateLookaheadToken();
};
typedef unsigned int LexerFlags;
enum
{
- kLexerFlag_InDirective = 1 << 0, ///< Turn end-of-line and end-of-file into end-of-directive
- kLexerFlag_ExpectFileName = 1 << 1, ///< Support `<>` style strings for file paths
- kLexerFlag_IgnoreInvalid = 1 << 2, ///< Suppress errors about invalid/unsupported characters
- kLexerFlag_ExpectDirectiveMessage = 1 << 3, ///< Don't lexer ordinary tokens, and instead consume rest of line as a string
+ kLexerFlag_SuppressDiagnostics = 1 << 2, ///< Suppress errors about invalid/unsupported characters
};
struct Lexer
{
- typedef uint32_t OptionFlags;
- struct OptionFlag
- {
- enum Enum : OptionFlags
- {
- TokenizeComments = 1 << 0, ///< If set comments will be output to the token stream
- };
- };
-
void initialize(
SourceView* sourceView,
DiagnosticSink* sink,
NamePool* namePool,
- MemoryArena* memoryArena,
- OptionFlags optionFlags = 0);
+ MemoryArena* memoryArena);
~Lexer();
@@ -126,12 +125,20 @@ namespace Slang
/// not needed by the DiagnosticSink.
static UnownedStringSlice sourceLocationLexer(const UnownedStringSlice& in);
- Token lexToken(LexerFlags extraFlags = 0);
+ /// Lex the next token in the input stream, returning an EOF token if at end.
+ Token lexToken();
- TokenList lexAllTokens();
+ /// Lex all tokens (up to the end of the stream) that are semantically relevant
+ TokenList lexAllSemanticTokens();
- /// Get the diagnostic sink, taking into account flags. Can return nullptr if ignoring invalid
- DiagnosticSink* getDiagnosticSink(LexerFlags flags) { return ((flags & kLexerFlag_IgnoreInvalid) == 0) ? m_sink : nullptr; }
+ /// Lex all tokens (up to the end of the stream) that are relevant to things like markup
+ TokenList lexAllMarkupTokens();
+
+ /// Get the diagnostic sink, taking into account flags. Will return null if suppressing diagnostics.
+ DiagnosticSink* getDiagnosticSink()
+ {
+ return ((m_lexerFlags & kLexerFlag_SuppressDiagnostics) == 0) ? m_sink : nullptr;
+ }
SourceView* m_sourceView;
DiagnosticSink* m_sink;
@@ -147,7 +154,6 @@ namespace Slang
TokenFlags m_tokenFlags;
LexerFlags m_lexerFlags;
- OptionFlags m_optionFlags;
MemoryArena* m_memoryArena;
};
diff --git a/source/compiler-core/slang-token-defs.h b/source/compiler-core/slang-token-defs.h
index 6cece330e..485429e28 100644
--- a/source/compiler-core/slang-token-defs.h
+++ b/source/compiler-core/slang-token-defs.h
@@ -18,7 +18,6 @@
TOKEN(Unknown, "<unknown>")
TOKEN(EndOfFile, "end of file")
-TOKEN(EndOfDirective, "end of line")
TOKEN(Invalid, "invalid character")
TOKEN(Identifier, "identifier")
TOKEN(IntegerLiteral, "integer literal")
@@ -26,10 +25,9 @@ TOKEN(FloatingPointLiteral, "floating-point literal")
TOKEN(StringLiteral, "string literal")
TOKEN(CharLiteral, "character literal")
TOKEN(WhiteSpace, "whitespace")
-TOKEN(NewLine, "newline")
+TOKEN(NewLine, "end of line")
TOKEN(LineComment, "line comment")
TOKEN(BlockComment, "block comment")
-TOKEN(DirectiveMessage, "user-defined message")
#define PUNCTUATION(id, text) \
TOKEN(id, "'" text "'")
diff --git a/source/compiler-core/slang-token.h b/source/compiler-core/slang-token.h
index 9697a5c2d..7feda6824 100644
--- a/source/compiler-core/slang-token.h
+++ b/source/compiler-core/slang-token.h
@@ -26,9 +26,8 @@ struct TokenFlag
{
AtStartOfLine = 1 << 0,
AfterWhitespace = 1 << 1,
- SuppressMacroExpansion = 1 << 2,
- ScrubbingNeeded = 1 << 3,
- Name = 1 << 4, ///< Determines if 'name' is set or 'chars' in the charsNameUnion
+ ScrubbingNeeded = 1 << 2,
+ Name = 1 << 3, ///< Determines if 'name' is set or 'chars' in the charsNameUnion
};
};