4 files changed, 165 insertions, 154 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp
index ab60edd97..653c43dba 100644
--- a/source/compiler-core/slang-lexer.cpp
+++ b/source/compiler-core/slang-lexer.cpp
@@ -38,8 +38,9 @@ namespace Slang
     TokenReader::TokenReader()
         : m_cursor(nullptr)
         , m_end   (nullptr)
-    {}
-
+    {
+        _updateLookaheadToken();
+    }
 
     Token& TokenReader::peekToken()
     {
@@ -58,18 +59,33 @@ namespace Slang
 
     Token TokenReader::advanceToken()
     {
-        if (!m_cursor)
-            return getEndOfFileToken();
-
-        Token token = m_nextToken;
-        if (m_cursor < m_end)
-        {
+        Token result = m_nextToken;
+        if (m_cursor != m_end)
             m_cursor++;
-            m_nextToken = *m_cursor;
-        }
-        else
+        _updateLookaheadToken();
+        return result;
+    }
+
+    void TokenReader::_updateLookaheadToken()
+    {
+        // We assume here that we can read a token from a non-null `m_cursor`
+        // *even* in the case where `m_cursor == m_end`, because the invariant
+        // for lists of tokens is that they should be terminated with an
+        // end-of-file token, so that there is always a token "one past the end."
+        //
+        m_nextToken = m_cursor ? *m_cursor : getEndOfFileToken();
+
+        // If the token we read came from the end of the sub-sequence we are
+        // reading, then we will change the token type to an end-of-file token
+        // so that code that reads from the sequence and expects a terminating
+        // EOF will find it.
+        //
+        // TODO: We might eventually want a way to look at the actual token type
+        // and not just use EOF in all cases: e.g., when emitting diagnostic
+        // messages that include the token that is seen.
+        // 
+        if(m_cursor == m_end)
             m_nextToken.type = TokenType::EndOfFile;
-        return token;
     }
 
     // Lexer
@@ -78,8 +94,7 @@ namespace Slang
         SourceView*     sourceView,
         DiagnosticSink* sink,
         NamePool*       namePool,
-        MemoryArena*    memoryArena,
-        OptionFlags     optionFlags)
+        MemoryArena*    memoryArena)
     {
         m_sourceView  = sourceView;
         m_sink        = sink;
@@ -95,9 +110,12 @@ namespace Slang
         // Set the start location
         m_startLoc = sourceView->getRange().begin;
 
+        // The first token read from a translation unit should be considered to be at
+        // the start of a line, and *also* as coming after whitespace (conceptually
+        // both the end-of-file and beginning-of-file pseudo-tokens are whitespace).
+        //
         m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
         m_lexerFlags = 0;
-        m_optionFlags = optionFlags;
     }
 
     Lexer::~Lexer()
@@ -331,7 +349,7 @@ namespace Slang
         return lexer->m_startLoc + (lexer->m_cursor - lexer->m_begin);
     }
 
-    static void _lexDigits(Lexer* lexer, int base, LexerFlags flags)
+    static void _lexDigits(Lexer* lexer, int base)
     {
         for(;;)
         {
@@ -362,7 +380,7 @@ namespace Slang
 
             if(digitVal >= base)
             {
-                if (auto sink = lexer->getDiagnosticSink(flags))
+                if (auto sink = lexer->getDiagnosticSink())
                 {
                     char buffer[] = { (char) c, 0 };
                     sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::invalidDigitForBase, buffer, base);
@@ -418,7 +436,7 @@ namespace Slang
         return true;
     }
 
-    static bool _maybeLexNumberExponent(Lexer* lexer, int base, LexerFlags flags)
+    static bool _maybeLexNumberExponent(Lexer* lexer, int base)
     {
         if(!_isNumberExponent(_peek(lexer), base))
             return false;
@@ -436,37 +454,37 @@ namespace Slang
 
         // TODO(tfoley): it would be an error to not see digits here...
 
-        _lexDigits(lexer, 10, flags);
+        _lexDigits(lexer, 10);
 
         return true;
     }
 
-    static TokenType _lexNumberAfterDecimalPoint(Lexer* lexer, int base, LexerFlags flags)
+    static TokenType _lexNumberAfterDecimalPoint(Lexer* lexer, int base)
     {
-        _lexDigits(lexer, base, flags);
-        _maybeLexNumberExponent(lexer, base, flags);
+        _lexDigits(lexer, base);
+        _maybeLexNumberExponent(lexer, base);
 
         return _maybeLexNumberSuffix(lexer, TokenType::FloatingPointLiteral);
     }
 
-    static TokenType _lexNumber(Lexer* lexer, int base, LexerFlags flags)
+    static TokenType _lexNumber(Lexer* lexer, int base)
     {
         // TODO(tfoley): Need to consider whether to allow any kind of digit separator character.
 
         TokenType tokenType = TokenType::IntegerLiteral;
 
         // At the start of things, we just concern ourselves with digits
-        _lexDigits(lexer, base, flags);
+        _lexDigits(lexer, base);
 
         if( _peek(lexer) == '.' )
         {
             tokenType = TokenType::FloatingPointLiteral;
 
             _advance(lexer);
-            _lexDigits(lexer, base, flags);
+            _lexDigits(lexer, base);
         }
 
-        if( _maybeLexNumberExponent(lexer, base, flags))
+        if( _maybeLexNumberExponent(lexer, base))
         {
             tokenType = TokenType::FloatingPointLiteral;
         }
@@ -669,7 +687,7 @@ namespace Slang
         return value;
     }
 
-    static void _lexStringLiteralBody(Lexer* lexer, char quote, LexerFlags flags)
+    static void _lexStringLiteralBody(Lexer* lexer, char quote)
     {
         for(;;)
         {
@@ -683,14 +701,14 @@ namespace Slang
             switch(c)
             {
             case kEOF:
-                if (auto sink = lexer->getDiagnosticSink(flags))
+                if (auto sink = lexer->getDiagnosticSink())
                 {
                     sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral);
                 }
                 return;
 
             case '\n': case '\r':
-                if (auto sink = lexer->getDiagnosticSink(flags))
+                if (auto sink = lexer->getDiagnosticSink())
                 {
                     sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::newlineInLiteral);
                 }
@@ -907,41 +925,17 @@ namespace Slang
         return String(content.begin() + 1, content.end() - 1); 
     }
 
-
-
-    static TokenType _lexTokenImpl(Lexer* lexer, LexerFlags effectiveFlags)
+    static TokenType _lexTokenImpl(Lexer* lexer)
     {
-        if(effectiveFlags & kLexerFlag_ExpectDirectiveMessage)
-        {
-            for(;;)
-            {
-                switch(_peek(lexer))
-                {
-                default:
-                    _advance(lexer);
-                    continue;
-
-                case kEOF: case '\r': case '\n':
-                    break;
-                }
-                break;
-            }
-            return TokenType::DirectiveMessage;
-        }
-
         switch(_peek(lexer))
         {
         default:
             break;
 
         case kEOF:
-            if((effectiveFlags & kLexerFlag_InDirective) != 0)
-                return TokenType::EndOfDirective;
             return TokenType::EndOfFile;
 
         case '\r': case '\n':
-            if((effectiveFlags & kLexerFlag_InDirective) != 0)
-                return TokenType::EndOfDirective;
             _handleNewLine(lexer);
             return TokenType::NewLine;
 
@@ -955,7 +949,7 @@ namespace Slang
             {
             case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7': case '8': case '9':
-                return _lexNumberAfterDecimalPoint(lexer, 10, effectiveFlags);
+                return _lexNumberAfterDecimalPoint(lexer, 10);
 
             // TODO(tfoley): handle ellipsis (`...`)
 
@@ -965,7 +959,7 @@ namespace Slang
 
         case '1': case '2': case '3': case '4':
         case '5': case '6': case '7': case '8': case '9':
-            return _lexNumber(lexer, 10, effectiveFlags);
+            return _lexNumber(lexer, 10);
 
         case '0':
             {
@@ -978,23 +972,23 @@ namespace Slang
 
                 case '.':
                     _advance(lexer);
-                    return _lexNumberAfterDecimalPoint(lexer, 10, effectiveFlags);
+                    return _lexNumberAfterDecimalPoint(lexer, 10);
 
                 case 'x': case 'X':
                     _advance(lexer);
-                    return _lexNumber(lexer, 16, effectiveFlags);
+                    return _lexNumber(lexer, 16);
 
                 case 'b': case 'B':
                     _advance(lexer);
-                    return _lexNumber(lexer, 2, effectiveFlags);
+                    return _lexNumber(lexer, 2);
 
                 case '0': case '1': case '2': case '3': case '4':
                 case '5': case '6': case '7': case '8': case '9':
-                    if (auto sink = lexer->getDiagnosticSink(effectiveFlags))
+                    if (auto sink = lexer->getDiagnosticSink())
                     {
                         sink->diagnose(loc, LexerDiagnostics::octalLiteral);
                     }
-                    return _lexNumber(lexer, 8, effectiveFlags);
+                    return _lexNumber(lexer, 8);
                 }
             }
 
@@ -1016,12 +1010,12 @@ namespace Slang
 
         case '\"':
             _advance(lexer);
-            _lexStringLiteralBody(lexer, '\"', effectiveFlags);
+            _lexStringLiteralBody(lexer, '\"');
             return TokenType::StringLiteral;
 
         case '\'':
             _advance(lexer);
-            _lexStringLiteralBody(lexer, '\'', effectiveFlags);
+            _lexStringLiteralBody(lexer, '\'');
             return TokenType::CharLiteral;
 
         case '+':
@@ -1202,7 +1196,7 @@ namespace Slang
             auto loc = _getSourceLoc(lexer);
             int c = _advance(lexer);
 
-            if (auto sink = lexer->getDiagnosticSink(effectiveFlags))
+            if (auto sink = lexer->getDiagnosticSink())
             {
                 if(c >= 0x20 && c <=  0x7E)
                 {
@@ -1220,9 +1214,8 @@ namespace Slang
         }
     }
 
-    Token Lexer::lexToken(LexerFlags extraFlags)
+    Token Lexer::lexToken()
     {
-        auto& flags = m_tokenFlags;
         for(;;)
         {
             Token token;
@@ -1230,73 +1223,54 @@ namespace Slang
 
             char const* textBegin = m_cursor;
 
-            auto tokenType = _lexTokenImpl(this, m_lexerFlags | extraFlags);
+            auto tokenType = _lexTokenImpl(this);
 
-            // The low-level lexer produces tokens for things we want
-            // to ignore, such as white space, so we skip them here.
+            // The flags on the token we just lexed will be based
+            // on the current state of the lexer.
+            //
+            auto tokenFlags = m_tokenFlags;
+            //
+            // Depending on what kind of token we just lexed, the
+            // flags that will be used for the *next* token might
+            // need to be updated.
+            //
             switch(tokenType)
             {
-            case TokenType::Invalid:
-                flags = 0;
-                continue;
-
             case TokenType::NewLine:
-                flags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
-                continue;
+                {
+                    // If we just reached the end of a line, then the next token
+                    // should count as being at the start of a line, and also after
+                    // whitespace.
+                    //
+                    m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
+                    break;
+                }
 
             case TokenType::WhiteSpace:
-            {
-                flags |= TokenFlag::AfterWhitespace;
-                continue;
-            }
             case TokenType::BlockComment:
             case TokenType::LineComment:
-            {
-                flags |= TokenFlag::AfterWhitespace;
-                if (m_optionFlags & OptionFlag::TokenizeComments)
                 {
-                    // We don't break here, and use the normal token adding logic
-                    // because we want the behavior to be identical (in terms of flags etc)
-                    // as if TokenizeComments is not enabled
-                    char const* textEnd = m_cursor;
-
-                    token.type =  tokenType;
-                    token.flags = m_tokenFlags;
-                    token.setContent(UnownedStringSlice(textBegin, textEnd));
-
-                    return token;
+                    // True horizontal whitespace and comments both count as whitespace.
+                    //
+                    // Note that a line comment does not include the terminating newline,
+                    // so we do not need to set `AtStartOfLine` here.
+                    //
+                    m_tokenFlags |= TokenFlag::AfterWhitespace;
+                    break;
                 }
-
-                continue;
-            }
             
-            // We don't want to skip the end-of-file token, but we *do*
-            // want to make sure it has appropriate flags to make our life easier
-            case TokenType::EndOfFile:
-                flags |= TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
-                break;
-
-            // We will also do some book-keeping around preprocessor directives here:
-            //
-            // If we see a `#` at the start of a line, then we are entering a
-            // preprocessor directive.
-            case TokenType::Pound:
-                if((flags & TokenFlag::AtStartOfLine) != 0)
-                    m_lexerFlags |= kLexerFlag_InDirective;
-                break;
-            //
-            // And if we saw an end-of-line during a directive, then we are
-            // now leaving that directive.
-            //
-            case TokenType::EndOfDirective:
-                m_lexerFlags &= ~kLexerFlag_InDirective;
-                break;
-
             default:
-                break;
+                {
+                    // If we read some token other than the above cases, then we are
+                    // neither after whitespace nor at the start of a line.
+                    //
+                    m_tokenFlags = 0;
+                    break;
+                }
             }
 
             token.type =  tokenType;
+            token.flags = tokenFlags;
 
             char const* textEnd = m_cursor;
 
@@ -1308,7 +1282,7 @@ namespace Slang
                 // Only perform this work if we encountered an escaped newline
                 // while lexing this token (e.g., keep a flag on the lexer), or
                 // do it on-demand when the actual value of the token is needed.
-                if (m_tokenFlags & TokenFlag::ScrubbingNeeded)
+                if (tokenFlags & TokenFlag::ScrubbingNeeded)
                 {
                     // Allocate space that will always be more than enough for stripped contents
                     char* startDst = (char*)m_memoryArena->allocateUnaligned(textEnd - textBegin);
@@ -1348,10 +1322,6 @@ namespace Slang
                 }
             }
 
-            token.flags = flags;
-
-            m_tokenFlags = 0;
-
             if (tokenType == TokenType::Identifier)
             {
                 token.setName(m_namePool->getName(token.getContent()));
@@ -1361,14 +1331,52 @@ namespace Slang
         }
     }
 
-    TokenList Lexer::lexAllTokens()
+    TokenList Lexer::lexAllSemanticTokens()
     {
         TokenList tokenList;
         for(;;)
         {
             Token token = lexToken();
+
+            // We are only interested in tokens that are semantically
+            // significant, so we will skip over forms of whitespace
+            // and comments.
+            //
+            switch( token.type )
+            {
+            default:
+                break;
+
+            case TokenType::WhiteSpace:
+            case TokenType::BlockComment:
+            case TokenType::LineComment:
+            case TokenType::NewLine:
+                continue;
+            }
+
             tokenList.add(token);
+            if(token.type == TokenType::EndOfFile)
+                return tokenList;
+        }
+    }
+
+    TokenList Lexer::lexAllMarkupTokens()
+    {
+        TokenList tokenList;
+        for(;;)
+        {
+            Token token = lexToken();
+            switch( token.type )
+            {
+            default:
+                break;
 
+            case TokenType::WhiteSpace:
+            case TokenType::NewLine:
+                continue;
+            }
+
+            tokenList.add(token);
             if(token.type == TokenType::EndOfFile)
                 return tokenList;
         }
diff --git a/source/compiler-core/slang-lexer.h b/source/compiler-core/slang-lexer.h
index 3c8d4ca37..23458d396 100644
--- a/source/compiler-core/slang-lexer.h
+++ b/source/compiler-core/slang-lexer.h
@@ -45,13 +45,21 @@ namespace Slang
         explicit TokenReader(TokenSpan const& tokens)
             : m_cursor(tokens.begin())
             , m_end   (tokens.end  ())
-            , m_nextToken(tokens.begin() ? *tokens.begin() : getEndOfFileToken())
-        {}
+        {
+            _updateLookaheadToken();
+        }
         explicit TokenReader(TokenList const& tokens)
             : m_cursor(tokens.begin())
             , m_end   (tokens.end  ())
-            , m_nextToken(tokens.begin() ? *tokens.begin() : getEndOfFileToken())
-        {}
+        {
+            _updateLookaheadToken();
+        }
+        explicit TokenReader(Token const* begin, Token const* end)
+            : m_cursor(begin)
+            , m_end   (end)
+        {
+            _updateLookaheadToken();
+        }
         struct ParsingCursor
         {
             Token nextToken;
@@ -85,34 +93,25 @@ namespace Slang
         const Token* m_cursor;
         const Token* m_end;
         static Token getEndOfFileToken();
+
+    private:
+            /// Update the lookahead token in `m_nextToken` to reflect the cursor state
+        void _updateLookaheadToken();
     };
 
     typedef unsigned int LexerFlags;
     enum
     {
-        kLexerFlag_InDirective              = 1 << 0, ///< Turn end-of-line and end-of-file into end-of-directive
-        kLexerFlag_ExpectFileName           = 1 << 1, ///< Support `<>` style strings for file paths
-        kLexerFlag_IgnoreInvalid            = 1 << 2, ///< Suppress errors about invalid/unsupported characters
-        kLexerFlag_ExpectDirectiveMessage   = 1 << 3, ///< Don't lexer ordinary tokens, and instead consume rest of line as a string
+        kLexerFlag_SuppressDiagnostics      = 1 << 2, ///< Suppress lexer diagnostics: `getDiagnosticSink()` returns null when set
     };
 
     struct Lexer
     {
-        typedef uint32_t OptionFlags;
-        struct OptionFlag
-        {
-            enum Enum : OptionFlags
-            {
-                TokenizeComments         = 1 << 0, ///< If set comments will be output to the token stream
-            };
-        };
-
         void initialize(
             SourceView*     sourceView,
             DiagnosticSink* sink,
             NamePool*       namePool,
-            MemoryArena*    memoryArena,
-            OptionFlags     optionFlags = 0);
+            MemoryArena*    memoryArena);
 
         ~Lexer();
 
@@ -126,12 +125,20 @@ namespace Slang
             /// not needed by the DiagnosticSink.
         static UnownedStringSlice sourceLocationLexer(const UnownedStringSlice& in);
 
-        Token lexToken(LexerFlags extraFlags = 0);
+            /// Lex the next token in the input stream, returning an EOF token if at end.
+        Token lexToken();
 
-        TokenList lexAllTokens();
+            /// Lex all tokens (up to the end of the stream) that are semantically relevant
+        TokenList lexAllSemanticTokens();
 
-            /// Get the diagnostic sink, taking into account flags. Can return nullptr if ignoring invalid
-        DiagnosticSink* getDiagnosticSink(LexerFlags flags) { return ((flags & kLexerFlag_IgnoreInvalid) == 0) ? m_sink : nullptr; }
+            /// Lex all tokens (up to the end of the stream) that are relevant to things like markup
+        TokenList lexAllMarkupTokens();
+
+            /// Get the diagnostic sink, taking into account flags. Will return null if suppressing diagnostics.
+        DiagnosticSink* getDiagnosticSink()
+        {
+            return ((m_lexerFlags & kLexerFlag_SuppressDiagnostics) == 0) ? m_sink : nullptr;
+        }
 
         SourceView*     m_sourceView;
         DiagnosticSink* m_sink;
@@ -147,7 +154,6 @@ namespace Slang
 
         TokenFlags      m_tokenFlags;
         LexerFlags      m_lexerFlags;
-        OptionFlags     m_optionFlags;
 
         MemoryArena*    m_memoryArena;
     };
diff --git a/source/compiler-core/slang-token-defs.h b/source/compiler-core/slang-token-defs.h
index 6cece330e..485429e28 100644
--- a/source/compiler-core/slang-token-defs.h
+++ b/source/compiler-core/slang-token-defs.h
@@ -18,7 +18,6 @@
 
 TOKEN(Unknown,          "<unknown>")
 TOKEN(EndOfFile,        "end of file")
-TOKEN(EndOfDirective,   "end of line")
 TOKEN(Invalid,          "invalid character")
 TOKEN(Identifier,       "identifier")
 TOKEN(IntegerLiteral,   "integer literal")
@@ -26,10 +25,9 @@ TOKEN(FloatingPointLiteral,    "floating-point literal")
 TOKEN(StringLiteral,    "string literal")
 TOKEN(CharLiteral,      "character literal")
 TOKEN(WhiteSpace,       "whitespace")
-TOKEN(NewLine,          "newline")
+TOKEN(NewLine,          "end of line")
 TOKEN(LineComment,      "line comment")
 TOKEN(BlockComment,     "block comment")
-TOKEN(DirectiveMessage, "user-defined message")
 
 #define PUNCTUATION(id, text) \
     TOKEN(id, "'" text "'")
diff --git a/source/compiler-core/slang-token.h b/source/compiler-core/slang-token.h
index 9697a5c2d..7feda6824 100644
--- a/source/compiler-core/slang-token.h
+++ b/source/compiler-core/slang-token.h
@@ -26,9 +26,8 @@ struct TokenFlag
     {
         AtStartOfLine           = 1 << 0,
         AfterWhitespace         = 1 << 1,
-        SuppressMacroExpansion  = 1 << 2,
-        ScrubbingNeeded         = 1 << 3,
-        Name                    = 1 << 4,           ///< Determines if 'name' is set or 'chars' in the charsNameUnion
+        ScrubbingNeeded         = 1 << 2,
+        Name                    = 1 << 3,           ///< Determines if 'name' is set or 'chars' in the charsNameUnion
     };
 };