diff options
| author | Yong He <yonghe@outlook.com> | 2025-06-18 01:38:29 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-06-18 08:38:29 +0000 |
| commit | 8c100305c8e1ac29b008e7cb95c0498b6adf197c (patch) | |
| tree | 50c5e52a417f22cc66ad2b85dcc10f2ce9bc1dc5 | |
| parent | 4d517794eaac7dfe6196e9a36d709d66c5720492 (diff) | |
Fix out of bound buffer access in the preprocessor. (#7475)
* Fix out of bound buffer access in the preprocessor.
* Fix test regression.
---------
Co-authored-by: Jay Kwak <82421531+jkwak-work@users.noreply.github.com>
| -rw-r--r-- | source/compiler-core/slang-lexer.cpp | 75 |
1 file changed, 60 insertions, 15 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp index 358a0ceff..119c34d48 100644 --- a/source/compiler-core/slang-lexer.cpp +++ b/source/compiler-core/slang-lexer.cpp @@ -181,7 +181,7 @@ static int _peek(Lexer* lexer, int offset = 0) do { - if (lexer->m_cursor + pos == lexer->m_end) + if (lexer->m_cursor + pos >= lexer->m_end) return kEOF; c = lexer->m_cursor[pos++]; @@ -219,11 +219,20 @@ static int _peek(Lexer* lexer, int offset = 0) { // Consume all unicode characters. pos--; - c = getUnicodePointFromUTF8([&]() { return lexer->m_cursor[pos++]; }); + c = getUnicodePointFromUTF8( + [&]() + { + if (lexer->m_cursor + pos >= lexer->m_end) + return (char)0; + return lexer->m_cursor[pos++]; + }); } // Default case is to just hand along the byte we read as an ASCII code point. } while (offset--); + // If we encounter a \0, return kEOF. + // if (c == 0) + // return kEOF; return c; } @@ -235,7 +244,7 @@ static int _advance(Lexer* lexer) for (;;) { // If we are at the end of the input, then the task is easy. - if (lexer->m_cursor == lexer->m_end) + if (lexer->m_cursor >= lexer->m_end) return kEOF; // Look at the next raw byte, and decide what to do @@ -269,10 +278,26 @@ static int _advance(Lexer* lexer) } // Consume all unicode characters. + bool isInvalidStream = false; if (isUtf8LeadingByte((Byte)c)) { lexer->m_cursor--; - c = getUnicodePointFromUTF8([&]() { return *lexer->m_cursor++; }); + c = getUnicodePointFromUTF8( + [&]() + { + if (lexer->m_cursor >= lexer->m_end) + { + isInvalidStream = true; + return (char)0; + } + return *lexer->m_cursor++; + }); + } + + // If we encounter a \0, return kEOF, and move stream cursor to the end. + if (c == 0 || isInvalidStream) + { + lexer->m_cursor = lexer->m_end; } // Default case is to return the raw byte we saw. @@ -280,6 +305,24 @@ static int _advance(Lexer* lexer) } } +static const int kMaxLexErrorCount = 100; + +template<typename P, typename... 
Args> +static void diagnose( + DiagnosticSink* sink, + const P& loc, + const DiagnosticInfo& info, + const Args&... args) +{ + if (!sink) + return; + + // Cap max errors to avoid flooding the sink memory. + if (sink->getErrorCount() > kMaxLexErrorCount) + return; + sink->diagnose(loc, info, args...); +} + static void _handleNewLine(Lexer* lexer) { int c = _advance(lexer); @@ -439,7 +482,8 @@ static void _lexDigits(Lexer* lexer, int base) if (auto sink = lexer->getDiagnosticSink()) { char buffer[] = {(char)c, 0}; - sink->diagnose( + diagnose( + sink, _getSourceLoc(lexer), LexerDiagnostics::invalidDigitForBase, buffer, @@ -855,7 +899,7 @@ static void _lexStringLiteralBody(Lexer* lexer, char quote, bool singleChar) { // Empty char literal - size must be exactly 1. if (auto sink = lexer->getDiagnosticSink()) { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::illegalCharacterLiteral); + diagnose(sink, _getSourceLoc(lexer), LexerDiagnostics::illegalCharacterLiteral); } } _advance(lexer); @@ -868,7 +912,7 @@ static void _lexStringLiteralBody(Lexer* lexer, char quote, bool singleChar) { // Char literal about to have more than 1 char. 
if (auto sink = lexer->getDiagnosticSink()) { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::illegalCharacterLiteral); + diagnose(sink, _getSourceLoc(lexer), LexerDiagnostics::illegalCharacterLiteral); } } @@ -877,7 +921,7 @@ static void _lexStringLiteralBody(Lexer* lexer, char quote, bool singleChar) case kEOF: if (auto sink = lexer->getDiagnosticSink()) { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral); + diagnose(sink, _getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral); } return; @@ -885,7 +929,7 @@ static void _lexStringLiteralBody(Lexer* lexer, char quote, bool singleChar) case '\r': if (auto sink = lexer->getDiagnosticSink()) { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::newlineInLiteral); + diagnose(sink, _getSourceLoc(lexer), LexerDiagnostics::newlineInLiteral); } return; @@ -978,7 +1022,7 @@ static void _lexRawStringLiteralBody(Lexer* lexer) { if (auto sink = lexer->getDiagnosticSink()) { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::quoteCannotBeDelimiter); + diagnose(sink, _getSourceLoc(lexer), LexerDiagnostics::quoteCannotBeDelimiter); } } else @@ -1002,7 +1046,7 @@ static void _lexRawStringLiteralBody(Lexer* lexer) case kEOF: if (auto sink = lexer->getDiagnosticSink()) { - sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral); + diagnose(sink, _getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral); } return; default: @@ -1303,7 +1347,7 @@ static TokenType _lexTokenImpl(Lexer* lexer) case '9': if (auto sink = lexer->getDiagnosticSink()) { - sink->diagnose(loc, LexerDiagnostics::octalLiteral); + diagnose(sink, loc, LexerDiagnostics::octalLiteral); } return _lexNumber(lexer, 8); } @@ -1658,16 +1702,17 @@ static TokenType _lexTokenImpl(Lexer* lexer) if (c >= 0x20 && c <= 0x7E) { char buffer[] = {(char)c, 0}; - sink->diagnose(loc, LexerDiagnostics::illegalCharacterPrint, buffer); + diagnose(sink, loc, LexerDiagnostics::illegalCharacterPrint, buffer); } else if 
(c == kEOF) { - sink->diagnose(loc, LexerDiagnostics::unexpectedEndOfInput); + diagnose(sink, loc, LexerDiagnostics::unexpectedEndOfInput); } else { // Fallback: print as hexadecimal - sink->diagnose( + diagnose( + sink, loc, LexerDiagnostics::illegalCharacterHex, String((unsigned char)c, 16)); |
