From b53041b187818186be862b411469f24919908f07 Mon Sep 17 00:00:00 2001 From: Tim Foley Date: Mon, 12 Jun 2017 14:20:50 -0700 Subject: Preprocessor: fix bug with multi-argument macros. There was a subtle bug when a function-like macro with multiple arguments expands to use the arguments one after the other: #define FOO(a,b) a b FOO(int, x); During expansion, the input streams look something like (using `.` to represent the cursor): // macro invocation: FOO(int, x) . ; // macro expansion of `FOO(int, x)` a . b // macro argument `a` int . That is, we are at the end of the first argument's tokens. When "peeking" the next token, we correctly work up the list of active streams until we find one that isn't at its end, and that gives us the token `b`. But then we need to look up `b` in an appropriate environment to find what it is bound to. Each of the streams above has an environment associated with it, and in particular, `b` is only defined in the middle environment, because that is where the macro arguments were registered. The simple fix here is to make the lookup logic for finding an environment follow the same logic as finding the next token. A more complete fix down the line could involve getting rid of the approach of allowing an input stream to be "active" but at its end. I believe this was originally required to handle some error cases in directives, where we'd want to keep the input stream for one file active until we are done parsing a full directive from it (e.g., if a directive is on the last line of the file). Now that we generate an explicit "end of directive" token, that may not be required. 
--- source/slang/preprocessor.cpp | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'source') diff --git a/source/slang/preprocessor.cpp b/source/slang/preprocessor.cpp index cdde2591d..60329c275 100644 --- a/source/slang/preprocessor.cpp +++ b/source/slang/preprocessor.cpp @@ -450,8 +450,35 @@ static PreprocessorMacro* LookupMacro(PreprocessorEnvironment* environment, Stri static PreprocessorEnvironment* GetCurrentEnvironment(Preprocessor* preprocessor) { + // The environment we will use for looking up a macro is associated + // with the current input stream (because it may include entries + // for macro arguments). + // + // We need to be careful, though, when we are at the end of an + // input stream (e.g., representing one argument), so that we + // don't use its environment. + PreprocessorInputStream* inputStream = preprocessor->inputStream; - return inputStream ? inputStream->environment : &preprocessor->globalEnv; + + for(;;) + { + // If there is no input stream that isn't at its end, + // then fall back to the global environment. + if (!inputStream) + return &preprocessor->globalEnv; + + // If the current input stream is at its end, then + // fall back to its parent stream. + if (inputStream->tokenReader.PeekTokenType() == TokenType::EndOfFile) + { + inputStream = inputStream->parent; + continue; + } + + // If we've found an active stream that isn't at its end, + // then use that for lookup. 
+ return inputStream->environment; + } } static PreprocessorMacro* LookupMacro(Preprocessor* preprocessor, String const& name) -- cgit v1.2.3 From e95b6312b6cecd6073f801ad1c9a7ab11d50dfc3 Mon Sep 17 00:00:00 2001 From: Tim Foley Date: Mon, 12 Jun 2017 14:28:00 -0700 Subject: Lexer: handle escaped newlines This is mostly to allow for the idiomatic style of defining a multi-line macro in C: #define FOO(a,b) \ x(a) \ y(b) \ /* end */ The handling is reasonably general: in the lexer whenever we need to consume or "peek" the next code point, we check if we are at the start of a backslash-newline sequence, and if so we skip past that to find what we were looking for. However, the way I'm handling things right now there is no step taken to "clean" a token and remove the backslash or newline from its value, so downstream code that actually inspects token values will probably break if users start putting escaped newlines in the middle of names. We can fix that issue when (if) it comes up. --- slang.sln | 2 - source/slang/lexer.cpp | 116 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 107 insertions(+), 11 deletions(-) (limited to 'source') diff --git a/slang.sln b/slang.sln index ba96d6014..502bd60a1 100644 --- a/slang.sln +++ b/slang.sln @@ -8,8 +8,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "hello", "examples\hello\hel EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "core", "source\core\core.vcxproj", "{F9BE7957-8399-899E-0C49-E714FDDD4B65}" EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "utils", "utils", "{37016FF6-E6AF-4316-BC2B-0152FC0C969E}" -EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{74C5F0DC-93BB-4BF3-AC65-8C65491570F7}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "slang", "source\slang\slang.vcxproj", "{DB00DA62-0533-4AFD-B59F-A67D5B3A0808}" diff --git a/source/slang/lexer.cpp b/source/slang/lexer.cpp index 7234c4983..5127c876c 100644 --- 
a/source/slang/lexer.cpp +++ b/source/slang/lexer.cpp @@ -99,40 +99,138 @@ namespace Slang enum { kEOF = -1 }; - static int peek(Lexer* lexer) + // Get the next input byte, without any handling of + // escaped newlines, non-ASCII code points, source locations, etc. + static int peekRaw(Lexer* lexer) { + // If we are at the end of the input, return a designated end-of-file value if(lexer->cursor == lexer->end) return kEOF; + // Otherwise, just look at the next byte return *lexer->cursor; } - static int advance(Lexer* lexer) + // Read one input byte without any special handling (similar to `peekRaw`) + static int advanceRaw(Lexer* lexer) { - if(lexer->cursor == lexer->end) - return kEOF; + // The logic here is basically the same as for `peekRaw()`, + // except we advance `cursor` if we aren't at the end. - lexer->loc.Col++; - lexer->loc.Pos++; + if (lexer->cursor == lexer->end) + return kEOF; return *lexer->cursor++; } + // When the cursor is already at the first byte of an end-of-line sequence, + // consume one or two bytes that compose the sequence. + // + // Basically, a newline is one of: + // + // "\n" + // "\r" + // "\r\n" + // "\n\r" + // + // We always look for the longest match possible. + // static void handleNewLine(Lexer* lexer) { - int c = advance(lexer); + int c = advanceRaw(lexer); assert(c == '\n' || c == '\r'); - int d = peek(lexer); + int d = peekRaw(lexer); if( (c ^ d) == ('\n' ^ '\r') ) { - advance(lexer); + advanceRaw(lexer); } lexer->loc.Line++; lexer->loc.Col = 1; } + // Look ahead one code point, dealing with complications like + // escaped newlines. + static int peek(Lexer* lexer) + { + // Look at the next raw byte, and decide what to do + int c = peekRaw(lexer); + + if(c == '\\') + { + // We might have a backslash-escaped newline. + // Look at the next byte (if any) to see. + // + // Note(tfoley): We are assuming a null-terminated input here, + // so that we can safely look at the next byte without issue. 
+ int d = lexer->cursor[1]; + switch (d) + { + case '\r': case '\n': + // The newline was escaped, so return the character after *that* + return lexer->cursor[2]; + + default: + break; + } + } + // TODO: handle UTF-8 encoding for non-ASCII code points here + + // Default case is to just hand along the byte we read as an ASCII code point. + return c; + } + + // Get the next code point from the input, and advance the cursor. + static int advance(Lexer* lexer) + { + // We are going to loop, but only as a way of handling + // escaped line endings. + for (;;) + { + // If we are at the end of the input, then the task is easy. + if (lexer->cursor == lexer->end) + return kEOF; + + // Look at the next raw byte, and decide what to do + int c = *lexer->cursor++; + + if (c == '\\') + { + // We might have a backslash-escaped newline. + // Look at the next byte (if any) to see. + // + // Note(tfoley): We are assuming a null-terminated input here, + // so that we can safely look at the next byte without issue. + int d = *lexer->cursor; + switch (d) + { + case '\r': case '\n': + // handle the end-of-line for our source location tracking + handleNewLine(lexer); + + // Now try again, looking at the character after the + // escaped newline. + continue; + + default: + break; + } + } + + // TODO: Need to handle non-ASCII code points. + + // Default case is to advance by one location + // and return the raw byte we saw. + + lexer->loc.Col++; + lexer->loc.Pos++; + + return c; + } + } + + static void lexLineComment(Lexer* lexer) { for(;;) -- cgit v1.2.3