summaryrefslogtreecommitdiffstats
path: root/source/slang/lexer.cpp
diff options
context:
space:
mode:
authorTim Foley <tfoley@nvidia.com>2017-08-09 12:57:37 -0700
committerTim Foley <tfoley@nvidia.com>2017-08-10 13:05:04 -0700
commita5a436c4783fb75a0d089a6483219c06db91f593 (patch)
tree224c16ad374c5ed533a497beeb75753e7ce2d771 /source/slang/lexer.cpp
parent6e4830f4d74adef0a47c6503d84dc114240fafa3 (diff)
Make source location lightweight
Fixes #24. So far the code has used a representation for source locations that is heavy-weight, but typical of research or hobby compilers: a `struct` type containing a line number and a (heap-allocated) string. This is actually very convenient for debugging, but it means that any data structure that might contain a source location needs careful memory management (because of those strings) and has a tendency to bloat. The new representation is that a source location is just a pointer-sized integer. In the simplest mental model, you can think of this as just counting every byte of source text that is passed in, and using those to name locations. Finding the path and line number that correspond to a location involves a lookup step, but we can arrange to store all the files in an array sorted by their start locations, and do a binary search. Finding line numbers inside a file is similarly fast (once you pay a one-time cost to build an array of starting offsets for lines). More advanced compilers like clang actually go further and create a unique range of source locations to represent a file each time it gets included, so that they can track the include stack and reproduce it in diagnostic messages. I'm not doing anything that clever here.
Diffstat (limited to 'source/slang/lexer.cpp')
-rw-r--r--source/slang/lexer.cpp81
1 files changed, 53 insertions, 28 deletions
diff --git a/source/slang/lexer.cpp b/source/slang/lexer.cpp
index 84b34c9a9..fe01878d1 100644
--- a/source/slang/lexer.cpp
+++ b/source/slang/lexer.cpp
@@ -1,4 +1,12 @@
-#include "Lexer.h"
+// lexer.cpp
+#include "lexer.h"
+
+// This file implements the lexer/scanner, which is responsible for taking a raw stream of
+// input bytes and turning it into semantically useful tokens.
+//
+
+#include "compiler.h"
+#include "source-loc.h"
#include <assert.h>
@@ -6,7 +14,7 @@ namespace Slang
{
static Token GetEndOfFileToken()
{
- return Token(TokenType::EndOfFile, "", 0, 0, 0, "");
+ return Token(TokenType::EndOfFile, "", SourceLoc());
}
Token* TokenList::begin() const
@@ -52,10 +60,10 @@ namespace Slang
return mCursor->type;
}
- CodePosition TokenReader::PeekLoc() const
+ SourceLoc TokenReader::PeekLoc() const
{
if (!mCursor)
- return CodePosition();
+ return SourceLoc();
SLANG_ASSERT(mCursor);
return mCursor->Position;
}
@@ -75,18 +83,22 @@ namespace Slang
// Lexer
- Lexer::Lexer(
- String const& path,
- String const& content,
- DiagnosticSink* sink)
- : path(path)
- , content(content)
- , sink(sink)
+ void Lexer::initialize(
+ SourceFile* inSourceFile,
+ DiagnosticSink* inSink)
{
+ sourceFile = inSourceFile;
+ sink = inSink;
+
+ auto content = inSourceFile->content;
+
+ begin = content.begin();
cursor = content.begin();
end = content.end();
- loc = CodePosition(1, 1, 0, path);
+ spellingStartLoc = inSourceFile->sourceRange.begin;
+ presumedStartLoc = spellingStartLoc;
+
tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
lexerFlags = 0;
}
@@ -142,9 +154,6 @@ namespace Slang
{
advanceRaw(lexer);
}
-
- lexer->loc.Line++;
- lexer->loc.Col = 1;
}
// Look ahead one code point, dealing with complications like
@@ -224,12 +233,7 @@ namespace Slang
// TODO: Need to handle non-ASCII code points.
- // Default case is to advance by one location
- // and return the raw byte we saw.
-
- lexer->loc.Col++;
- lexer->loc.Pos++;
-
+ // Default case is to return the raw byte we saw.
return c;
}
}
@@ -323,6 +327,27 @@ namespace Slang
}
}
+ static SourceLoc getSourceLoc(Lexer* lexer)
+ {
+ return lexer->presumedStartLoc + (lexer->cursor - lexer->begin);
+ }
+
+ // Begin overriding the reported locations of tokens,
+ // based on a `#line` directive
+ void Lexer::startOverridingSourceLocations(
+ SourceLoc loc)
+ {
+ if(loc.isValid())
+ {
+ presumedStartLoc = loc;
+ }
+ }
+
+ void Lexer::stopOverridingSourceLocations()
+ {
+ presumedStartLoc = spellingStartLoc;
+ }
+
static void lexDigits(Lexer* lexer, int base)
{
for(;;)
@@ -355,7 +380,7 @@ namespace Slang
if(digitVal >= base)
{
char buffer[] = { (char) c, 0 };
- lexer->sink->diagnose(lexer->loc, Diagnostics::invalidDigitForBase, buffer, base);
+ lexer->sink->diagnose(getSourceLoc(lexer), Diagnostics::invalidDigitForBase, buffer, base);
}
advance(lexer);
@@ -695,11 +720,11 @@ namespace Slang
switch(c)
{
case kEOF:
- lexer->sink->diagnose(lexer->loc, Diagnostics::endOfFileInLiteral);
+ lexer->sink->diagnose(getSourceLoc(lexer), Diagnostics::endOfFileInLiteral);
return;
case '\n': case '\r':
- lexer->sink->diagnose(lexer->loc, Diagnostics::newlineInLiteral);
+ lexer->sink->diagnose(getSourceLoc(lexer), Diagnostics::newlineInLiteral);
return;
case '\\':
@@ -952,7 +977,7 @@ namespace Slang
case '0':
{
- auto loc = lexer->loc;
+ auto loc = getSourceLoc(lexer);
advance(lexer);
switch(peek(lexer))
{
@@ -1170,7 +1195,7 @@ namespace Slang
// If none of the above cases matched, then we have an
// unexpected/invalid character.
- auto loc = lexer->loc;
+ auto loc = getSourceLoc(lexer);
auto sink = lexer->sink;
int c = advance(lexer);
if(c >= 0x20 && c <= 0x7E)
@@ -1194,7 +1219,7 @@ namespace Slang
for(;;)
{
Token token;
- token.Position = loc;
+ token.Position = getSourceLoc(this);
char const* textBegin = cursor;
@@ -1314,7 +1339,7 @@ namespace Slang
TokenList Lexer::Parse(const String & fileName, const String & str, DiagnosticSink * sink)
{
TokenList tokenList;
- tokenList.mTokens = TokenizeText(fileName, str, [&](TokenizeErrorType errType, CodePosition pos)
+ tokenList.mTokens = TokenizeText(fileName, str, [&](TokenizeErrorType errType, SourceLoc pos)
{
auto curChar = str[pos.Pos];
switch (errType)