Make source location lightweight

Fixes #24 So far the code has used a representation for source locations that is heavy-weight, but typical of research or hobby compilers: a `struct` type containing a line number and a (heap-allocated) string. This is actually very convenient for debugging, but it means that any data structure that might contain a source location needs careful memory management (because of those strings) and has a tendency to bloat. The new represnetation is that a source location is just a pointer-sized integer. In the simplest mental model, you can think of this as just counting every byte of source text that is passed in, and using those to name locations. Finding the path and line number that corresponds to a location involves a lookup step, but we can arrange to store all the files in an array sorted by their start locations, and do a binary search. Finding line numbers inside a file is similarly fast (one you pay a one-time cost to build an array of starting offsets for lines). More advanced compilers like clang actually go further and create a unique range of source locations to represent a file each time it gets included, so that they can track the include stack and reproduce it in diagnostic messages. I'm not doing anything that clever here.
author: Tim Foley <tfoley@nvidia.com> 2017-08-09 12:57:37 -0700
committer: Tim Foley <tfoley@nvidia.com> 2017-08-10 13:05:04 -0700
commit: a5a436c4783fb75a0d089a6483219c06db91f593 (patch)
tree: 224c16ad374c5ed533a497beeb75753e7ce2d771 /source/slang/lexer.cpp
parent: 6e4830f4d74adef0a47c6503d84dc114240fafa3 (diff)
1 files changed, 53 insertions, 28 deletions
diff --git a/source/slang/lexer.cpp b/source/slang/lexer.cpp
index 84b34c9a9..fe01878d1 100644
--- a/source/slang/lexer.cpp
+++ b/source/slang/lexer.cpp
@@ -1,4 +1,12 @@
-#include "Lexer.h"
+// lexer.cpp
+#include "lexer.h"
+
+// This file implements the lexer/scanner, which is responsible for taking a raw stream of
+// input bytes and turning it into semantically useful tokens.
+//
+
+#include "compiler.h"
+#include "source-loc.h"
 
 #include <assert.h>
 
@@ -6,7 +14,7 @@ namespace Slang
 {
     static Token GetEndOfFileToken()
     {
-        return Token(TokenType::EndOfFile, "", 0, 0, 0, "");
+        return Token(TokenType::EndOfFile, "", SourceLoc());
     }
 
     Token* TokenList::begin() const
@@ -52,10 +60,10 @@ namespace Slang
         return mCursor->type;
     }
 
-    CodePosition TokenReader::PeekLoc() const
+    SourceLoc TokenReader::PeekLoc() const
     {
         if (!mCursor)
-            return CodePosition();
+            return SourceLoc();
         SLANG_ASSERT(mCursor);
         return mCursor->Position;
     }
@@ -75,18 +83,22 @@ namespace Slang
 
     // Lexer
 
-    Lexer::Lexer(
-        String const&   path,
-        String const&   content,
-        DiagnosticSink* sink)
-        : path(path)
-        , content(content)
-        , sink(sink)
+    void Lexer::initialize(
+        SourceFile*     inSourceFile,
+        DiagnosticSink* inSink)
     {
+        sourceFile = inSourceFile;
+        sink = inSink;
+
+        auto content = inSourceFile->content;
+
+        begin   = content.begin();
         cursor  = content.begin();
         end     = content.end();
 
-        loc = CodePosition(1, 1, 0, path);
+        spellingStartLoc = inSourceFile->sourceRange.begin;
+        presumedStartLoc = spellingStartLoc;
+
         tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
         lexerFlags = 0;
     }
@@ -142,9 +154,6 @@ namespace Slang
         {
             advanceRaw(lexer);
         }
-
-        lexer->loc.Line++;
-        lexer->loc.Col = 1;
     }
 
     // Look ahead one code point, dealing with complications like
@@ -224,12 +233,7 @@ namespace Slang
 
             // TODO: Need to handle non-ASCII code points.
 
-            // Default case is to advance by one location
-            // and return the raw byte we saw.
-
-            lexer->loc.Col++;
-            lexer->loc.Pos++;
-
+            // Default case is to return the raw byte we saw.
             return c;
         }
     }
@@ -323,6 +327,27 @@ namespace Slang
         }
     }
 
+    static SourceLoc getSourceLoc(Lexer* lexer)
+    {
+        return lexer->presumedStartLoc + (lexer->cursor - lexer->begin);
+    }
+
+    // Begin overriding the reported locations of tokens,
+    // based on a `#line` directives
+    void Lexer::startOverridingSourceLocations(
+        SourceLoc loc)
+    {
+        if(loc.isValid())
+        {
+            presumedStartLoc = loc;
+        }
+    }
+
+    void Lexer::stopOverridingSourceLocations()
+    {
+        presumedStartLoc = spellingStartLoc;
+    }
+
     static void lexDigits(Lexer* lexer, int base)
     {
         for(;;)
@@ -355,7 +380,7 @@ namespace Slang
             if(digitVal >= base)
             {
                 char buffer[] = { (char) c, 0 };
-                lexer->sink->diagnose(lexer->loc, Diagnostics::invalidDigitForBase, buffer, base);
+                lexer->sink->diagnose(getSourceLoc(lexer), Diagnostics::invalidDigitForBase, buffer, base);
             }
 
             advance(lexer);
@@ -695,11 +720,11 @@ namespace Slang
             switch(c)
             {
             case kEOF:
-                lexer->sink->diagnose(lexer->loc, Diagnostics::endOfFileInLiteral);
+                lexer->sink->diagnose(getSourceLoc(lexer), Diagnostics::endOfFileInLiteral);
                 return;
 
             case '\n': case '\r':
-                lexer->sink->diagnose(lexer->loc, Diagnostics::newlineInLiteral);
+                lexer->sink->diagnose(getSourceLoc(lexer), Diagnostics::newlineInLiteral);
                 return;
 
             case '\\':
@@ -952,7 +977,7 @@ namespace Slang
 
         case '0':
             {
-                auto loc = lexer->loc;
+                auto loc = getSourceLoc(lexer);
                 advance(lexer);
                 switch(peek(lexer))
                 {
@@ -1170,7 +1195,7 @@ namespace Slang
             // If none of the above cases matched, then we have an
             // unexpected/invalid character.
 
-            auto loc = lexer->loc;
+            auto loc = getSourceLoc(lexer);
             auto sink = lexer->sink;
             int c = advance(lexer);
             if(c >= 0x20 && c <=  0x7E)
@@ -1194,7 +1219,7 @@ namespace Slang
         for(;;)
         {
             Token token;
-            token.Position = loc;
+            token.Position = getSourceLoc(this);
 
             char const* textBegin = cursor;
 
@@ -1314,7 +1339,7 @@ namespace Slang
     TokenList Lexer::Parse(const String & fileName, const String & str, DiagnosticSink * sink)
     {
         TokenList tokenList;
-        tokenList.mTokens = TokenizeText(fileName, str, [&](TokenizeErrorType errType, CodePosition pos)
+        tokenList.mTokens = TokenizeText(fileName, str, [&](TokenizeErrorType errType, SourceLoc pos)
         {
             auto curChar = str[pos.Pos];
             switch (errType)
author	Tim Foley <tfoley@nvidia.com>	2017-08-09 12:57:37 -0700
committer	Tim Foley <tfoley@nvidia.com>	2017-08-10 13:05:04 -0700
commit	a5a436c4783fb75a0d089a6483219c06db91f593 (patch)
tree	224c16ad374c5ed533a497beeb75753e7ce2d771 /source/slang/lexer.cpp
parent	6e4830f4d74adef0a47c6503d84dc114240fafa3 (diff)