JSON Lexing and string encoding/decoding (#1858)

* #include an absolute path didn't work - because paths were taken to always be relative. * WIP Json lexer. * Check JSON Lex with unit test * Add JSON escaping/unescaping of strings. * Big fix encoding/decoding. * Fix typo in JSON diagnostics. * Fix typo. * Better float testing.
author: jsmall-nvidia <jsmall@nvidia.com> 2021-05-25 20:58:43 -0400
committer: GitHub <noreply@github.com> 2021-05-25 20:58:43 -0400
commit: 7d1b8ac13faf80ed56b37243480d097059da5aab (patch)
tree: 6613b13983083d16b8945c6d92b1f4f1d1fb2501 /source/compiler-core
parent: 89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff)
5 files changed, 570 insertions, 0 deletions
diff --git a/source/compiler-core/slang-json-diagnostic-defs.h b/source/compiler-core/slang-json-diagnostic-defs.h
new file mode 100644
index 000000000..a4b260857
--- /dev/null
+++ b/source/compiler-core/slang-json-diagnostic-defs.h
@@ -0,0 +1,37 @@
+//
+
+// The file is meant to be included multiple times, to produce different
+// pieces of declaration/definition code related to diagnostic messages
+//
+// Each diagnostic is declared here with:
+//
+//     DIAGNOSTIC(id, severity, name, messageFormat)
+//
+// Where `id` is the unique diagnostic ID, `severity` is the default
+// severity (from the `Severity` enum), `name` is a name used to refer
+// to this diagnostic from code, and `messageFormat` is the default
+// (non-localized) message for the diagnostic, with placeholders
+// for any arguments.
+
+#ifndef DIAGNOSTIC
+#error Need to #define DIAGNOSTIC(...) before including 
+#define DIAGNOSTIC(id, severity, name, messageFormat) /* */
+#endif
+
+//
+// -1 - Notes that decorate another diagnostic.
+//
+
+//
+// 2xxxx - JSON Lexical analysis
+//
+
+// Diagnostics reported whilst lexing JSON text. `$0` in a message is a placeholder for the
+// first argument supplied when the diagnostic is reported.
+DIAGNOSTIC(20000, Error, unexpectedCharacter, "unexpected character '$0'")
+DIAGNOSTIC(20001, Error, endOfFileInLiteral, "end of file in literal")
+// NOTE(review): newlineInLiteral is not referenced by slang-json-lexer.cpp in this change.
+DIAGNOSTIC(20002, Error, newlineInLiteral, "newline in literal")
+DIAGNOSTIC(20003, Error, endOfFileInComment, "end of file in comment")
+DIAGNOSTIC(20004, Error, expectingAHexDigit, "expecting a hex digit")
+DIAGNOSTIC(20005, Error, expectingADigit, "expecting a digit")
+DIAGNOSTIC(20006, Error, expectingValueName, "expecting value name [null, true, false]")
+
+// DIAGNOSTIC is undefined here, so includers do not have to (although they may) #undef it
+// themselves after including this file.
+#undef DIAGNOSTIC
diff --git a/source/compiler-core/slang-json-diagnostics.cpp b/source/compiler-core/slang-json-diagnostics.cpp
new file mode 100644
index 000000000..1d35e8faf
--- /dev/null
+++ b/source/compiler-core/slang-json-diagnostics.cpp
@@ -0,0 +1,33 @@
+// slang-json-diagnostics.cpp
+#include "slang-json-diagnostics.h"
+
+namespace Slang {
+
+// Defines one `const DiagnosticInfo` for every entry in slang-json-diagnostic-defs.h.
+// Each is initialized from the entry's id, severity, name (stringified) and message format.
+namespace JSONDiagnostics
+{
+#define DIAGNOSTIC(id, severity, name, messageFormat) const DiagnosticInfo name = { id, Severity::severity, #name, messageFormat };
+#include "slang-json-diagnostic-defs.h"
+#undef DIAGNOSTIC
+}
+
+// Table holding the address of every JSON diagnostic defined above. It is built by
+// expanding the same definitions file, such that it holds one pointer per entry.
+static const DiagnosticInfo* const kJSONDiagnostics[] =
+{
+#define DIAGNOSTIC(id, severity, name, messageFormat) &JSONDiagnostics::name, 
+#include "slang-json-diagnostic-defs.h"
+#undef DIAGNOSTIC
+};
+
+// Allocates a new DiagnosticsLookup and adds every entry of kJSONDiagnostics to it.
+// The caller takes responsibility for the returned object.
+static DiagnosticsLookup* _newJSONDiagnosticsLookup()
+{
+    DiagnosticsLookup* const jsonLookup = new DiagnosticsLookup;
+
+    const auto diagnosticCount = SLANG_COUNT_OF(kJSONDiagnostics);
+    jsonLookup->add(kJSONDiagnostics, diagnosticCount);
+
+    return jsonLookup;
+}
+
+// Returns the lookup holding the JSON diagnostics.
+// The lookup is created the first time this function is called, and is kept in a function
+// local static, such that the same object is returned by every call.
+DiagnosticsLookup* getJSONDiagnosticsLookup()
+{
+    static RefPtr<DiagnosticsLookup> s_jsonLookup(_newJSONDiagnosticsLookup());
+    return s_jsonLookup;
+}
+
+} // namespace Slang
diff --git a/source/compiler-core/slang-json-diagnostics.h b/source/compiler-core/slang-json-diagnostics.h
new file mode 100644
index 000000000..88ec0c550
--- /dev/null
+++ b/source/compiler-core/slang-json-diagnostics.h
@@ -0,0 +1,26 @@
+#ifndef SLANG_JSON_DIAGNOSTICS_H
+#define SLANG_JSON_DIAGNOSTICS_H
+
+#include "../core/slang-basic.h"
+#include "../core/slang-writer.h"
+
+#include "slang-source-loc.h"
+#include "slang-diagnostic-sink.h"
+#include "slang-token.h"
+
+#include "../../slang.h"
+
+namespace Slang
+{
+
+    /// Get the lookup that holds the JSON diagnostics
+DiagnosticsLookup* getJSONDiagnosticsLookup();
+
+// Declares (as extern) a `DiagnosticInfo` for every entry in slang-json-diagnostic-defs.h.
+// There is no #undef of DIAGNOSTIC after the include, because the definitions file
+// undefines the macro itself.
+namespace JSONDiagnostics
+{
+#define DIAGNOSTIC(id, severity, name, messageFormat) extern const DiagnosticInfo name;
+#include "slang-json-diagnostic-defs.h"
+}
+
+}
+
+#endif
diff --git a/source/compiler-core/slang-json-lexer.cpp b/source/compiler-core/slang-json-lexer.cpp
new file mode 100644
index 000000000..19a5b29a7
--- /dev/null
+++ b/source/compiler-core/slang-json-lexer.cpp
@@ -0,0 +1,385 @@
+// slang-json-lexer.cpp
+#include "slang-json-lexer.h"
+
+#include "slang-json-diagnostics.h"
+#include "../core/slang-char-util.h"
+
+/*
+https://www.json.org/json-en.html
+*/
+
+namespace Slang {
+
+/// Set up the lexer to read from `sourceView`, reporting problems to `sink`.
+/// The first token is lexed as part of initialization, so it can be peeked straight away.
+/// The source file of the view must have content. Always returns SLANG_OK.
+SlangResult JSONLexer::init(SourceView* sourceView, DiagnosticSink* sink) 
+{
+    m_sink = sink;
+    m_sourceView = sourceView;
+
+    SourceFile* const file = sourceView->getSourceFile();
+
+    // The lexer detects the end of the input via a terminating 0, so there must be content
+    SLANG_ASSERT(file && file->hasContent());
+
+    // Lexing starts at the very beginning of the content
+    const char* const contentStart = file->getContent().begin();
+    m_contentStart = contentStart;
+    m_lexemeStart = contentStart;
+    m_cursor = contentStart;
+
+    // Locations are calculated as offsets from the start of the view's range
+    m_startLoc = sourceView->getRange().begin;
+
+    // Prime the first token
+    advance();
+
+    return SLANG_OK;
+}
+
+// Given `c`, an end of line character that has just been consumed, and `cursor` which
+// points to the character that follows it, returns the position after the line ending.
+// If the following character is the *other* end of line character, the pair is taken to
+// be a single line ending and both are skipped.
+SLANG_FORCE_INLINE static const char* _handleEndOfLine(char c, const char* cursor)
+{
+    SLANG_ASSERT(c == '\n' || c == '\r');
+
+    // With c being one of the two end of line characters, the xor can only equal
+    // '\n' ^ '\r' if the following character is the other one.
+    const bool isTwoCharEnding = (c ^ *cursor) == ('\n' ^ '\r');
+    return isTwoCharEnding ? cursor + 1 : cursor;
+}
+
+// Makes the current token Invalid. The token ends where the lexeme starts, so it has zero
+// length, and the cursor is placed back at the start of the lexeme.
+JSONTokenType JSONLexer::_setInvalidToken()
+{
+    const char* const tokenEnd = m_lexemeStart;
+    return _setToken(JSONTokenType::Invalid, tokenEnd);
+}
+
+/// Lexes the next token, and makes it the current token (as returned by peekToken/peekType).
+///
+/// Whitespace, `//` line comments and `/* */` block comments are skipped. If there is a
+/// lexical error a diagnostic is sent to the sink, and the current token is set to
+/// JSONTokenType::Invalid.
+///
+/// @return The type of the new current token
+JSONTokenType JSONLexer::advance()
+{
+    const char* cursor = m_cursor;
+
+    while (true)
+    {
+        // Every iteration starts a new (potential) token
+        m_lexemeStart = cursor;
+
+        const char c = *cursor++;
+
+        switch (c)
+        {
+            // The terminator is not consumed, so advancing again produces EndOfFile again
+            case 0:     return _setToken(JSONTokenType::EndOfFile, cursor - 1);
+            case '"':
+            {
+                cursor = _lexString(cursor);
+                if (cursor == nullptr)
+                {
+                    return _setInvalidToken();
+                }
+                return _setToken(JSONTokenType::StringLiteral, cursor);
+            }
+            case ' ':
+            case '\t':
+            case '\n':
+            case '\r':
+            {
+                cursor = _lexWhitespace(cursor);
+                break;
+            }
+            case ':':           return _setToken(JSONTokenType::Colon, cursor);
+            case ',':           return _setToken(JSONTokenType::Comma, cursor);
+            case '[':           return _setToken(JSONTokenType::LBracket, cursor);
+            case ']':           return _setToken(JSONTokenType::RBracket, cursor);
+            case '{':           return _setToken(JSONTokenType::LBrace, cursor);
+            case '}':           return _setToken(JSONTokenType::RBrace, cursor);
+
+            case '-':
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                // The number lexer wants to see the first character (it may be a '-')
+                LexResult res = _lexNumber(cursor - 1);
+                if (res.cursor == nullptr)
+                {
+                    return _setInvalidToken();
+                }
+                return _setToken(res.type, res.cursor);
+            }
+            case 't':
+            {
+                // The && short circuits, so nothing past a terminating 0 is read
+                if (cursor[0] == 'r' && cursor[1] == 'u' && cursor[2] == 'e')
+                {
+                    return _setToken(JSONTokenType::True, cursor + 3);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+            case 'f':
+            {
+                if (cursor[0] == 'a' && cursor[1] == 'l' && cursor[2] == 's' && cursor[3] == 'e')
+                {
+                    return _setToken(JSONTokenType::False, cursor + 4);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+            case 'n':
+            {
+                if (cursor[0] == 'u' && cursor[1] == 'l' && cursor[2] == 'l')
+                {
+                    return _setToken(JSONTokenType::Null, cursor + 3);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+            case '/':
+            {
+                // We allow comments.
+                // NOTE! Must look at the local cursor - m_cursor is only updated when a token
+                // is set, so it doesn't take into account anything skipped so far.
+                const char nextChar = *cursor;
+
+                if (nextChar == '/')
+                {
+                    // Line comment
+                    cursor = _lexLineComment(cursor + 1);
+                    break;
+                }
+                if (nextChar == '*')
+                {
+                    // Skip the opening '*', otherwise "/*/" is taken to be a whole comment
+                    cursor = _lexBlockComment(cursor + 1);
+                    // Can fail...
+                    if (cursor == nullptr)
+                    {
+                        return _setInvalidToken();
+                    }
+                    break;
+                }
+            }
+            // A '/' that doesn't start a comment is not valid, so it intentionally falls
+            // through to be reported as an unexpected character.
+            default:
+            {
+                // Build the text used to display the character in the diagnostic
+                StringBuilder buf;
+
+                // Go via uint8_t so the value is the same whether or not char is signed
+                const uint32_t value = uint8_t(c);
+
+                if (value < 0x20 || value >= 0x7f)
+                {
+                    // Not printable ASCII, so display as hex
+                    static const char s_hex[] = "0123456789abcdef";
+
+                    char hexBuf[5] = "0x";
+
+                    hexBuf[2] = s_hex[((value >> 4) & 0xf)];
+                    hexBuf[3] = s_hex[(value & 0xf)];
+                    hexBuf[4] = 0;
+
+                    buf << hexBuf;
+                }
+                else
+                {
+                    buf << c;
+                }
+
+                // The message has a $0 placeholder for the character text
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::unexpectedCharacter, buf);
+                return _setInvalidToken();
+            }
+        }
+    }
+}
+
+/// Lexes a number, which has the form
+///
+///     [-] (0 | [1-9][0-9]*) [. [0-9]+] [(e|E) [+|-] [0-9]+]
+///
+/// @param cursor Points to the first character of the number (which may be the '-')
+/// @return The token type (FloatLiteral if there is a fraction and/or an exponent, otherwise
+/// IntegerLiteral) and the position after the number. If the number is badly formed a
+/// diagnostic is emitted, and the returned cursor is nullptr.
+JSONLexer::LexResult JSONLexer::_lexNumber(const char* cursor)
+{
+    JSONTokenType tokenType = JSONTokenType::IntegerLiteral;
+
+    if (*cursor == '-')
+    {
+        cursor++;
+    }
+
+    if (*cursor == '0')
+    {
+        // Can only be followed by . exponent, or nothing
+        cursor++;
+    }
+    else if (*cursor >= '1' && *cursor <= '9')
+    {
+        cursor++;
+        while (CharUtil::isDigit(*cursor))
+        {
+            cursor++;
+        }
+    }
+    else
+    {
+        // The integer part is not optional, so "-", "-.5" or "-e1" are not numbers
+        m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+        return LexResult{ JSONTokenType::Invalid, nullptr };
+    }
+
+    // There's a fraction
+    if (*cursor == '.')
+    {
+        tokenType = JSONTokenType::FloatLiteral;
+        // Skip the dot
+        cursor++;
+        // Must have at least one digit
+        if (!CharUtil::isDigit(*cursor))
+        {
+            m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+            return LexResult{ JSONTokenType::Invalid, nullptr };
+        }
+        // Skip the digit
+        cursor++;
+        // Skip any more digits
+        while (CharUtil::isDigit(*cursor)) cursor++;
+    }
+
+    // There's an exponent
+    if (*cursor == 'e' || *cursor == 'E')
+    {
+        tokenType = JSONTokenType::FloatLiteral;
+
+        // Skip the e/E
+        cursor++;
+
+        // Skip +/- if has one
+        if (*cursor == '+' || *cursor == '-')
+        {
+            cursor++;
+        }
+
+        // Must have at least one digit
+        if (!CharUtil::isDigit(*cursor))
+        {
+            m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+            return LexResult{ JSONTokenType::Invalid, nullptr };
+        }
+
+        // Skip the digit
+        cursor++;
+        // Skip any more digits
+        while (CharUtil::isDigit(*cursor)) cursor++;
+    }
+
+    return LexResult{tokenType, cursor};
+}
+
+/// Lexes the remainder of a string literal.
+///
+/// @param cursor Points to the character after the opening quote
+/// @return The position after the closing quote. Returns nullptr (having emitted a diagnostic)
+/// if the end of the input is hit before the closing quote, or if a \u escape isn't followed
+/// by 4 hex digits.
+const char* JSONLexer::_lexString(const char* cursor)
+{
+    // We've skipped the first "
+    while (true)
+    {
+        const char c = *cursor++;
+
+        switch (c)
+        {
+            case 0:
+            {
+                // cursor has moved past the terminator, so step back for the location
+                m_sink->diagnose(_getLoc(cursor - 1), JSONDiagnostics::endOfFileInLiteral);
+                return nullptr;
+            }
+            case '"':
+            {
+                return cursor;
+            }
+            case '\\':
+            {
+                const char nextC = *cursor;
+                switch (nextC)
+                {
+                    case '"':
+                    case '\\':
+                    case '/':
+                    case 'b':
+                    case 'f':
+                    case 'n':
+                    case 'r':
+                    case 't':
+                    {
+                        // Skip the escaped character, such that an escaped quote doesn't
+                        // end the string
+                        ++cursor;
+                        break;
+                    }
+                    case 'u':
+                    {
+                        // Skip the u
+                        cursor++;
+                        // Must be followed by 4 hex digits. They are checked in order, so
+                        // nothing past a terminating 0 is read.
+                        for (Index i = 0; i < 4; ++i)
+                        {
+                            if (!CharUtil::isHexDigit(cursor[i]))
+                            {
+                                // Report at the character which isn't a hex digit
+                                m_sink->diagnose(_getLoc(cursor + i), JSONDiagnostics::expectingAHexDigit);
+                                return nullptr;
+                            }
+                        }
+                        cursor += 4;
+                        break;
+                    }
+                    // Any other character following the backslash is not consumed here. It
+                    // is handled as the next character of the string by the enclosing loop
+                    // (which means a terminating 0 is reported as end of file).
+                    default: break;
+                }
+                break;
+            }
+            // Somewhat surprisingly it appears it's valid to have \r\n inside of quotes.
+            default: break;
+        }
+    }
+}
+
+/// Skips the rest of a line comment.
+///
+/// @param cursor A position within the comment
+/// @return The position after the line ending that terminates the comment, or the position
+/// of the terminating 0 if the end of the input is reached first.
+const char* JSONLexer::_lexLineComment(const char* cursor)
+{
+    while (true)
+    {
+        const char c = *cursor;
+
+        if (c == 0)
+        {
+            // Leave the terminator, so the caller sees the end of the input
+            return cursor;
+        }
+
+        ++cursor;
+
+        if (c == '\n' || c == '\r')
+        {
+            // Skip the whole of the line ending (it may be two characters)
+            return _handleEndOfLine(c, cursor);
+        }
+    }
+}
+
+/// Skips the rest of a block comment.
+///
+/// @param cursor A position within the comment
+/// @return The position after the closing */ . If the end of the input is reached before the
+/// comment is closed, a diagnostic is emitted and nullptr is returned.
+const char* JSONLexer::_lexBlockComment(const char* cursor)
+{
+    for (;;)
+    {
+        const char c = *cursor++;
+        switch (c)
+        {
+            case 0:
+            {
+                // cursor has moved past the terminator, so step back such that the location
+                // is within the content
+                m_sink->diagnose(_getLoc(cursor - 1), JSONDiagnostics::endOfFileInComment);
+                return nullptr;
+            }
+            case '*':
+            {
+                if (*cursor == '/')
+                {
+                    return cursor + 1;
+                }
+                break;
+            }
+            default: break;
+        }
+    }
+}
+
+/// Skips a run of whitespace, where whitespace is space, tab, \n or \r.
+///
+/// @param cursor The position to start skipping from
+/// @return The position of the first character which is not whitespace
+const char* JSONLexer::_lexWhitespace(const char* cursor)
+{
+    // Might want to use CharUtil::isWhitespace...
+    for (;; ++cursor)
+    {
+        const char c = *cursor;
+        const bool isWhitespace = (c == ' ') || (c == '\t') || (c == '\n') || (c == '\r');
+
+        if (!isWhitespace)
+        {
+            // Hit non white space (which includes a terminating 0)
+            return cursor;
+        }
+    }
+}
+
+} // namespace Slang
diff --git a/source/compiler-core/slang-json-lexer.h b/source/compiler-core/slang-json-lexer.h
new file mode 100644
index 000000000..03f16d445
--- /dev/null
+++ b/source/compiler-core/slang-json-lexer.h
@@ -0,0 +1,89 @@
+// slang-json-lexer.h
+#ifndef SLANG_JSON_LEXER_H
+#define SLANG_JSON_LEXER_H
+
+#include "../core/slang-basic.h"
+
+#include "slang-source-loc.h"
+#include "slang-diagnostic-sink.h"
+
+namespace Slang {
+
+/// The types of token produced by the JSONLexer
+enum class JSONTokenType
+{
+    Invalid,            ///< Set when lexing hit an error
+    IntegerLiteral,     ///< A number without a fraction or an exponent
+    FloatLiteral,       ///< A number with a fraction and/or an exponent
+    StringLiteral,      ///< A double quoted string (the token includes the quotes)
+    LBracket,           ///< [
+    RBracket,           ///< ]
+    LBrace,             ///< {
+    RBrace,             ///< }
+    Comma,              ///< ,
+    Colon,              ///< :
+    True,               ///< true
+    False,              ///< false
+    Null,               ///< null
+    EndOfFile,          ///< The terminating 0 of the source was reached
+    CountOf,            ///< Presumably the count of token types (not a token type itself) - TODO confirm
+};
+
+/// A token as produced by the JSONLexer. The text of the token is not stored, it is
+/// identified by its location (the start of the token) and its length.
+struct JSONToken
+{
+    JSONTokenType type;         ///< The token type 
+    SourceLoc loc;              ///< Location in the source file
+    uint32_t length;            ///< The length of the token in bytes
+};
+
+/// Lexes JSON text into JSONTokens.
+///
+/// The lexer must be initialized with `init`, which also lexes the first token. The current
+/// token can then be examined with peekToken/peekType, and `advance` moves to the next token.
+/// Before `init` is called the current token is Invalid.
+class JSONLexer
+{
+public:
+        /// Get the current token
+    JSONToken& peekToken() { return m_token; }
+        /// Get the type of the current token
+    JSONTokenType peekType() { return m_token.type; }
+
+        /// Lex the next token, making it the current token. Returns the type of that token.
+    JSONTokenType advance();
+
+        /// Initialize to lex the contents of sourceView. Any problems found when lexing are reported to sink.
+    SlangResult init(SourceView* sourceView, DiagnosticSink* sink);
+
+protected:
+        /// The result of lexing a number. The cursor is nullptr if the number was invalid.
+    struct LexResult
+    {
+        JSONTokenType type;
+        const char* cursor;
+    };
+
+        /// Get the location of the cursor
+    SLANG_FORCE_INLINE SourceLoc _getLoc(const char* cursor) const { return m_startLoc + (cursor - m_contentStart); }
+    const char* _lexLineComment(const char* cursor);
+    const char* _lexBlockComment(const char* cursor);
+    const char* _lexWhitespace(const char* cursor);
+    const char* _lexString(const char* cursor);
+    LexResult _lexNumber(const char* cursor);
+
+        /// Set the current token to have the specified type, and to span from the start of the
+        /// current lexeme up to (but not including) cursor. Lexing continues from cursor.
+    SLANG_FORCE_INLINE JSONTokenType _setToken(JSONTokenType type, const char* cursor)
+    {
+        SLANG_ASSERT(cursor >= m_lexemeStart);
+        m_token.type = type;
+        m_token.loc = m_startLoc + (m_lexemeStart - m_contentStart);
+        m_token.length = uint32_t(cursor - m_lexemeStart);
+        m_cursor = cursor;
+        return type;
+    }
+        /// Set the current token to be invalid
+    JSONTokenType _setInvalidToken();
+
+    // NOTE! Members have initializers such that a lexer which has not been initialized
+    // has a well defined (Invalid) current token, rather than indeterminate contents.
+
+    JSONToken m_token = { JSONTokenType::Invalid, SourceLoc(), 0 };    ///< The current token
+
+    const char* m_cursor = nullptr;             ///< Where lexing of the next token starts from
+    const char* m_lexemeStart = nullptr;        ///< The start of the lexeme currently being lexed
+
+    const char* m_contentStart = nullptr;       ///< The start of the source content
+
+    SourceLoc m_startLoc;                       ///< The location that corresponds to m_contentStart
+
+    SourceView* m_sourceView = nullptr;
+    DiagnosticSink* m_sink = nullptr;           ///< Where diagnostics are reported
+};
+
+} // namespace Slang
+
+#endif
author	jsmall-nvidia <jsmall@nvidia.com>	2021-05-25 20:58:43 -0400
committer	GitHub <noreply@github.com>	2021-05-25 20:58:43 -0400
commit	7d1b8ac13faf80ed56b37243480d097059da5aab (patch)
tree	6613b13983083d16b8945c6d92b1f4f1d1fb2501 /source/compiler-core
parent	89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff)