JSON Lexing and string encoding/decoding (#1858)

* #include of an absolute path didn't work, because paths were always taken to be relative. * WIP JSON lexer. * Check JSON lexer with unit test. * Add JSON escaping/unescaping of strings. * Bug fix for encoding/decoding. * Fix typo in JSON diagnostics. * Fix typo. * Better float testing.
author: jsmall-nvidia <jsmall@nvidia.com> 2021-05-25 20:58:43 -0400
committer: GitHub <noreply@github.com> 2021-05-25 20:58:43 -0400
commit: 7d1b8ac13faf80ed56b37243480d097059da5aab (patch)
tree: 6613b13983083d16b8945c6d92b1f4f1d1fb2501
parent: 89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff)
12 files changed, 1148 insertions, 0 deletions
diff --git a/build/visual-studio/compiler-core/compiler-core.vcxproj b/build/visual-studio/compiler-core/compiler-core.vcxproj
index e6883900c..7d2ac6afa 100644
--- a/build/visual-studio/compiler-core/compiler-core.vcxproj
+++ b/build/visual-studio/compiler-core/compiler-core.vcxproj
@@ -179,6 +179,9 @@
     <ClInclude Include="..\..\..\source\compiler-core\slang-gcc-compiler-util.h" />
     <ClInclude Include="..\..\..\source\compiler-core\slang-glslang-compiler.h" />
     <ClInclude Include="..\..\..\source\compiler-core\slang-include-system.h" />
+    <ClInclude Include="..\..\..\source\compiler-core\slang-json-diagnostic-defs.h" />
+    <ClInclude Include="..\..\..\source\compiler-core\slang-json-diagnostics.h" />
+    <ClInclude Include="..\..\..\source\compiler-core\slang-json-lexer.h" />
     <ClInclude Include="..\..\..\source\compiler-core\slang-lexer-diagnostic-defs.h" />
     <ClInclude Include="..\..\..\source\compiler-core\slang-lexer.h" />
     <ClInclude Include="..\..\..\source\compiler-core\slang-misc-diagnostic-defs.h" />
@@ -201,6 +204,8 @@
     <ClCompile Include="..\..\..\source\compiler-core\slang-gcc-compiler-util.cpp" />
     <ClCompile Include="..\..\..\source\compiler-core\slang-glslang-compiler.cpp" />
     <ClCompile Include="..\..\..\source\compiler-core\slang-include-system.cpp" />
+    <ClCompile Include="..\..\..\source\compiler-core\slang-json-diagnostics.cpp" />
+    <ClCompile Include="..\..\..\source\compiler-core\slang-json-lexer.cpp" />
     <ClCompile Include="..\..\..\source\compiler-core\slang-lexer.cpp" />
     <ClCompile Include="..\..\..\source\compiler-core\slang-name-convention-util.cpp" />
     <ClCompile Include="..\..\..\source\compiler-core\slang-name.cpp" />
diff --git a/build/visual-studio/compiler-core/compiler-core.vcxproj.filters b/build/visual-studio/compiler-core/compiler-core.vcxproj.filters
index d35f9941c..243eecd5b 100644
--- a/build/visual-studio/compiler-core/compiler-core.vcxproj.filters
+++ b/build/visual-studio/compiler-core/compiler-core.vcxproj.filters
@@ -36,6 +36,15 @@
     <ClInclude Include="..\..\..\source\compiler-core\slang-include-system.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\..\source\compiler-core\slang-json-diagnostic-defs.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\source\compiler-core\slang-json-diagnostics.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
+    <ClInclude Include="..\..\..\source\compiler-core\slang-json-lexer.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="..\..\..\source\compiler-core\slang-lexer-diagnostic-defs.h">
       <Filter>Header Files</Filter>
     </ClInclude>
@@ -98,6 +107,12 @@
     <ClCompile Include="..\..\..\source\compiler-core\slang-include-system.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\source\compiler-core\slang-json-diagnostics.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\..\source\compiler-core\slang-json-lexer.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\source\compiler-core\slang-lexer.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/build/visual-studio/slang-test/slang-test.vcxproj b/build/visual-studio/slang-test/slang-test.vcxproj
index 24d2f2ac1..22a8aae5f 100644
--- a/build/visual-studio/slang-test/slang-test.vcxproj
+++ b/build/visual-studio/slang-test/slang-test.vcxproj
@@ -183,6 +183,7 @@
     <ClCompile Include="..\..\..\tools\slang-test\unit-test-compression.cpp" />
     <ClCompile Include="..\..\..\tools\slang-test\unit-test-find-type-by-name.cpp" />
     <ClCompile Include="..\..\..\tools\slang-test\unit-test-free-list.cpp" />
+    <ClCompile Include="..\..\..\tools\slang-test\unit-test-json.cpp" />
     <ClCompile Include="..\..\..\tools\slang-test\unit-test-memory-arena.cpp" />
     <ClCompile Include="..\..\..\tools\slang-test\unit-test-path.cpp" />
     <ClCompile Include="..\..\..\tools\slang-test\unit-test-riff.cpp" />
diff --git a/build/visual-studio/slang-test/slang-test.vcxproj.filters b/build/visual-studio/slang-test/slang-test.vcxproj.filters
index 9b7138eef..1e5b6e4af 100644
--- a/build/visual-studio/slang-test/slang-test.vcxproj.filters
+++ b/build/visual-studio/slang-test/slang-test.vcxproj.filters
@@ -68,6 +68,9 @@
     <ClCompile Include="..\..\..\tools\slang-test\unit-test-free-list.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\tools\slang-test\unit-test-json.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\tools\slang-test\unit-test-memory-arena.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/source/compiler-core/slang-json-diagnostic-defs.h b/source/compiler-core/slang-json-diagnostic-defs.h
new file mode 100644
index 000000000..a4b260857
--- /dev/null
+++ b/source/compiler-core/slang-json-diagnostic-defs.h
@@ -0,0 +1,37 @@
+//
+
+// The file is meant to be included multiple times, to produce different
+// pieces of declaration/definition code related to diagnostic messages
+//
+// Each diagnostic is declared here with:
+//
+//     DIAGNOSTIC(id, severity, name, messageFormat)
+//
+// Where `id` is the unique diagnostic ID, `severity` is the default
+// severity (from the `Severity` enum), `name` is a name used to refer
+// to this diagnostic from code, and `messageFormat` is the default
+// (non-localized) message for the diagnostic, with placeholders
+// for any arguments.
+
+// The includer must define DIAGNOSTIC before including this file. The empty
+// fallback definition after the #error makes the entries below expand to nothing.
+#ifndef DIAGNOSTIC
+#error Need to #define DIAGNOSTIC(...) before including "slang-json-diagnostic-defs.h"
+#define DIAGNOSTIC(id, severity, name, messageFormat) /* */
+#endif
+
+//
+// -1 - Notes that decorate another diagnostic.
+//
+
+//
+// 2xxxx - JSON Lexical analysis
+//
+
+DIAGNOSTIC(20000, Error, unexpectedCharacter, "unexpected character '$0'")
+DIAGNOSTIC(20001, Error, endOfFileInLiteral, "end of file in literal")
+DIAGNOSTIC(20002, Error, newlineInLiteral, "newline in literal")
+DIAGNOSTIC(20003, Error, endOfFileInComment, "end of file in comment")
+DIAGNOSTIC(20004, Error, expectingAHexDigit, "expecting a hex digit")
+DIAGNOSTIC(20005, Error, expectingADigit, "expecting a digit")
+DIAGNOSTIC(20006, Error, expectingValueName, "expecting value name [null, true, false]")
+
+#undef DIAGNOSTIC
diff --git a/source/compiler-core/slang-json-diagnostics.cpp b/source/compiler-core/slang-json-diagnostics.cpp
new file mode 100644
index 000000000..1d35e8faf
--- /dev/null
+++ b/source/compiler-core/slang-json-diagnostics.cpp
@@ -0,0 +1,33 @@
+// slang-json-diagnostics.cpp
+#include "slang-json-diagnostics.h"
+
+namespace Slang {
+
+// Defines one `DiagnosticInfo` constant per entry of slang-json-diagnostic-defs.h.
+// The defs file is included with DIAGNOSTIC expanding to a definition that stores
+// the id, the severity, the stringized name and the message format.
+namespace JSONDiagnostics
+{
+#define DIAGNOSTIC(id, severity, name, messageFormat) const DiagnosticInfo name = { id, Severity::severity, #name, messageFormat };
+#include "slang-json-diagnostic-defs.h"
+#undef DIAGNOSTIC
+}
+
+// Table holding the address of every JSON diagnostic defined above. It is built by
+// including the same defs file again, this time expanding each entry to a pointer.
+static const DiagnosticInfo* const kJSONDiagnostics[] =
+{
+#define DIAGNOSTIC(id, severity, name, messageFormat) &JSONDiagnostics::name, 
+#include "slang-json-diagnostic-defs.h"
+#undef DIAGNOSTIC
+};
+
+// Allocates a new DiagnosticsLookup and adds every entry of kJSONDiagnostics to it.
+// The caller receives the raw pointer.
+static DiagnosticsLookup* _newJSONDiagnosticsLookup()
+{
+    auto lookup = new DiagnosticsLookup;
+    lookup->add(kJSONDiagnostics, SLANG_COUNT_OF(kJSONDiagnostics));
+    return lookup;
+}
+
+// Returns the lookup holding the JSON diagnostics. The lookup is created the first
+// time this function is called and is held in a function-local static RefPtr, so
+// every call returns the same object.
+DiagnosticsLookup* getJSONDiagnosticsLookup()
+{
+    static RefPtr<DiagnosticsLookup> s_lookup = _newJSONDiagnosticsLookup();
+    return s_lookup;
+}
+
+} // namespace Slang
diff --git a/source/compiler-core/slang-json-diagnostics.h b/source/compiler-core/slang-json-diagnostics.h
new file mode 100644
index 000000000..88ec0c550
--- /dev/null
+++ b/source/compiler-core/slang-json-diagnostics.h
@@ -0,0 +1,26 @@
+#ifndef SLANG_JSON_DIAGNOSTICS_H
+#define SLANG_JSON_DIAGNOSTICS_H
+
+#include "../core/slang-basic.h"
+#include "../core/slang-writer.h"
+
+#include "slang-source-loc.h"
+#include "slang-diagnostic-sink.h"
+#include "slang-token.h"
+
+#include "../../slang.h"
+
+namespace Slang
+{
+
+/// Returns the lookup that holds all of the JSON diagnostics.
+DiagnosticsLookup* getJSONDiagnosticsLookup();
+
+// Declares (as `extern const DiagnosticInfo`) every diagnostic listed in
+// slang-json-diagnostic-defs.h. No #undef is needed here because the defs file
+// undefines DIAGNOSTIC itself at its end.
+namespace JSONDiagnostics
+{
+#define DIAGNOSTIC(id, severity, name, messageFormat) extern const DiagnosticInfo name;
+#include "slang-json-diagnostic-defs.h"
+}
+
+}
+
+#endif
diff --git a/source/compiler-core/slang-json-lexer.cpp b/source/compiler-core/slang-json-lexer.cpp
new file mode 100644
index 000000000..19a5b29a7
--- /dev/null
+++ b/source/compiler-core/slang-json-lexer.cpp
@@ -0,0 +1,385 @@
+// slang-json-lexer.cpp
+#include "slang-json-lexer.h"
+
+#include "slang-json-diagnostics.h"
+#include "../core/slang-char-util.h"
+
+/*
+https://www.json.org/json-en.html
+*/
+
+namespace Slang {
+
+/// Binds the lexer to the content of `sourceView` and lexes the first token.
+///
+/// @param sourceView View whose source file supplies the text. The file must have content.
+/// @param sink       Receives any diagnostics produced while lexing.
+/// @return Always SLANG_OK.
+SlangResult JSONLexer::init(SourceView* sourceView, DiagnosticSink* sink)
+{
+    m_sink = sink;
+    m_sourceView = sourceView;
+
+    SourceFile* const file = sourceView->getSourceFile();
+
+    // The lexer stops on a 0 byte, so the content has to be null terminated
+    SLANG_ASSERT(file && file->hasContent());
+
+    const char* const contentBegin = file->getContent().begin();
+    m_contentStart = contentBegin;
+
+    m_startLoc = sourceView->getRange().begin;
+
+    // Both the lexeme start and the read position begin at the start of the content
+    m_lexemeStart = contentBegin;
+    m_cursor = contentBegin;
+
+    // Prime the first token, so that it can be peeked straight away
+    advance();
+
+    return SLANG_OK;
+}
+
+/// Given `c`, a line break character that has just been consumed, and `cursor`
+/// pointing at the character after it, returns the position after the complete
+/// line break. The two character forms "\r\n" and "\n\r" count as one line break.
+SLANG_FORCE_INLINE static const char* _handleEndOfLine(char c, const char* cursor)
+{
+    SLANG_ASSERT(c == '\n' || c == '\r');
+
+    // Flipping the bits in which '\n' and '\r' differ turns one into the other
+    const char partner = char(c ^ ('\n' ^ '\r'));
+    if (*cursor == partner)
+    {
+        return cursor + 1;
+    }
+    return cursor;
+}
+
+/// Makes the current token a zero length Invalid token located at the start of the
+/// current lexeme. Because the end passed to _setToken is the lexeme start, the
+/// read position is left at the start of the lexeme that could not be lexed.
+JSONTokenType JSONLexer::_setInvalidToken()
+{
+    return _setToken(JSONTokenType::Invalid, m_lexemeStart);
+}
+
+/// Lexes the next token, makes it the current token and returns its type.
+///
+/// Whitespace and comments (both `//` line comments and `/* */` block comments,
+/// which are accepted as an extension) are skipped. When something cannot be lexed
+/// a diagnostic is sent to the sink and the current token becomes a zero length
+/// Invalid token; the read position then stays at the start of the offending
+/// lexeme, so calling advance again reports the same problem.
+JSONTokenType JSONLexer::advance()
+{
+    const char* cursor = m_cursor;
+
+    while (true)
+    {
+        m_lexemeStart = cursor;
+
+        const char c = *cursor++;
+
+        if (c == '/')
+        {
+            // NOTE! The lookahead must use the local cursor. m_cursor is only updated
+            // when a token is set, so it can lag behind skipped whitespace/comments.
+            const char nextChar = *cursor;
+
+            if (nextChar == '/')
+            {
+                // Line comment
+                cursor = _lexLineComment(cursor);
+                continue;
+            }
+            if (nextChar == '*')
+            {
+                // Skip the '*' too, otherwise "/*/" would be taken as a complete comment
+                cursor = _lexBlockComment(cursor + 1);
+                // Can fail...
+                if (cursor == nullptr)
+                {
+                    return _setInvalidToken();
+                }
+                continue;
+            }
+            // A '/' that doesn't start a comment is reported by the default case below
+        }
+
+        switch (c)
+        {
+            case 0:     return _setToken(JSONTokenType::EndOfFile, cursor - 1);
+            case '"':
+            {
+                cursor = _lexString(cursor);
+                if (cursor == nullptr)
+                {
+                    return _setInvalidToken();
+                }
+                return _setToken(JSONTokenType::StringLiteral, cursor);
+            }
+            case ' ':
+            case '\t':
+            case '\n':
+            case '\r':
+            {
+                cursor = _lexWhitespace(cursor);
+                break;
+            }
+            case ':':           return _setToken(JSONTokenType::Colon, cursor);
+            case ',':           return _setToken(JSONTokenType::Comma, cursor);
+            case '[':           return _setToken(JSONTokenType::LBracket, cursor);
+            case ']':           return _setToken(JSONTokenType::RBracket, cursor);
+            case '{':           return _setToken(JSONTokenType::LBrace, cursor);
+            case '}':           return _setToken(JSONTokenType::RBrace, cursor);
+
+            case '-':
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                // The number lexer wants to see the first character again
+                LexResult res = _lexNumber(cursor - 1);
+                if (res.cursor == nullptr)
+                {
+                    return _setInvalidToken();
+                }
+                return _setToken(res.type, res.cursor);
+            }
+            case 't':
+            {
+                // The && chain stops at the first mismatch, so it can't read past the terminator
+                if (cursor[0] == 'r' && cursor[1] == 'u' && cursor[2] == 'e')
+                {
+                    return _setToken(JSONTokenType::True, cursor + 3);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+            case 'f':
+            {
+                if (cursor[0] == 'a' && cursor[1] == 'l' && cursor[2] == 's' && cursor[3] == 'e')
+                {
+                    return _setToken(JSONTokenType::False, cursor + 4);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+            case 'n':
+            {
+                if (cursor[0] == 'u' && cursor[1] == 'l' && cursor[2] == 'l')
+                {
+                    return _setToken(JSONTokenType::Null, cursor + 3);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+            default:
+            {
+                // Build the text for the $0 argument of the diagnostic: the character
+                // itself if it is printable ASCII, else its byte value in hex.
+                StringBuilder buf;
+
+                // Go through uint8_t so bytes >= 0x80 aren't sign extended
+                const uint32_t value = uint8_t(c);
+                if (value <= ' ' || value >= 0x7f)
+                {
+                    static const char s_hex[] = "0123456789abcdef";
+
+                    char hexBuf[5] = "0x";
+
+                    hexBuf[2] = s_hex[((value >> 4) & 0xf)];
+                    hexBuf[3] = s_hex[(value & 0xf)];
+                    hexBuf[4] = 0;
+
+                    buf << hexBuf;
+                }
+                else
+                {
+                    buf << c;
+                }
+
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::unexpectedCharacter, buf);
+                return _setInvalidToken();
+            }
+        }
+    }
+}
+
+/// Lexes a number: an optional '-', an integer part (a single '0', or a digit
+/// 1-9 followed by any digits), an optional fraction and an optional exponent.
+///
+/// @param cursor Points at the first character of the number (a '-' or a digit).
+/// @return On success the token type (IntegerLiteral, or FloatLiteral when there is
+/// a fraction or an exponent) and the position just after the number. On failure a
+/// diagnostic is sent to the sink and the returned cursor is nullptr.
+JSONLexer::LexResult JSONLexer::_lexNumber(const char* cursor)
+{
+    JSONTokenType tokenType = JSONTokenType::IntegerLiteral;
+
+    if (*cursor == '-')
+    {
+        cursor++;
+    }
+
+    if (*cursor == '0')
+    {
+        // Can only be followed by a fraction, an exponent, or nothing
+        cursor++;
+    }
+    else if (*cursor >= '1' && *cursor <= '9')
+    {
+        cursor++;
+        while (CharUtil::isDigit(*cursor))
+        {
+            cursor++;
+        }
+    }
+    else
+    {
+        // There has to be an integer part - a '-' on its own isn't a number
+        m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+        return LexResult{ JSONTokenType::Invalid, nullptr };
+    }
+
+    // There's a fraction
+    if (*cursor == '.')
+    {
+        tokenType = JSONTokenType::FloatLiteral;
+        // Skip the dot
+        cursor++;
+        // Must have at least one digit
+        if (!CharUtil::isDigit(*cursor))
+        {
+            m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+            return LexResult{ JSONTokenType::Invalid, nullptr };
+        }
+        // Skip the digit
+        cursor++;
+        // Skip any more digits
+        while (CharUtil::isDigit(*cursor)) cursor++;
+    }
+
+    // There's an exponent
+    if (*cursor == 'e' || *cursor == 'E')
+    {
+        tokenType = JSONTokenType::FloatLiteral;
+
+        // Skip the 'e'/'E'
+        cursor++;
+
+        // Skip +/- if has one
+        if (*cursor == '+' || *cursor == '-')
+        {
+            cursor++;
+        }
+
+        // Must have at least one digit
+        if (!CharUtil::isDigit(*cursor))
+        {
+            m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+            return LexResult{ JSONTokenType::Invalid, nullptr };
+        }
+
+        // Skip the digit
+        cursor++;
+        // Skip any more digits
+        while (CharUtil::isDigit(*cursor)) cursor++;
+    }
+
+    return LexResult{tokenType, cursor};
+}
+
+/// Lexes the rest of a string literal.
+///
+/// @param cursor Points just after the opening quote.
+/// @return The position just after the closing quote, or nullptr (after sending a
+/// diagnostic to the sink) if the content ends inside the literal or a \u escape is
+/// not followed by four hex digits.
+///
+/// NOTE! The lexer is more permissive than the JSON grammar: raw control characters
+/// (including \r and \n) are accepted inside the quotes, and a backslash followed by
+/// a character that is not a known escape is skipped without a diagnostic.
+const char* JSONLexer::_lexString(const char* cursor)
+{
+    // We've skipped the first "
+    while (true)
+    {
+        const char c = *cursor++;
+
+        switch (c)
+        {
+            case 0:
+            {
+                m_sink->diagnose(_getLoc(cursor - 1), JSONDiagnostics::endOfFileInLiteral);
+                return nullptr;
+            }
+            case '"':
+            {
+                return cursor;
+            }
+            case '\\':
+            {
+                const char nextC = *cursor;
+                switch (nextC)
+                {
+                    case '"':
+                    case '\\':
+                    case '/':
+                    case 'b':
+                    case 'f':
+                    case 'n':
+                    case 'r':
+                    case 't':
+                    {
+                        // Consume the escaped char, so an escaped quote doesn't end the literal
+                        ++cursor;
+                        break;
+                    }
+                    case 'u':
+                    {
+                        cursor++;
+                        for (Index i = 0; i < 4; ++i)
+                        {
+                            if (!CharUtil::isHexDigit(cursor[i]))
+                            {
+                                // Report at the character that isn't a hex digit
+                                m_sink->diagnose(_getLoc(cursor + i), JSONDiagnostics::expectingAHexDigit);
+                                return nullptr;
+                            }
+                        }
+                        cursor += 4;
+                        break;
+                    }
+                    default:
+                    {
+                        // Not consumed - the next iteration sees it (needed for the terminating 0)
+                        break;
+                    }
+                }
+                break;
+            }
+            default: break;
+        }
+    }
+}
+
+/// Skips the remainder of a line comment.
+/// Returns the position after the line break that ends the comment, or the position
+/// of the terminating 0 if the content ends first.
+const char* JSONLexer::_lexLineComment(const char* cursor)
+{
+    while (true)
+    {
+        const char c = *cursor++;
+
+        if (c == 0)
+        {
+            // Leave the terminator to be lexed as the end of file
+            return cursor - 1;
+        }
+        if (c == '\n' || c == '\r')
+        {
+            // Step over the whole line break (which may be two characters)
+            return _handleEndOfLine(c, cursor);
+        }
+    }
+}
+
+/// Skips forward to the end of a block comment.
+///
+/// @param cursor A position inside the comment.
+/// @return The position just after the closing "*/", or nullptr (after sending
+/// endOfFileInComment to the sink) if the content ends before the comment is closed.
+const char* JSONLexer::_lexBlockComment(const char* cursor)
+{
+    for (;;)
+    {
+        const char c = *cursor++;
+        switch (c)
+        {
+            case 0:
+            {
+                // cursor is already past the terminator, so step back to report at it
+                m_sink->diagnose(_getLoc(cursor - 1), JSONDiagnostics::endOfFileInComment);
+                return nullptr;
+            }
+            case '*':
+            {
+                if (*cursor == '/')
+                {
+                    return cursor + 1;
+                }
+                break;
+            }
+            default: break;
+        }
+    }
+}
+
+/// Skips any run of spaces, tabs, '\n' and '\r' starting at `cursor`.
+/// Returns the position of the first character that is not one of those.
+const char* JSONLexer::_lexWhitespace(const char* cursor)
+{
+    // Might want to use CharUtil::isWhitespace...
+    while (*cursor == ' ' || *cursor == '\n' || *cursor == '\r' || *cursor == '\t')
+    {
+        ++cursor;
+    }
+    return cursor;
+}
+
+} // namespace Slang
diff --git a/source/compiler-core/slang-json-lexer.h b/source/compiler-core/slang-json-lexer.h
new file mode 100644
index 000000000..03f16d445
--- /dev/null
+++ b/source/compiler-core/slang-json-lexer.h
@@ -0,0 +1,89 @@
+// slang-json-lexer.h
+#ifndef SLANG_JSON_LEXER_H
+#define SLANG_JSON_LEXER_H
+
+#include "../core/slang-basic.h"
+
+#include "slang-source-loc.h"
+#include "slang-diagnostic-sink.h"
+
+namespace Slang {
+
+/// The kinds of token produced by JSONLexer.
+enum class JSONTokenType
+{
+    Invalid,            ///< Produced when the input could not be lexed
+    IntegerLiteral,     ///< Number without a fraction or exponent
+    FloatLiteral,       ///< Number with a fraction and/or an exponent
+    StringLiteral,      ///< Quoted string (the token includes the quotes)
+    LBracket,           ///< [
+    RBracket,           ///< ]
+    LBrace,             ///< {
+    RBrace,             ///< }
+    Comma,              ///< ,
+    Colon,              ///< :
+    True,               ///< true
+    False,              ///< false
+    Null,               ///< null
+    EndOfFile,          ///< The terminating 0 of the content was reached
+    CountOf,            ///< The number of token types (not a token type itself)
+};
+
+/// A token as produced by JSONLexer. The text of the token is not stored, just
+/// where it starts and how long it is.
+struct JSONToken
+{
+    JSONTokenType type;         ///< The token type
+    SourceLoc loc;              ///< Location of the start of the token in the source
+    uint32_t length;            ///< The length of the token in bytes
+};
+
+/// Splits null terminated JSON text into tokens, one token at a time.
+///
+/// Usage is to call `init`, which lexes the first token, and then use
+/// `peekToken`/`peekType` to look at the current token and `advance` to move to the
+/// next one. Until `init` is called the current token is a zero length Invalid token.
+class JSONLexer
+{
+public:
+        /// Get the current token
+    JSONToken& peekToken() { return m_token; }
+        /// Get the type of the current token
+    JSONTokenType peekType() { return m_token.type; }
+
+        /// Lex the next token, making it the current token. Returns the type of the new current token.
+    JSONTokenType advance();
+
+        /// Must be called before use. Lexes the first token from the content of sourceView.
+        /// Diagnostics are reported to sink.
+    SlangResult init(SourceView* sourceView, DiagnosticSink* sink);
+
+protected:
+        /// Result of lexing a number. The cursor is nullptr if lexing failed.
+    struct LexResult
+    {
+        JSONTokenType type;
+        const char* cursor;
+    };
+
+        /// Get the location of the cursor
+    SLANG_FORCE_INLINE SourceLoc _getLoc(const char* cursor) const { return m_startLoc + (cursor - m_contentStart); }
+    const char* _lexLineComment(const char* cursor);
+    const char* _lexBlockComment(const char* cursor);
+    const char* _lexWhitespace(const char* cursor);
+    const char* _lexString(const char* cursor);
+    LexResult _lexNumber(const char* cursor);
+
+        /// Makes the span from m_lexemeStart to cursor the current token, of the specified type.
+        /// The next call to advance starts lexing from cursor.
+    SLANG_FORCE_INLINE JSONTokenType _setToken(JSONTokenType type, const char* cursor)
+    {
+        SLANG_ASSERT(cursor >= m_lexemeStart);
+        m_token.type = type;
+        m_token.loc = m_startLoc + (m_lexemeStart - m_contentStart);
+        m_token.length = uint32_t(cursor - m_lexemeStart);
+        m_cursor = cursor;
+        return type;
+    }
+    JSONTokenType _setInvalidToken();
+
+    // Everything has an initial value, so that peeking before init doesn't read indeterminate values
+
+    JSONToken m_token = { JSONTokenType::Invalid, SourceLoc(), 0 };     ///< The current token
+
+    const char* m_cursor = nullptr;             ///< Where the next call to advance starts lexing
+    const char* m_lexemeStart = nullptr;        ///< The start of the lexeme being lexed
+
+    const char* m_contentStart = nullptr;       ///< The start of the content being lexed
+
+    SourceLoc m_startLoc;                       ///< The location of m_contentStart
+
+    SourceView* m_sourceView = nullptr;
+    DiagnosticSink* m_sink = nullptr;           ///< Where diagnostics are reported
+};
+
+} // namespace Slang
+
+#endif
diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp
index 5e4db269c..a91d88e05 100644
--- a/source/core/slang-string-escape-util.cpp
+++ b/source/core/slang-string-escape-util.cpp
@@ -87,6 +87,8 @@ SlangResult SpaceStringEscapeHandler::appendEscaped(const UnownedStringSlice& sl
     }
 }
 
+
+
 // !!!!!!!!!!!!!!!!!!!!!!!!!! CppStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
 class CppStringEscapeHandler : public StringEscapeHandler
@@ -445,10 +447,380 @@ SlangResult CppStringEscapeHandler::lexQuoted(const char* cursor, const char** o
     }
 }
 
+// !!!!!!!!!!!!!!!!!!!!!!!!!! JSONStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+/// StringEscapeHandler for JSON style strings. The base class is constructed with '"'.
+class JSONStringEscapeHandler : public StringEscapeHandler
+{
+public:
+    typedef StringEscapeHandler Super;
+
+        /// Always returns true, whatever the slice holds
+    virtual bool isQuotingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE { SLANG_UNUSED(slice); return true; }
+        /// True if slice contains a character that can't be output as is
+    virtual bool isEscapingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE;
+        /// Appends slice to out, with escaping applied. Does not add quotes.
+    virtual SlangResult appendEscaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE;
+        /// Appends slice to out, with escape sequences decoded. Fails if a sequence can't be decoded.
+    virtual SlangResult appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE;
+        /// Lexes a quoted string. cursor points after the opening quote; on success outCursor is after the closing quote.
+    virtual SlangResult lexQuoted(const char* cursor, const char** outCursor) SLANG_OVERRIDE;
+
+    JSONStringEscapeHandler() : Super('"') {}
+};
+
+/// Returns true if any character of `slice` can't be written into a JSON string as
+/// it is: a quote, a backslash, a forward slash, or anything below ' ' or from 0x7e up.
+bool JSONStringEscapeHandler::isEscapingNeeded(const UnownedStringSlice& slice)
+{
+    const char* const sliceEnd = slice.end();
+
+    for (const char* p = slice.begin(); p < sliceEnd; ++p)
+    {
+        const char c = *p;
+
+        const bool hasEscapeSequence = (c == '\"' || c == '\\' || c == '/');
+        const bool isOutsidePlainRange = (c < ' ' || c >= 0x7e);
+
+        if (hasEscapeSequence || isOutsidePlainRange)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+/// Finds the end of a quoted string.
+///
+/// `cursor` points just after the opening quote. On success `outCursor` is set to
+/// the position just after the closing quote. Fails if a 0 is hit before the closing
+/// quote, or if a \u escape is not followed by four hex digits.
+SlangResult JSONStringEscapeHandler::lexQuoted(const char* cursor, const char** outCursor)
+{
+    // We've skipped the first "
+    for (;;)
+    {
+        const char c = *cursor++;
+
+        if (c == 0)
+        {
+            return SLANG_FAIL;
+        }
+        if (c == '"')
+        {
+            *outCursor = cursor;
+            return SLANG_OK;
+        }
+        if (c != '\\')
+        {
+            // Any other character (this includes raw \r and \n) is just part of the string
+            continue;
+        }
+
+        // It's an escape. Consume what follows the backslash so that an escaped
+        // quote isn't mistaken for the end of the string.
+        switch (*cursor)
+        {
+            case '"':
+            case '\\':
+            case '/':
+            case 'b':
+            case 'f':
+            case 'n':
+            case 'r':
+            case 't':
+            {
+                ++cursor;
+                break;
+            }
+            case 'u':
+            {
+                ++cursor;
+                for (Index i = 0; i < 4; ++i)
+                {
+                    if (!CharUtil::isHexDigit(cursor[i]))
+                    {
+                        return SLANG_FAIL;
+                    }
+                }
+                cursor += 4;
+                break;
+            }
+            default:
+            {
+                // Anything else is left for the next iteration
+                break;
+            }
+        }
+    }
+}
+
+// Returns the character that follows the backslash in the JSON escape sequence for c,
+// or 0 if c has no single character escape sequence.
+static char _getJSONEscapedChar(char c)
+{
+    // Each entry is { character, character used after the backslash }
+    static const char kEscapes[][2] =
+    {
+        { '\b', 'b' },
+        { '\f', 'f' },
+        { '\n', 'n' },
+        { '\r', 'r' },
+        { '\t', 't' },
+        { '\\', '\\' },
+        { '/', '/' },
+        { '"', '"' },
+    };
+
+    for (const auto& entry : kEscapes)
+    {
+        if (entry[0] == c)
+        {
+            return entry[1];
+        }
+    }
+    return 0;
+}
+
+// Given c, the character following the backslash of a JSON escape sequence, returns
+// the character the sequence stands for, or 0 if c is not a single character escape.
+static char _getJSONUnescapedChar(char c)
+{
+    // The two tables are parallel: kValues[i] is what a backslash followed by kNames[i] decodes to
+    static const char kNames[] = "bfnrt\\/\"";
+    static const char kValues[] = "\b\f\n\r\t\\/\"";
+
+    for (int i = 0; kNames[i] != 0; ++i)
+    {
+        if (kNames[i] == c)
+        {
+            return kValues[i];
+        }
+    }
+    return 0;
+}
+
+// Hex digits, indexed by the value of the digit
+static const char s_hex[] = "0123456789abcdef";
+
+// Decodes the UTF8 encoded code point at the start of ioSlice, which must not be empty.
+// Outputs ioSlice with the chars remaining after the utf8 encoded value.
+// Returns ~uint32_t(0) if can't decode. That is the case if
+// * the first byte can't start a sequence (a continuation byte, or 0xf8 and above)
+// * a following byte isn't a continuation byte (10xxxxxx)
+// * the sequence is longer than the slice
+// A sequence longer than the slice consumes the rest of the slice. For the other
+// failures only the first byte is consumed, so that decoding can continue after it.
+//
+// Failure isn't asserted, because malformed input is a data error and not a logic error.
+static uint32_t _getUnicodePointFromUTF8(UnownedStringSlice& ioSlice)
+{
+    const Index length = ioSlice.getLength();
+    SLANG_ASSERT(length > 0);
+    const char* cur = ioSlice.begin();
+    const char*const end = ioSlice.end();
+
+    // Go through uint8_t so that bytes >= 0x80 aren't sign extended where char is signed
+    const uint32_t leading = uint8_t(cur[0]);
+
+    // The number of leading 1 bits is the number of bytes in the sequence
+    Index count = 0;
+    uint32_t mask = 0x80;
+    while (mask && (leading & mask))
+    {
+        count++;
+        mask >>= 1;
+    }
+
+    if (count == 0)
+    {
+        // Plain ASCII is a single byte
+        ioSlice = UnownedStringSlice(cur + 1, end);
+        return leading;
+    }
+
+    if (count < 2 || count > 4)
+    {
+        // Not a byte that can start a sequence
+        ioSlice = UnownedStringSlice(cur + 1, end);
+        return ~uint32_t(0);
+    }
+
+    if (count > length)
+    {
+        // The sequence is cut short by the end of the slice
+        ioSlice = UnownedStringSlice(end, end);
+        return ~uint32_t(0);
+    }
+
+    // The bits of the leading byte below the length marker are the top bits of the code point
+    uint32_t codePoint = (leading & (mask - 1));
+    for (Index i = 1; i < count; i++)
+    {
+        const uint32_t trailing = uint8_t(cur[i]);
+        if ((trailing & 0xc0) != 0x80)
+        {
+            ioSlice = UnownedStringSlice(cur + 1, end);
+            return ~uint32_t(0);
+        }
+        codePoint = (codePoint << 6) | (trailing & 0x3f);
+    }
+
+    ioSlice = UnownedStringSlice(cur + count, end);
+    return codePoint;
+}
+
+// Appends the low 16 bits of value to out in the form \uXXXX, where XXXX is four
+// lower case hex digits.
+static void _appendHex16(uint32_t value, StringBuilder& out)
+{
+    char text[6];
+    text[0] = '\\';
+    text[1] = 'u';
+
+    // Most significant digit first
+    for (int i = 0; i < 4; ++i)
+    {
+        const int shift = 12 - 4 * i;
+        text[2 + i] = s_hex[(value >> shift) & 0xf];
+    }
+
+    out.append(UnownedStringSlice(text, 6));
+}
+
+/// Appends `slice` (taken to be UTF8 encoded) to `out` with JSON escaping applied.
+/// Quotes are not added.
+///
+/// * Characters with a single character escape are output as a backslash and that character
+/// * Bytes with the top bit set are decoded as UTF8 and output as \uXXXX. A code point
+///   from 0x10000 to 0x10ffff is output as a UTF16 surrogate pair, because \u only
+///   holds 16 bits. Anything that can't be decoded is output as \ufffd (the
+///   replacement character).
+/// * Other characters below ' ' or from 0x7e up are output as \uXXXX
+/// * Everything else is output as is
+///
+/// @return Always SLANG_OK.
+SlangResult JSONStringEscapeHandler::appendEscaped(const UnownedStringSlice& slice, StringBuilder& out)
+{
+    // The span from start to cur holds chars that can be output as is, but haven't been yet
+    const char* start = slice.begin();
+    const char* cur = start;
+    const char*const end = slice.end();
+
+    for (; cur < end; ++cur)
+    {
+        const char c = *cur;
+
+        const char escapedChar = _getJSONEscapedChar(c);
+
+        if (escapedChar)
+        {
+            // Flush
+            if (start < cur)
+            {
+                out.append(start, cur);
+            }
+            out.appendChar('\\');
+            out.appendChar(escapedChar);
+
+            start = cur + 1;
+        }
+        else if (uint8_t(c) & 0x80)
+        {
+            // Flush
+            if (start < cur)
+            {
+                out.append(start, cur);
+            }
+
+            // UTF8
+            UnownedStringSlice remainingSlice(cur, end);
+            const uint32_t codePoint = _getUnicodePointFromUTF8(remainingSlice);
+
+            if (codePoint < 0x10000)
+            {
+                _appendHex16(codePoint, out);
+            }
+            else if (codePoint <= 0x10ffff)
+            {
+                // Doesn't fit in 16 bits, so has to be a high surrogate followed by a low surrogate
+                const uint32_t offset = codePoint - 0x10000;
+                _appendHex16(0xd800 + (offset >> 10), out);
+                _appendHex16(0xdc00 + (offset & 0x3ff), out);
+            }
+            else
+            {
+                // Couldn't be decoded, or isn't a valid code point
+                _appendHex16(0xfffd, out);
+            }
+
+            // The -1 is because the loop is going to increment cur
+            cur = remainingSlice.begin() - 1;
+            start = cur + 1;
+        }
+        else if (uint8_t(c) < ' ' || (c >= 0x7e))
+        {
+            // Flush
+            if (start < cur)
+            {
+                out.append(start, cur);
+            }
+
+            _appendHex16(uint32_t(c), out);
+
+            start = cur + 1;
+        }
+        else
+        {
+            // Can go out as it is
+        }
+    }
+
+    // Flush at the end
+    if (start < end)
+    {
+        out.append(start, end);
+    }
+    return SLANG_OK;
+}
+
+// Reads the four hex digits (upper or lower case) starting at cur into outValue.
+// There must be at least four readable chars at cur. Returns false if any of them
+// isn't a hex digit.
+static bool _parseJSONHex4(const char* cur, uint32_t& outValue)
+{
+    uint32_t value = 0;
+    for (Index i = 0; i < 4; ++i)
+    {
+        const char digitC = cur[i];
+
+        uint32_t digitValue;
+        if (digitC >= '0' && digitC <= '9')
+        {
+            digitValue = digitC - '0';
+        }
+        else if (digitC >= 'a' && digitC <= 'f')
+        {
+            digitValue = digitC - 'a' + 10;
+        }
+        else if (digitC >= 'A' && digitC <= 'F')
+        {
+            digitValue = digitC - 'A' + 10;
+        }
+        else
+        {
+            return false;
+        }
+        value = (value << 4) | digitValue;
+    }
+    outValue = value;
+    return true;
+}
+
+/// Appends `slice` to `out` with JSON escape sequences decoded. `slice` is the
+/// contents of a string, without the surrounding quotes.
+///
+/// A \uXXXX sequence is output UTF8 encoded. A high surrogate that is directly
+/// followed by a \uXXXX low surrogate is combined with it into a single code point.
+/// A surrogate that isn't part of such a pair is encoded on its own.
+///
+/// @return SLANG_FAIL if the slice ends inside an escape sequence, the character
+/// after a backslash isn't a known escape, or \u isn't followed by four hex digits.
+/// On failure `out` may already hold the part decoded up to that point.
+SlangResult JSONStringEscapeHandler::appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out)
+{
+    // The span from start to cur holds chars that can be output as is, but haven't been yet
+    const char* start = slice.begin();
+    const char* cur = start;
+    const char*const end = slice.end();
+
+    for (; cur < end; ++cur)
+    {
+        if (*cur != '\\')
+        {
+            continue;
+        }
+
+        // Flush
+        if (start < cur)
+        {
+            out.append(start, cur);
+        }
+
+        // Skip the backslash
+        cur++;
+
+        if (cur >= end)
+        {
+            return SLANG_FAIL;
+        }
+
+        if (*cur == 'u')
+        {
+            cur++;
+
+            uint32_t value = 0;
+            if (end - cur < 4 || !_parseJSONHex4(cur, value))
+            {
+                return SLANG_FAIL;
+            }
+            cur += 4;
+
+            // A code point that doesn't fit in 16 bits is held as a high surrogate
+            // followed by a low surrogate, which have to be combined.
+            if (value >= 0xd800 && value <= 0xdbff &&
+                end - cur >= 6 && cur[0] == '\\' && cur[1] == 'u')
+            {
+                uint32_t low = 0;
+                if (_parseJSONHex4(cur + 2, low) && low >= 0xdc00 && low <= 0xdfff)
+                {
+                    value = 0x10000 + ((value - 0xd800) << 10) + (low - 0xdc00);
+                    cur += 6;
+                }
+            }
+
+            // Need to encode in UTF8 to concat
+            char buf[8];
+            int len = EncodeUnicodePointToUTF8(buf, value);
+
+            out.append(buf, buf + len);
+
+            start = cur;
+            // Step back one, as the loop is going to increment cur
+            cur--;
+        }
+        else
+        {
+            const char unescapedChar = _getJSONUnescapedChar(*cur);
+            if (unescapedChar == 0)
+            {
+                // Don't know how to unescape that char
+                return SLANG_FAIL;
+            }
+            out.appendChar(unescapedChar);
+
+            start = cur + 1;
+        }
+    }
+
+    // Flush
+    if (start < end)
+    {
+        out.append(start, end);
+    }
+
+    return SLANG_OK;
+}
+
 // !!!!!!!!!!!!!!!!!!!!!!!!!! StringEscapeUtil !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
 static CppStringEscapeHandler g_cppHandler;
 static SpaceStringEscapeHandler g_spaceHandler;
+static JSONStringEscapeHandler g_jsonHandler;
 
 StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
 {
@@ -456,6 +828,7 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
     {
         case Style::Cpp:    return &g_cppHandler;
         case Style::Space:  return &g_spaceHandler;
+        case Style::JSON:   return &g_jsonHandler;
         default:            return nullptr;
     }
 }
diff --git a/source/core/slang-string-escape-util.h b/source/core/slang-string-escape-util.h
index 9dc653df3..c3039eb47 100644
--- a/source/core/slang-string-escape-util.h
+++ b/source/core/slang-string-escape-util.h
@@ -51,6 +51,7 @@ struct StringEscapeUtil
     {
         Cpp,            ///< Cpp style quoting and escape handling
         Space,          ///< Applies quotes if there are spaces. Does not escape.
+        JSON,           ///< Json encoding
     };
 
         /// Given a style returns a handler
diff --git a/tools/slang-test/unit-test-json.cpp b/tools/slang-test/unit-test-json.cpp
new file mode 100644
index 000000000..fff16b136
--- /dev/null
+++ b/tools/slang-test/unit-test-json.cpp
@@ -0,0 +1,180 @@
+
+#include "../../source/compiler-core/slang-json-lexer.h"
+#include "../../source/core/slang-string-escape-util.h"
+
+#include "test-context.h"
+
+using namespace Slang;
+
+namespace { // anonymous
+
+struct Element // An expected token, as compared against lexer output by _areEqual
+{
+    JSONTokenType type;     ///< Token type the lexer is expected to produce
+    const char* value;      ///< Expected lexeme text, compared with the token's source characters
+};
+
+} // anonymous
+
+/// Lexes the JSON text `in` and appends every token, ending with the EndOfFile token, to `toks`.
+/// Returns SLANG_FAIL at the first Invalid token (which is appended too), or if advancing past the end moves off EndOfFile.
+static SlangResult _lex(const char* in, DiagnosticSink* sink, List<JSONToken>& toks)
+{
+    SourceManager* sourceManager = sink->getSourceManager();
+    String contents(in);
+    SourceFile* sourceFile = sourceManager->createSourceFileWithString(PathInfo::makeUnknown(), contents);
+    SourceView* sourceView = sourceManager->createSourceView(sourceFile, nullptr, SourceLoc());
+
+    JSONLexer lexer;
+    lexer.init(sourceView, sink);
+
+    while (lexer.peekType() != JSONTokenType::EndOfFile)
+    {
+        if (lexer.peekType() == JSONTokenType::Invalid)
+        {
+            toks.add(lexer.peekToken());
+            return SLANG_FAIL;
+        }
+
+        toks.add(lexer.peekToken());
+        lexer.advance();
+    }
+
+    // The terminating EndOfFile token is recorded as well
+    toks.add(lexer.peekToken());
+
+    // If we advance from end of file we should still be at EndOfFile. Tested with a real check
+    // instead of SLANG_ASSERT, so that advance() is still called when asserts are compiled out.
+    return (lexer.advance() == JSONTokenType::EndOfFile) ? SLANG_OK : SLANG_FAIL;
+}
+
+/// Returns true if `toks` holds exactly the `elesCount` tokens described by `eles`, comparing
+/// each token's type and its lexeme text as read back from the source content.
+static bool _areEqual(SourceManager* sourceManager, const List<JSONToken>& toks, const Element* eles, Index elesCount)
+{
+    if (toks.getCount() != elesCount)
+    {
+        return false;
+    }
+
+    // The view is looked up from the first token only; an empty list needs no view
+    SourceView* sourceView = toks.getCount() ? sourceManager->findSourceView(toks[0].loc) : nullptr;
+    const char*const content = sourceView ? sourceView->getContent().begin() : nullptr;
+
+    for (Index i = 0; i < toks.getCount(); ++i)
+    {
+        const JSONToken& tok = toks[i];
+        const auto& ele = eles[i];
+
+        // A missing view or a loc outside it is a mismatch, so a bad loc is never turned into a pointer
+        if (!sourceView || tok.type != ele.type || !sourceView->getRange().contains(tok.loc))
+        {
+            return false;
+        }
+
+        const char* start = content + sourceView->getRange().getOffset(tok.loc);
+        UnownedStringSlice lexeme(start, tok.length);
+        if (lexeme != ele.value)
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/// Tests the JSON lexer against a known token sequence, and that the JSON string escape
+/// handler round trips strings through escaping and unescaping.
+static void jsonUnitTest()
+{
+    SourceManager sourceManager;
+    sourceManager.initialize(nullptr, nullptr);
+    DiagnosticSink sink(&sourceManager, nullptr);
+
+    {
+        const char text[] = " { \"Hello\" : [ \"World\", 1, 2.0, -3.0, -435.5345435, 45e-10, 421.00e+20, 17e1] }";
+
+        const Element eles[] =
+        {
+            {JSONTokenType::LBrace, "{" },
+            {JSONTokenType::StringLiteral, "\"Hello\""},
+            {JSONTokenType::Colon, ":" },
+            {JSONTokenType::LBracket, "[" },
+            {JSONTokenType::StringLiteral, "\"World\"" },
+            {JSONTokenType::Comma, "," },
+            {JSONTokenType::IntegerLiteral, "1" },
+            {JSONTokenType::Comma, "," },
+            {JSONTokenType::FloatLiteral, "2.0" },
+            {JSONTokenType::Comma, "," },
+            {JSONTokenType::FloatLiteral, "-3.0" },
+            {JSONTokenType::Comma, "," },
+            {JSONTokenType::FloatLiteral, "-435.5345435" },
+            {JSONTokenType::Comma, "," },
+            {JSONTokenType::FloatLiteral, "45e-10" },
+            {JSONTokenType::Comma, "," },
+            {JSONTokenType::FloatLiteral, "421.00e+20" },
+            {JSONTokenType::Comma, "," },
+            {JSONTokenType::FloatLiteral, "17e1" },
+            {JSONTokenType::RBracket, "]" },
+            {JSONTokenType::RBrace, "}" },
+            {JSONTokenType::EndOfFile, "" },
+        };
+
+        List<JSONToken> toks;
+        SLANG_CHECK(SLANG_SUCCEEDED(_lex(text, &sink, toks)));
+        SLANG_CHECK(_areEqual(&sourceManager, toks, eles, SLANG_COUNT_OF(eles)));
+    }
+
+    {
+        StringEscapeHandler* handler = StringEscapeUtil::getHandler(StringEscapeUtil::Style::JSON);
+
+        {
+            // Control characters, quote, backslash and slash, followed by plain text
+            const auto slice = UnownedStringSlice::fromLiteral("\n\r\b\f\t \"\\/ Some text...");
+
+            SLANG_CHECK(handler->isEscapingNeeded(slice));
+            SLANG_CHECK(!handler->isEscapingNeeded(UnownedStringSlice::fromLiteral("Hello!")));
+
+            // Check the conversion results rather than discarding them
+            StringBuilder escaped;
+            SLANG_CHECK(SLANG_SUCCEEDED(handler->appendEscaped(slice, escaped)));
+
+            StringBuilder unescaped;
+            SLANG_CHECK(SLANG_SUCCEEDED(handler->appendUnescaped(escaped.getUnownedSlice(), unescaped)));
+
+            SLANG_CHECK(unescaped == slice);
+        }
+
+        {
+            // Build \uXXXX escapes for 0x7f, 0xfe, 0x1fc, ... doubling while below 0x10000
+            uint32_t v = 0x7f;
+            StringBuilder buf;
+            while (v < 0x10000)
+            {
+                char work[10] = "\\u";
+
+                // Four lower case hex digits, most significant first
+                for (Int i = 0; i < 4; ++i)
+                {
+                    const uint32_t digitValue = (v >> ((3 - i) * 4)) & 0xf;
+                    char digitC = (digitValue > 9) ? char(digitValue - 10 + 'a') : char(digitValue + '0');
+                    work[i + 2] =  digitC;
+                }
+                buf << UnownedStringSlice(work, 6);
+                v += v;
+            }
+
+            // Decode it
+            StringBuilder unescaped;
+            SLANG_CHECK(SLANG_SUCCEEDED(handler->appendUnescaped(buf.getUnownedSlice(), unescaped)));
+
+            // Encode it
+            StringBuilder escaped;
+            SLANG_CHECK(SLANG_SUCCEEDED(handler->appendEscaped(unescaped.getUnownedSlice(), escaped)));
+
+            SLANG_CHECK(escaped == buf);
+        }
+    }
+}
+
+SLANG_UNIT_TEST("JSON", jsonUnitTest); // NOTE(review): presumably registers jsonUnitTest under the name "JSON" - confirm against the macro's definition
author	jsmall-nvidia <jsmall@nvidia.com>	2021-05-25 20:58:43 -0400
committer	GitHub <noreply@github.com>	2021-05-25 20:58:43 -0400
commit	7d1b8ac13faf80ed56b37243480d097059da5aab (patch)
tree	6613b13983083d16b8945c6d92b1f4f1d1fb2501
parent	89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff)