JSON Lexing and string encoding/decoding (#1858)

* #include an absolute path didn't work - because paths were taken to always be relative. * WIP Json lexer. * Check JSON Lex with unit test * Add JSON escaping/unescaping of strings. * Big fix encoding/decoding. * Fix typo in JSON diagnostics. * Fix typo. * Better float testing.
author: jsmall-nvidia <jsmall@nvidia.com> 2021-05-25 20:58:43 -0400
committer: GitHub <noreply@github.com> 2021-05-25 20:58:43 -0400
commit: 7d1b8ac13faf80ed56b37243480d097059da5aab (patch)
tree: 6613b13983083d16b8945c6d92b1f4f1d1fb2501 /source
parent: 89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff)
7 files changed, 944 insertions, 0 deletions
diff --git a/source/compiler-core/slang-json-diagnostic-defs.h b/source/compiler-core/slang-json-diagnostic-defs.h
new file mode 100644
index 000000000..a4b260857
--- /dev/null
+++ b/source/compiler-core/slang-json-diagnostic-defs.h
@@ -0,0 +1,37 @@
+//
+
+// The file is meant to be included multiple times, to produce different
+// pieces of declaration/definition code related to diagnostic messages
+//
+// Each diagnostic is declared here with:
+//
+//     DIAGNOSTIC(id, severity, name, messageFormat)
+//
+// Where `id` is the unique diagnostic ID, `severity` is the default
+// severity (from the `Severity` enum), `name` is a name used to refer
+// to this diagnostic from code, and `messageFormat` is the default
+// (non-localized) message for the diagnostic, with placeholders
+// for any arguments.
+
+#ifndef DIAGNOSTIC
+#error Need to #define DIAGNOSTIC(...) before including 
+#define DIAGNOSTIC(id, severity, name, messageFormat) /* */
+#endif
+
+//
+// -1 - Notes that decorate another diagnostic.
+//
+
+//
+// 2xxxx - JSON Lexical analysis
+//
+
+DIAGNOSTIC(20000, Error, unexpectedCharacter, "unexpected character '$0'")    // `$0` is presumably replaced by the first diagnose() argument - TODO confirm
+DIAGNOSTIC(20001, Error, endOfFileInLiteral, "end of file in literal")    // reported by JSONLexer::_lexString when the content ends inside a string
+DIAGNOSTIC(20002, Error, newlineInLiteral, "newline in literal")    // NOTE(review): not reported by the lexer code in this change
+DIAGNOSTIC(20003, Error, endOfFileInComment, "end of file in comment")    // reported by JSONLexer::_lexBlockComment
+DIAGNOSTIC(20004, Error, expectingAHexDigit, "expecting a hex digit")    // reported by JSONLexer::_lexString for a bad \u escape
+DIAGNOSTIC(20005, Error, expectingADigit, "expecting a digit")    // reported by JSONLexer::_lexNumber
+DIAGNOSTIC(20006, Error, expectingValueName, "expecting value name [null, true, false]")    // reported by JSONLexer::advance for a bad t/f/n word
+
+#undef DIAGNOSTIC
diff --git a/source/compiler-core/slang-json-diagnostics.cpp b/source/compiler-core/slang-json-diagnostics.cpp
new file mode 100644
index 000000000..1d35e8faf
--- /dev/null
+++ b/source/compiler-core/slang-json-diagnostics.cpp
@@ -0,0 +1,33 @@
+// slang-json-diagnostics.cpp
+#include "slang-json-diagnostics.h"
+
+namespace Slang {
+
+namespace JSONDiagnostics    // Definitions of one DiagnosticInfo constant per entry of slang-json-diagnostic-defs.h
+{
+#define DIAGNOSTIC(id, severity, name, messageFormat) const DiagnosticInfo name = { id, Severity::severity, #name, messageFormat };    // #name stringizes the identifier
+#include "slang-json-diagnostic-defs.h"
+#undef DIAGNOSTIC    // the defs file also #undefs DIAGNOSTIC at its end
+}
+
+static const DiagnosticInfo* const kJSONDiagnostics[] =    // Table with the address of every JSON diagnostic, in defs file order
+{
+#define DIAGNOSTIC(id, severity, name, messageFormat) &JSONDiagnostics::name,     // each entry becomes one array element
+#include "slang-json-diagnostic-defs.h"
+#undef DIAGNOSTIC    // the defs file also #undefs DIAGNOSTIC at its end
+};
+
+/// Creates a new DiagnosticsLookup and adds every entry of kJSONDiagnostics to it.
+/// Returns the newly allocated lookup.
+static DiagnosticsLookup* _newJSONDiagnosticsLookup()
+{
+    DiagnosticsLookup* const jsonLookup = new DiagnosticsLookup;
+
+    // Register the whole table in one call
+    jsonLookup->add(kJSONDiagnostics, SLANG_COUNT_OF(kJSONDiagnostics));
+
+    return jsonLookup;
+}
+
+/// Returns the lookup holding all of the JSON diagnostics.
+/// The lookup is created the first time this function is called and held in a function static.
+DiagnosticsLookup* getJSONDiagnosticsLookup()
+{
+    static RefPtr<DiagnosticsLookup> s_jsonLookup(_newJSONDiagnosticsLookup());
+    return s_jsonLookup;
+}
+
+} // namespace Slang
diff --git a/source/compiler-core/slang-json-diagnostics.h b/source/compiler-core/slang-json-diagnostics.h
new file mode 100644
index 000000000..88ec0c550
--- /dev/null
+++ b/source/compiler-core/slang-json-diagnostics.h
@@ -0,0 +1,26 @@
+#ifndef SLANG_JSON_DIAGNOSTICS_H
+#define SLANG_JSON_DIAGNOSTICS_H
+
+#include "../core/slang-basic.h"
+#include "../core/slang-writer.h"
+
+#include "slang-source-loc.h"
+#include "slang-diagnostic-sink.h"
+#include "slang-token.h"
+
+#include "../../slang.h"
+
+namespace Slang
+{
+
+DiagnosticsLookup* getJSONDiagnosticsLookup();    ///< Returns the lookup holding all of the JSON diagnostics
+
+namespace JSONDiagnostics    // Declarations of the DiagnosticInfo constants defined in slang-json-diagnostics.cpp
+{
+#define DIAGNOSTIC(id, severity, name, messageFormat) extern const DiagnosticInfo name;    // one extern declaration per entry
+#include "slang-json-diagnostic-defs.h"    // #undefs DIAGNOSTIC at its end
+}
+
+}
+
+#endif
diff --git a/source/compiler-core/slang-json-lexer.cpp b/source/compiler-core/slang-json-lexer.cpp
new file mode 100644
index 000000000..19a5b29a7
--- /dev/null
+++ b/source/compiler-core/slang-json-lexer.cpp
@@ -0,0 +1,385 @@
+// slang-json-lexer.cpp
+#include "slang-json-lexer.h"
+
+#include "slang-json-diagnostics.h"
+#include "../core/slang-char-util.h"
+
+/*
+https://www.json.org/json-en.html
+*/
+
+namespace Slang {
+
+/// Attaches the lexer to `sourceView`, with diagnostics going to `sink`, and lexes the
+/// first token so that peekToken()/peekType() can be used immediately.
+///
+/// @param sourceView  View whose source file content is lexed. The file must have content.
+/// @param sink        Receives diagnostics for lexical errors.
+/// @return SLANG_OK (no failure path exists in this function).
+SlangResult JSONLexer::init(SourceView* sourceView, DiagnosticSink* sink)
+{
+    m_sink = sink;
+    m_sourceView = sourceView;
+
+    // The lexer stops on a 0 character, so the content must be null terminated
+    SourceFile* const file = sourceView->getSourceFile();
+    SLANG_ASSERT(file && file->hasContent());
+
+    // Locations are produced as m_startLoc + (offset from m_contentStart)
+    m_startLoc = sourceView->getRange().begin;
+    m_contentStart = file->getContent().begin();
+
+    // Start lexing from the first character of the content
+    m_cursor = m_contentStart;
+    m_lexemeStart = m_contentStart;
+
+    // Prime the first token
+    advance();
+
+    return SLANG_OK;
+}
+
+/// Given an end of line character `c` that has just been consumed, and `cursor` pointing
+/// at the character after it, returns the cursor advanced past the second character of a
+/// two character line ending ("\r\n" or "\n\r"). Otherwise returns `cursor` unchanged.
+SLANG_FORCE_INLINE static const char* _handleEndOfLine(char c, const char* cursor)
+{
+    SLANG_ASSERT(c == '\n' || c == '\r');
+
+    // Flipping between '\n' and '\r' gives the character that would complete the pair
+    const char partner = char(c ^ '\n' ^ '\r');
+
+    if (*cursor == partner)
+    {
+        return cursor + 1;
+    }
+    return cursor;
+}
+
+/// Makes the current token a zero length Invalid token located at the start of the
+/// current lexeme, and returns JSONTokenType::Invalid.
+JSONTokenType JSONLexer::_setInvalidToken()
+{
+    // Using the lexeme start as the token end also leaves m_cursor on the offending text
+    const char* const lexemeStart = m_lexemeStart;
+    return _setToken(JSONTokenType::Invalid, lexemeStart);
+}
+
+/// Lexes the next token from the content, records it as the current token (see
+/// peekToken()/peekType()) and returns its type.
+///
+/// Whitespace, `//` line comments and `/* */` block comments are skipped. On a lexical
+/// error a diagnostic is sent to the sink and an Invalid token is produced; the cursor is
+/// then left at the start of the offending lexeme.
+JSONTokenType JSONLexer::advance()
+{
+    const char* cursor = m_cursor;
+
+    while (true)
+    {
+        // Every pass around the loop starts a new candidate lexeme
+        m_lexemeStart = cursor;
+
+        const char c = *cursor++;
+
+        switch (c)
+        {
+            // The terminating 0 is not consumed, so further calls keep returning EndOfFile
+            case 0:     return _setToken(JSONTokenType::EndOfFile, cursor - 1);
+            case '"':
+            {
+                cursor = _lexString(cursor);
+                if (cursor == nullptr)
+                {
+                    return _setInvalidToken();
+                }
+                return _setToken(JSONTokenType::StringLiteral, cursor);
+            }
+            case '/':
+            {
+                // We allow comments.
+                // NOTE: the local `cursor` must be examined here. m_cursor still holds the
+                // position this call started from, and is only updated by _setToken.
+                const char nextChar = *cursor;
+
+                if (nextChar == '/')
+                {
+                    // Line comment. Skip the second '/' and the rest of the line
+                    cursor = _lexLineComment(cursor + 1);
+                    break;
+                }
+                if (nextChar == '*')
+                {
+                    // Block comment. Skip the '*' so that "/*/" is not taken as a complete comment
+                    cursor = _lexBlockComment(cursor + 1);
+                    // Can fail...
+                    if (cursor == nullptr)
+                    {
+                        return _setInvalidToken();
+                    }
+                    break;
+                }
+
+                // A '/' that doesn't start a comment is not valid JSON
+                StringBuilder buf;
+                buf << c;
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::unexpectedCharacter, buf);
+                return _setInvalidToken();
+            }
+            case ' ':
+            case '\t':
+            case '\n':
+            case '\r':
+            {
+                cursor = _lexWhitespace(cursor);
+                break;
+            }
+            case ':':           return _setToken(JSONTokenType::Colon, cursor);
+            case ',':           return _setToken(JSONTokenType::Comma, cursor);
+            case '[':           return _setToken(JSONTokenType::LBracket, cursor);
+            case ']':           return _setToken(JSONTokenType::RBracket, cursor);
+            case '{':           return _setToken(JSONTokenType::LBrace, cursor);
+            case '}':           return _setToken(JSONTokenType::RBrace, cursor);
+
+            case '-':
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                // _lexNumber wants to see the first character of the number
+                LexResult res = _lexNumber(cursor - 1);
+                if (res.cursor == nullptr)
+                {
+                    return _setInvalidToken();
+                }
+                return _setToken(res.type, res.cursor);
+            }
+            case 't':
+            {
+                // The && short circuits, so we never read past the terminating 0
+                if (cursor[0] == 'r' && cursor[1] == 'u' && cursor[2] == 'e')
+                {
+                    return _setToken(JSONTokenType::True, cursor + 3);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+            case 'f':
+            {
+                if (cursor[0] == 'a' && cursor[1] == 'l' && cursor[2] == 's' && cursor[3] == 'e')
+                {
+                    return _setToken(JSONTokenType::False, cursor + 4);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+            case 'n':
+            {
+                if (cursor[0] == 'u' && cursor[1] == 'l' && cursor[2] == 'l')
+                {
+                    return _setToken(JSONTokenType::Null, cursor + 3);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+            default:
+            {
+                // Build the text describing the character for the diagnostic
+                StringBuilder buf;
+                const uint32_t value = uint8_t(c);
+                if (value <= ' ' || value >= 0x7f)
+                {
+                    // Not printable ASCII, so show the byte as hex
+                    static const char s_hex[] = "0123456789abcdef";
+
+                    char hexBuf[5] = "0x";
+
+                    hexBuf[2] = s_hex[((value >> 4) & 0xf)];
+                    hexBuf[3] = s_hex[(value & 0xf)];
+                    hexBuf[4] = 0;
+
+                    buf << hexBuf;
+                }
+                else
+                {
+                    buf << c;
+                }
+
+                // The message has a '$0' placeholder, so the text must be passed along
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::unexpectedCharacter, buf);
+                return _setInvalidToken();
+            }
+        }
+    }
+}
+
+/// Lexes a JSON number: optional '-', integer digits, optional fraction, optional exponent.
+///
+/// @param cursor  Points at the first character of the number (a '-' or a digit).
+/// @return The token type (IntegerLiteral, or FloatLiteral if there is a fraction or an
+///         exponent) and the cursor just past the number. On error a diagnostic is
+///         reported and the returned cursor is nullptr.
+JSONLexer::LexResult JSONLexer::_lexNumber(const char* cursor)
+{
+    JSONTokenType tokenType = JSONTokenType::IntegerLiteral;
+
+    if (*cursor == '-')
+    {
+        cursor++;
+    }
+
+    if (*cursor == '0')
+    {
+        // Can only be followed by . exponent, or nothing
+        cursor++;
+    }
+    else if (*cursor >= '1' && *cursor <= '9')
+    {
+        cursor++;
+        while (CharUtil::isDigit(*cursor))
+        {
+            cursor++;
+        }
+    }
+    else
+    {
+        // There must be at least one integer digit, so a '-' on its own is not a number
+        m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+        return LexResult{ JSONTokenType::Invalid, nullptr };
+    }
+
+    // There's a fraction
+    if (*cursor == '.')
+    {
+        tokenType = JSONTokenType::FloatLiteral;
+        // Skip the dot
+        cursor++;
+        // Must have at least one digit
+        if (!CharUtil::isDigit(*cursor))
+        {
+            m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+            return LexResult{ JSONTokenType::Invalid, nullptr };
+        }
+        // Skip the digit
+        cursor++;
+        // Skip any more digits
+        while (CharUtil::isDigit(*cursor)) cursor++;
+    }
+
+    // There's an exponent
+    if (*cursor == 'e' || *cursor == 'E')
+    {
+        tokenType = JSONTokenType::FloatLiteral;
+
+        // Skip the 'e'/'E'
+        cursor++;
+
+        // Skip +/- if has one
+        if (*cursor == '+' || *cursor == '-')
+        {
+            cursor++;
+        }
+
+        // Must have one digit
+        if (!CharUtil::isDigit(*cursor))
+        {
+            m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+            return LexResult{ JSONTokenType::Invalid, nullptr };
+        }
+
+        // Skip the digit
+        cursor++;
+        // Skip any more digits
+        while (CharUtil::isDigit(*cursor)) cursor++;
+    }
+
+    return LexResult{tokenType, cursor};
+}
+
+/// Lexes the remainder of a string literal.
+///
+/// @param cursor  Points just past the opening '"'.
+/// @return The cursor just past the closing '"'. If the content ends inside the string, or
+///         a \u escape is not followed by four hex digits, a diagnostic is reported and
+///         nullptr is returned.
+const char* JSONLexer::_lexString(const char* cursor)
+{
+    // We've skipped the first "
+    while (true)
+    {
+        const char c = *cursor++;
+
+        switch (c)
+        {
+            case 0:
+            {
+                // cursor has stepped past the terminator, so the location is cursor - 1
+                m_sink->diagnose(_getLoc(cursor - 1), JSONDiagnostics::endOfFileInLiteral);
+                return nullptr;
+            }
+            case '"':
+            {
+                return cursor;
+            }
+            case '\\':
+            {
+                const char nextC = *cursor;
+                switch (nextC)
+                {
+                    case '"':
+                    case '\\':
+                    case '/':
+                    case 'b':
+                    case 'f':
+                    case 'n':
+                    case 'r':
+                    case 't':
+                    {
+                        // Single character escape
+                        ++cursor;
+                        break;
+                    }
+                    case 'u':
+                    {
+                        // Skip the 'u'
+                        cursor++;
+                        for (Index i = 0; i < 4; ++i)
+                        {
+                            if (!CharUtil::isHexDigit(cursor[i]))
+                            {
+                                // Report at the character that is not a hex digit
+                                m_sink->diagnose(_getLoc(cursor + i), JSONDiagnostics::expectingAHexDigit);
+                                return nullptr;
+                            }
+                        }
+                        cursor += 4;
+                        break;
+                    }
+                    default:
+                    {
+                        // NOTE(review): any other character after the backslash is not consumed
+                        // here and is not diagnosed. It is handled as a regular character by
+                        // the next iteration (which also catches the terminating 0).
+                        break;
+                    }
+                }
+                break;
+            }
+            // Somewhat surprisingly it appears it's valid to have \r\n inside of quotes.
+            default: break;
+        }
+    }
+}
+
+/// Skips the remainder of a line comment.
+///
+/// @param cursor  Points inside the comment.
+/// @return The cursor at the start of the next line, or at the terminating 0 if the
+///         content ends before an end of line is found.
+const char* JSONLexer::_lexLineComment(const char* cursor)
+{
+    while (true)
+    {
+        const char ch = *cursor;
+
+        if (ch == 0)
+        {
+            // Leave the cursor on the terminator for the caller to find
+            return cursor;
+        }
+
+        ++cursor;
+
+        if (ch == '\n' || ch == '\r')
+        {
+            // Step over the second character of a two character line ending, if there is one
+            return _handleEndOfLine(ch, cursor);
+        }
+    }
+}
+
+/// Skips the remainder of a block comment.
+///
+/// @param cursor  Points inside the comment (somewhere after the opening '/').
+/// @return The cursor just past the closing "*/". If the content ends before the comment
+///         is closed, endOfFileInComment is reported and nullptr is returned.
+const char* JSONLexer::_lexBlockComment(const char* cursor)
+{
+    for (;;)
+    {
+        const char c = *cursor++;
+        switch (c)
+        {
+            case 0:
+            {
+                // cursor has already stepped past the terminator, so the location of the
+                // end of the content is cursor - 1
+                m_sink->diagnose(_getLoc(cursor - 1), JSONDiagnostics::endOfFileInComment);
+                return nullptr;
+            }
+            case '*':
+            {
+                if (*cursor == '/')
+                {
+                    // Consume the '/' which closes the comment
+                    return cursor + 1;
+                }
+                break;
+            }
+            default: break;
+        }
+    }
+}
+
+/// Skips a run of whitespace (space, tab, '\n' or '\r').
+///
+/// @param cursor  Where to start skipping from.
+/// @return The cursor at the first character that is not whitespace (which may be the
+///         terminating 0).
+const char* JSONLexer::_lexWhitespace(const char* cursor)
+{
+    // Might want to use CharUtil::isWhitespace...
+    for (;; ++cursor)
+    {
+        const char ch = *cursor;
+        const bool isWhitespace = (ch == ' ') || (ch == '\n') || (ch == '\r') || (ch == '\t');
+
+        if (!isWhitespace)
+        {
+            // Hit non white space
+            return cursor;
+        }
+    }
+}
+
+} // namespace Slang
diff --git a/source/compiler-core/slang-json-lexer.h b/source/compiler-core/slang-json-lexer.h
new file mode 100644
index 000000000..03f16d445
--- /dev/null
+++ b/source/compiler-core/slang-json-lexer.h
@@ -0,0 +1,89 @@
+// slang-json-lexer.h
+#ifndef SLANG_JSON_LEXER_H
+#define SLANG_JSON_LEXER_H
+
+#include "../core/slang-basic.h"
+
+#include "slang-source-loc.h"
+#include "slang-diagnostic-sink.h"
+
+namespace Slang {
+
+enum class JSONTokenType    ///< The kinds of token JSONLexer produces
+{
+    Invalid,            ///< Produced when lexing fails
+    IntegerLiteral,     ///< Number with no fraction or exponent
+    FloatLiteral,       ///< Number with a fraction and/or an exponent
+    StringLiteral,      ///< Double quoted string. The token spans both quotes
+    LBracket,           ///< '['
+    RBracket,           ///< ']'
+    LBrace,             ///< '{'
+    RBrace,             ///< '}'
+    Comma,              ///< ','
+    Colon,              ///< ':'
+    True,               ///< The word `true`
+    False,              ///< The word `false`
+    Null,               ///< The word `null`
+    EndOfFile,          ///< The terminating 0 of the content was reached
+    CountOf,            ///< Presumably the number of token types - never produced by the lexer code shown
+};
+
+struct JSONToken    ///< A single token as produced by JSONLexer
+{
+    JSONTokenType type;         ///< The token type
+    SourceLoc loc;              ///< Location of the first character of the token
+    uint32_t length;            ///< The length of the token in bytes (0 for Invalid tokens set via _setInvalidToken)
+};
+
+class JSONLexer    // Lexer producing JSONTokens from 0 terminated source. init() must be called before use
+{
+public:
+    JSONToken& peekToken() { return m_token; }    ///< The current token (set by the last advance())
+    JSONTokenType peekType() { return m_token.type; }    ///< The type of the current token
+
+    JSONTokenType advance();    ///< Lexes the next token, makes it current, and returns its type
+
+    SlangResult init(SourceView* sourceView, DiagnosticSink* sink);    ///< Sets the source and sink, and lexes the first token
+
+protected:
+    struct LexResult    // Result of _lexNumber
+    {
+        JSONTokenType type;    ///< Type of token lexed
+        const char* cursor;    ///< Cursor just past the lexeme, or nullptr on failure
+    };
+
+        /// Get the source location for `cursor`, which must point into the content
+    SLANG_FORCE_INLINE SourceLoc _getLoc(const char* cursor) const { return m_startLoc + (cursor - m_contentStart); }
+    const char* _lexLineComment(const char* cursor);    ///< Skips to the line after a line comment (or to the terminating 0)
+    const char* _lexBlockComment(const char* cursor);    ///< Skips past the closing */, or returns nullptr if the content ends first
+    const char* _lexWhitespace(const char* cursor);    ///< Skips a run of whitespace
+    const char* _lexString(const char* cursor);    ///< Skips past the closing quote, or returns nullptr on error
+    LexResult _lexNumber(const char* cursor);    ///< Lexes a number starting at cursor
+
+    SLANG_FORCE_INLINE JSONTokenType _setToken(JSONTokenType type, const char* cursor)    // Current token becomes [m_lexemeStart, cursor)
+    {
+        SLANG_ASSERT(cursor >= m_lexemeStart);
+        m_token.type = type;
+        m_token.loc = m_startLoc + (m_lexemeStart - m_contentStart);    // same calculation as _getLoc(m_lexemeStart)
+        m_token.length = uint32_t(cursor - m_lexemeStart);
+        m_cursor = cursor;    // the next advance() starts from the end of this token
+        return type;
+    }
+    JSONTokenType _setInvalidToken();    ///< Sets a zero length Invalid token at m_lexemeStart
+
+    JSONToken m_token;    ///< The current token
+
+    const char* m_cursor;    ///< Where the next advance() starts lexing from
+    const char* m_lexemeStart;    ///< Start of the lexeme currently being lexed
+
+    const char* m_contentStart;    ///< Start of the source file content
+
+    SourceLoc m_startLoc;    ///< Location that corresponds to m_contentStart
+
+    SourceView* m_sourceView;    ///< The view passed to init()
+    DiagnosticSink* m_sink;    ///< Where diagnostics are reported
+};
+
+} // namespace Slang
+
+#endif
diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp
index 5e4db269c..a91d88e05 100644
--- a/source/core/slang-string-escape-util.cpp
+++ b/source/core/slang-string-escape-util.cpp
@@ -87,6 +87,8 @@ SlangResult SpaceStringEscapeHandler::appendEscaped(const UnownedStringSlice& sl
     }
 }
 
+
+
 // !!!!!!!!!!!!!!!!!!!!!!!!!! CppStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
 class CppStringEscapeHandler : public StringEscapeHandler
@@ -445,10 +447,380 @@ SlangResult CppStringEscapeHandler::lexQuoted(const char* cursor, const char** o
     }
 }
 
+// !!!!!!!!!!!!!!!!!!!!!!!!!! JSONStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+class JSONStringEscapeHandler : public StringEscapeHandler    // StringEscapeHandler for JSON style strings
+{
+public:
+    typedef StringEscapeHandler Super;
+
+    virtual bool isQuotingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE { SLANG_UNUSED(slice); return true; }    // always true, whatever the slice holds
+    virtual bool isEscapingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE;    // true if any character of slice must be escaped
+    virtual SlangResult appendEscaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE;    // appends slice to out with escapes applied (no quotes added)
+    virtual SlangResult appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE;    // appends slice to out with escapes decoded
+    virtual SlangResult lexQuoted(const char* cursor, const char** outCursor) SLANG_OVERRIDE;    // finds the end of a quoted string; cursor is past the opening quote
+
+    JSONStringEscapeHandler() : Super('"') {}    // hands the double quote character to the base class
+};
+
+/// Returns true if `slice` holds any character that appendEscaped would escape: a double
+/// quote, backslash or forward slash, or any character below ' ' or at/above 0x7e.
+bool JSONStringEscapeHandler::isEscapingNeeded(const UnownedStringSlice& slice)
+{
+    const char* const sliceEnd = slice.end();
+
+    for (const char* it = slice.begin(); it < sliceEnd; ++it)
+    {
+        const char ch = *it;
+
+        // Characters that have their own backslash escape
+        const bool hasNamedEscape = (ch == '\"') || (ch == '\\') || (ch == '/');
+        // Anything outside of the range that is output as is
+        const bool isOutsidePlainRange = (ch < ' ') || (ch >= 0x7e);
+
+        if (hasNamedEscape || isOutsidePlainRange)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+/// Finds the end of a quoted JSON string.
+///
+/// @param cursor     Points just past the opening '"'. The text must be 0 terminated.
+/// @param outCursor  On success set to just past the closing '"'.
+/// @return SLANG_OK on success. SLANG_FAIL if the terminating 0 is hit before the closing
+///         quote, or if a \u escape is not followed by four hex digits.
+SlangResult JSONStringEscapeHandler::lexQuoted(const char* cursor, const char** outCursor)
+{
+    // We've skipped the first "
+    for (;;)
+    {
+        const char ch = *cursor++;
+
+        if (ch == 0)
+        {
+            return SLANG_FAIL;
+        }
+        if (ch == '"')
+        {
+            *outCursor = cursor;
+            return SLANG_OK;
+        }
+        if (ch != '\\')
+        {
+            // Somewhat surprisingly it appears it's valid to have \r\n inside of quotes.
+            continue;
+        }
+
+        // It's an escape - look at the character after the backslash
+        const char escapeChar = *cursor;
+
+        if (escapeChar == 'u')
+        {
+            // Must be followed by 4 hex digits
+            const char* const hexDigits = cursor + 1;
+            for (Index i = 0; i < 4; ++i)
+            {
+                if (!CharUtil::isHexDigit(hexDigits[i]))
+                {
+                    return SLANG_FAIL;
+                }
+            }
+            cursor = hexDigits + 4;
+        }
+        else if (escapeChar == '"' || escapeChar == '\\' || escapeChar == '/' ||
+            escapeChar == 'b' || escapeChar == 'f' || escapeChar == 'n' ||
+            escapeChar == 'r' || escapeChar == 't')
+        {
+            // Single character escape
+            ++cursor;
+        }
+        // Any other character is left for the next iteration to consume
+    }
+}
+
+/// Returns the character that follows the backslash in the JSON escape for `c`,
+/// or 0 if `c` has no single character escape.
+static char _getJSONEscapedChar(char c)
+{
+    // Parallel tables: kRaw[i] is escaped as a backslash followed by kEscaped[i]
+    static const char kRaw[] =     { '\b', '\f', '\n', '\r', '\t', '\\', '/', '"' };
+    static const char kEscaped[] = { 'b',  'f',  'n',  'r',  't',  '\\', '/', '"' };
+
+    for (int i = 0; i < int(sizeof(kRaw)); ++i)
+    {
+        if (kRaw[i] == c)
+        {
+            return kEscaped[i];
+        }
+    }
+    return 0;
+}
+
+/// Given the character `c` that follows a backslash in a JSON single character escape,
+/// returns the character the escape stands for, or 0 if `c` is not such an escape.
+static char _getJSONUnescapedChar(char c)
+{
+    // Parallel tables: a backslash followed by kEscaped[i] decodes to kRaw[i]
+    static const char kEscaped[] = { 'b',  'f',  'n',  'r',  't',  '\\', '/', '"' };
+    static const char kRaw[] =     { '\b', '\f', '\n', '\r', '\t', '\\', '/', '"' };
+
+    for (int i = 0; i < int(sizeof(kEscaped)); ++i)
+    {
+        if (kEscaped[i] == c)
+        {
+            return kRaw[i];
+        }
+    }
+    return 0;
+}
+
+static const char s_hex[] = "0123456789abcdef";    // s_hex[n] is the lower case hex digit for the nibble value n (0-15)
+
+// Decodes the UTF8 encoded code point at the start of ioSlice.
+//
+// Outputs ioSlice with the chars remaining after utf8 encoded value. At least one byte is
+// always consumed from a non empty slice, so callers looping over a slice always progress.
+//
+// Returns ~uint32_t(0) if can't decode. That is the case if
+// * the slice is empty
+// * the first byte is a continuation byte (10xxxxxx), or a lead byte for more than 4 bytes
+// * the slice ends before the sequence is complete
+// * a byte within the sequence is not a continuation byte
+static uint32_t _getUnicodePointFromUTF8(UnownedStringSlice& ioSlice)
+{
+    const uint32_t kInvalid = ~uint32_t(0);
+
+    const Index length = ioSlice.getLength();
+    SLANG_ASSERT(length > 0);
+    if (length <= 0)
+    {
+        return kInvalid;
+    }
+
+    const char* cur = ioSlice.begin();
+    const char* const end = ioSlice.end();
+
+    // Go through uint8_t so that a negative char is not sign extended
+    const uint32_t leading = uint8_t(cur[0]);
+
+    // The number of leading 1 bits gives the number of bytes in the sequence
+    uint32_t mask = 0x80;
+    Index count = 0;
+    while (mask && (leading & mask))
+    {
+        count++;
+        mask >>= 1;
+    }
+
+    if (count == 0)
+    {
+        // Regular ASCII, is a single byte
+        ioSlice = UnownedStringSlice(cur + 1, end);
+        return leading;
+    }
+
+    if (count == 1 || count > 4)
+    {
+        // A continuation byte can't start a sequence, and UTF8 sequences are at most 4 bytes.
+        // Skip just this byte.
+        ioSlice = UnownedStringSlice(cur + 1, end);
+        return kInvalid;
+    }
+
+    if (count > length)
+    {
+        // The sequence is truncated
+        SLANG_ASSERT(!"Can't decode");
+        ioSlice = UnownedStringSlice(end, end);
+        return kInvalid;
+    }
+
+    // The bits below the terminating 0 bit of the lead byte are the top bits of the value
+    uint32_t codePoint = (leading & (mask - 1));
+    for (Index i = 1; i < count; i++)
+    {
+        const uint32_t trailing = uint8_t(cur[i]);
+        if ((trailing & 0xC0) != 0x80)
+        {
+            // Not a continuation byte. Resume from it, as it might start something valid
+            ioSlice = UnownedStringSlice(cur + i, end);
+            return kInvalid;
+        }
+        codePoint = (codePoint << 6) | (trailing & 0x3F);
+    }
+
+    ioSlice = UnownedStringSlice(cur + count, end);
+    return codePoint;
+}
+
+/// Appends the low 16 bits of `value` to `out` as a JSON unicode escape: a backslash, 'u'
+/// and four lower case hex digits.
+static void _appendHex16(uint32_t value, StringBuilder& out)
+{
+    char text[6];
+    text[0] = '\\';
+    text[1] = 'u';
+
+    // Most significant nibble first
+    for (int i = 0; i < 4; ++i)
+    {
+        const int shift = 12 - 4 * i;
+        text[2 + i] = s_hex[(value >> shift) & 0xf];
+    }
+
+    out.append(UnownedStringSlice(text, 6));
+}
+
+/// Appends `slice` to `out` with JSON escaping applied. Quotes are not added.
+///
+/// * Characters with a single character escape (see _getJSONEscapedChar) are output as such
+/// * Bytes with the top bit set are decoded as UTF8 and output as \uXXXX escapes. Code points
+///   above 0xffff are output as a UTF16 surrogate pair (two \uXXXX escapes), as JSON requires.
+///   Bytes that can't be decoded are output as the replacement character \ufffd
+/// * Other characters below ' ', or at/above 0x7e, are output as \uXXXX
+/// * Everything else is output as is
+///
+/// @return SLANG_OK always.
+SlangResult JSONStringEscapeHandler::appendEscaped(const UnownedStringSlice& slice, StringBuilder& out)
+{
+    // [start, cur) is the run of characters seen that can be output as is
+    const char* start = slice.begin();
+    const char* cur = start;
+    const char*const end = slice.end();
+
+    for (; cur < end; ++cur)
+    {
+        const char c = *cur;
+        
+        const char escapedChar = _getJSONEscapedChar(c);
+
+        if (escapedChar)
+        {
+            // Flush
+            if (start < cur)
+            {
+                out.append(start, cur);
+            }
+            out.appendChar('\\');
+            out.appendChar(escapedChar);
+
+            start = cur + 1;
+        }
+        else if (uint8_t(c) & 0x80)
+        {
+            // Flush
+            if (start < cur)
+            {
+                out.append(start, cur);
+            }
+
+            // UTF8
+            UnownedStringSlice remainingSlice(cur, end);
+            const uint32_t codePoint = _getUnicodePointFromUTF8(remainingSlice);
+
+            if (codePoint < 0x10000)
+            {
+                _appendHex16(codePoint, out);
+            }
+            else if (codePoint <= 0x10ffff)
+            {
+                // Doesn't fit in 16 bits, so has to be output as a high/low surrogate pair
+                const uint32_t offset = codePoint - 0x10000;
+                _appendHex16(0xd800 + (offset >> 10), out);
+                _appendHex16(0xdc00 + (offset & 0x3ff), out);
+            }
+            else
+            {
+                // Couldn't be decoded. Output the unicode replacement character
+                _appendHex16(0xfffd, out);
+            }
+
+            // Continue after the bytes that were decoded (making sure we always progress)
+            const char* next = remainingSlice.begin();
+            if (next <= cur)
+            {
+                next = cur + 1;
+            }
+
+            // -1 because the loop increment will add 1
+            cur = next - 1;
+            start = next;
+        }
+        else if (uint8_t(c) < ' ' || (c >= 0x7e))
+        {
+            // Flush
+            if (start < cur)
+            {
+                out.append(start, cur);
+            }
+
+            _appendHex16(uint32_t(c), out);
+
+            start = cur + 1;
+        }
+        else
+        {
+            // Can go out as it is
+        }
+    }
+
+    // Flush at the end
+    if (start < end)
+    {
+        out.append(start, end);
+    }
+    return SLANG_OK;
+}
+
+/// Reads exactly four hex digits (either case) starting at `text`.
+/// Returns true and sets `outValue` on success, false if any of the four is not a hex digit.
+static bool _parseJSONHex16(const char* text, uint32_t& outValue)
+{
+    uint32_t value = 0;
+    for (Index i = 0; i < 4; ++i)
+    {
+        const char digitC = text[i];
+
+        uint32_t digitValue;
+        if (digitC >= '0' && digitC <= '9')
+        {
+            digitValue = digitC - '0';
+        }
+        else if (digitC >= 'a' && digitC <= 'f')
+        {
+            digitValue = digitC - 'a' + 10;
+        }
+        else if (digitC >= 'A' && digitC <= 'F')
+        {
+            digitValue = digitC - 'A' + 10;
+        }
+        else
+        {
+            return false;
+        }
+        SLANG_ASSERT(digitValue < 0x10);
+        value = (value << 4) | digitValue;
+    }
+    outValue = value;
+    return true;
+}
+
+/// Appends `slice` to `out` with JSON escapes decoded. `slice` must not include the
+/// surrounding quotes.
+///
+/// Single character escapes are replaced by the character they stand for. \uXXXX escapes
+/// are output UTF8 encoded. A high surrogate escape immediately followed by a low surrogate
+/// escape is combined into a single code point before encoding.
+///
+/// @return SLANG_OK on success. SLANG_FAIL if the slice ends in the middle of an escape,
+///         the escape character is not known, or a \u is not followed by four hex digits.
+///         On failure `out` may already hold part of the output.
+SlangResult JSONStringEscapeHandler::appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out)
+{
+    // [start, cur) is the run of characters seen that can be output as is
+    const char* start = slice.begin();
+    const char* cur = start;
+    const char*const end = slice.end();
+
+    for (; cur < end; ++cur)
+    {
+        const char c = *cur;
+
+        if (c == '\\')
+        {
+            // Flush
+            if (start < cur)
+            {
+                out.append(start, cur);
+            }
+
+            /// Next 
+            cur++;
+
+            if (cur >= end)
+            {
+                return SLANG_FAIL;
+            }
+
+            // Need to handle various escape sequence cases
+            switch (*cur)
+            {
+                case '\"':
+                case '\\':
+                case '/':
+                case 'b':
+                case 'f':
+                case 'n':
+                case 'r':
+                case 't':
+                {
+                    const char unescapedChar = _getJSONUnescapedChar(*cur);
+                    if (unescapedChar == 0)
+                    {
+                        // Don't know how to unescape that char
+                        return SLANG_FAIL;
+                    }
+                    out.appendChar(unescapedChar);
+
+                    start = cur + 1;
+                    break;
+                }
+                case 'u':
+                {
+                    // Skip the 'u'
+                    cur++;
+
+                    if (cur + 4 > end)
+                    {
+                        return SLANG_FAIL;
+                    }
+
+                    uint32_t value = 0;
+                    if (!_parseJSONHex16(cur, value))
+                    {
+                        return SLANG_FAIL;
+                    }
+                    cur += 4;
+
+                    // Code points above 0xffff are held in JSON as 2 UTF16 surrogates. If we have a
+                    // high surrogate followed directly by a low surrogate escape, combine them to
+                    // make a single code point. 6 is the length of a \uXXXX escape.
+                    if (value >= 0xd800 && value <= 0xdbff &&
+                        (end - cur) >= 6 && cur[0] == '\\' && cur[1] == 'u')
+                    {
+                        uint32_t low = 0;
+                        if (_parseJSONHex16(cur + 2, low) && low >= 0xdc00 && low <= 0xdfff)
+                        {
+                            value = 0x10000 + ((value - 0xd800) << 10) + (low - 0xdc00);
+                            cur += 6;
+                        }
+                        // Otherwise the surrogate is left as is, and what follows is handled
+                        // as normal by later iterations
+                    }
+
+                    // Need to encode in UTF8 to concat
+
+                    char buf[8];
+                    int len = EncodeUnicodePointToUTF8(buf, value);
+
+                    out.append(buf, buf + len);
+
+                    // -1 because the loop increment will add 1
+                    start = cur;
+                    cur--;
+                    break;
+                }
+                default:
+                {
+                    // Can't decode
+                    return SLANG_FAIL;
+                }
+            }
+        }
+    }
+
+    // Flush
+    if (start < end)
+    {
+        out.append(start, end);
+    }
+
+    return SLANG_OK;
+}
+
 // !!!!!!!!!!!!!!!!!!!!!!!!!! StringEscapeUtil !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
 static CppStringEscapeHandler g_cppHandler;
 static SpaceStringEscapeHandler g_spaceHandler;
+static JSONStringEscapeHandler g_jsonHandler;
 
 StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
 {
@@ -456,6 +828,7 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
     {
         case Style::Cpp:    return &g_cppHandler;
         case Style::Space:  return &g_spaceHandler;
+        case Style::JSON:   return &g_jsonHandler;
         default:            return nullptr;
     }
 }
diff --git a/source/core/slang-string-escape-util.h b/source/core/slang-string-escape-util.h
index 9dc653df3..c3039eb47 100644
--- a/source/core/slang-string-escape-util.h
+++ b/source/core/slang-string-escape-util.h
@@ -51,6 +51,7 @@ struct StringEscapeUtil
     {
         Cpp,            ///< Cpp style quoting and escape handling
         Space,          ///< Applies quotes if there are spaces. Does not escape.
+        JSON,           ///< Json encoding
     };
 
         /// Given a style returns a handler
author	jsmall-nvidia <jsmall@nvidia.com>	2021-05-25 20:58:43 -0400
committer	GitHub <noreply@github.com>	2021-05-25 20:58:43 -0400
commit	7d1b8ac13faf80ed56b37243480d097059da5aab (patch)
tree	6613b13983083d16b8945c6d92b1f4f1d1fb2501 /source
parent	89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff)