diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2021-05-25 20:58:43 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-05-25 20:58:43 -0400 |
| commit | 7d1b8ac13faf80ed56b37243480d097059da5aab (patch) | |
| tree | 6613b13983083d16b8945c6d92b1f4f1d1fb2501 | |
| parent | 89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff) | |
JSON Lexing and string encoding/decoding (#1858)
* #include with an absolute path didn't work, because paths were always assumed to be relative.
* WIP Json lexer.
* Check JSON Lex with unit test
* Add JSON escaping/unescaping of strings.
* Bug fix for encoding/decoding.
* Fix typo in JSON diagnostics.
* Fix typo.
* Better float testing.
| -rw-r--r-- | build/visual-studio/compiler-core/compiler-core.vcxproj | 5 | ||||
| -rw-r--r-- | build/visual-studio/compiler-core/compiler-core.vcxproj.filters | 15 | ||||
| -rw-r--r-- | build/visual-studio/slang-test/slang-test.vcxproj | 1 | ||||
| -rw-r--r-- | build/visual-studio/slang-test/slang-test.vcxproj.filters | 3 | ||||
| -rw-r--r-- | source/compiler-core/slang-json-diagnostic-defs.h | 37 | ||||
| -rw-r--r-- | source/compiler-core/slang-json-diagnostics.cpp | 33 | ||||
| -rw-r--r-- | source/compiler-core/slang-json-diagnostics.h | 26 | ||||
| -rw-r--r-- | source/compiler-core/slang-json-lexer.cpp | 385 | ||||
| -rw-r--r-- | source/compiler-core/slang-json-lexer.h | 89 | ||||
| -rw-r--r-- | source/core/slang-string-escape-util.cpp | 373 | ||||
| -rw-r--r-- | source/core/slang-string-escape-util.h | 1 | ||||
| -rw-r--r-- | tools/slang-test/unit-test-json.cpp | 180 |
12 files changed, 1148 insertions, 0 deletions
diff --git a/build/visual-studio/compiler-core/compiler-core.vcxproj b/build/visual-studio/compiler-core/compiler-core.vcxproj index e6883900c..7d2ac6afa 100644 --- a/build/visual-studio/compiler-core/compiler-core.vcxproj +++ b/build/visual-studio/compiler-core/compiler-core.vcxproj @@ -179,6 +179,9 @@ <ClInclude Include="..\..\..\source\compiler-core\slang-gcc-compiler-util.h" /> <ClInclude Include="..\..\..\source\compiler-core\slang-glslang-compiler.h" /> <ClInclude Include="..\..\..\source\compiler-core\slang-include-system.h" /> + <ClInclude Include="..\..\..\source\compiler-core\slang-json-diagnostic-defs.h" /> + <ClInclude Include="..\..\..\source\compiler-core\slang-json-diagnostics.h" /> + <ClInclude Include="..\..\..\source\compiler-core\slang-json-lexer.h" /> <ClInclude Include="..\..\..\source\compiler-core\slang-lexer-diagnostic-defs.h" /> <ClInclude Include="..\..\..\source\compiler-core\slang-lexer.h" /> <ClInclude Include="..\..\..\source\compiler-core\slang-misc-diagnostic-defs.h" /> @@ -201,6 +204,8 @@ <ClCompile Include="..\..\..\source\compiler-core\slang-gcc-compiler-util.cpp" /> <ClCompile Include="..\..\..\source\compiler-core\slang-glslang-compiler.cpp" /> <ClCompile Include="..\..\..\source\compiler-core\slang-include-system.cpp" /> + <ClCompile Include="..\..\..\source\compiler-core\slang-json-diagnostics.cpp" /> + <ClCompile Include="..\..\..\source\compiler-core\slang-json-lexer.cpp" /> <ClCompile Include="..\..\..\source\compiler-core\slang-lexer.cpp" /> <ClCompile Include="..\..\..\source\compiler-core\slang-name-convention-util.cpp" /> <ClCompile Include="..\..\..\source\compiler-core\slang-name.cpp" /> diff --git a/build/visual-studio/compiler-core/compiler-core.vcxproj.filters b/build/visual-studio/compiler-core/compiler-core.vcxproj.filters index d35f9941c..243eecd5b 100644 --- a/build/visual-studio/compiler-core/compiler-core.vcxproj.filters +++ b/build/visual-studio/compiler-core/compiler-core.vcxproj.filters @@ -36,6 +36,15 @@ 
<ClInclude Include="..\..\..\source\compiler-core\slang-include-system.h"> <Filter>Header Files</Filter> </ClInclude> + <ClInclude Include="..\..\..\source\compiler-core\slang-json-diagnostic-defs.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\..\source\compiler-core\slang-json-diagnostics.h"> + <Filter>Header Files</Filter> + </ClInclude> + <ClInclude Include="..\..\..\source\compiler-core\slang-json-lexer.h"> + <Filter>Header Files</Filter> + </ClInclude> <ClInclude Include="..\..\..\source\compiler-core\slang-lexer-diagnostic-defs.h"> <Filter>Header Files</Filter> </ClInclude> @@ -98,6 +107,12 @@ <ClCompile Include="..\..\..\source\compiler-core\slang-include-system.cpp"> <Filter>Source Files</Filter> </ClCompile> + <ClCompile Include="..\..\..\source\compiler-core\slang-json-diagnostics.cpp"> + <Filter>Source Files</Filter> + </ClCompile> + <ClCompile Include="..\..\..\source\compiler-core\slang-json-lexer.cpp"> + <Filter>Source Files</Filter> + </ClCompile> <ClCompile Include="..\..\..\source\compiler-core\slang-lexer.cpp"> <Filter>Source Files</Filter> </ClCompile> diff --git a/build/visual-studio/slang-test/slang-test.vcxproj b/build/visual-studio/slang-test/slang-test.vcxproj index 24d2f2ac1..22a8aae5f 100644 --- a/build/visual-studio/slang-test/slang-test.vcxproj +++ b/build/visual-studio/slang-test/slang-test.vcxproj @@ -183,6 +183,7 @@ <ClCompile Include="..\..\..\tools\slang-test\unit-test-compression.cpp" /> <ClCompile Include="..\..\..\tools\slang-test\unit-test-find-type-by-name.cpp" /> <ClCompile Include="..\..\..\tools\slang-test\unit-test-free-list.cpp" /> + <ClCompile Include="..\..\..\tools\slang-test\unit-test-json.cpp" /> <ClCompile Include="..\..\..\tools\slang-test\unit-test-memory-arena.cpp" /> <ClCompile Include="..\..\..\tools\slang-test\unit-test-path.cpp" /> <ClCompile Include="..\..\..\tools\slang-test\unit-test-riff.cpp" /> diff --git a/build/visual-studio/slang-test/slang-test.vcxproj.filters 
b/build/visual-studio/slang-test/slang-test.vcxproj.filters index 9b7138eef..1e5b6e4af 100644 --- a/build/visual-studio/slang-test/slang-test.vcxproj.filters +++ b/build/visual-studio/slang-test/slang-test.vcxproj.filters @@ -68,6 +68,9 @@ <ClCompile Include="..\..\..\tools\slang-test\unit-test-free-list.cpp"> <Filter>Source Files</Filter> </ClCompile> + <ClCompile Include="..\..\..\tools\slang-test\unit-test-json.cpp"> + <Filter>Source Files</Filter> + </ClCompile> <ClCompile Include="..\..\..\tools\slang-test\unit-test-memory-arena.cpp"> <Filter>Source Files</Filter> </ClCompile> diff --git a/source/compiler-core/slang-json-diagnostic-defs.h b/source/compiler-core/slang-json-diagnostic-defs.h new file mode 100644 index 000000000..a4b260857 --- /dev/null +++ b/source/compiler-core/slang-json-diagnostic-defs.h @@ -0,0 +1,37 @@ +// + +// The file is meant to be included multiple times, to produce different +// pieces of declaration/definition code related to diagnostic messages +// +// Each diagnostic is declared here with: +// +// DIAGNOSTIC(id, severity, name, messageFormat) +// +// Where `id` is the unique diagnostic ID, `severity` is the default +// severity (from the `Severity` enum), `name` is a name used to refer +// to this diagnostic from code, and `messageFormat` is the default +// (non-localized) message for the diagnostic, with placeholders +// for any arguments. + +#ifndef DIAGNOSTIC +#error Need to #define DIAGNOSTIC(...) before including +#define DIAGNOSTIC(id, severity, name, messageFormat) /* */ +#endif + +// +// -1 - Notes that decorate another diagnostic. 
+// + +// +// 2xxxx - JSON Lexical analysis +// + +DIAGNOSTIC(20000, Error, unexpectedCharacter, "unexpected character '$0'") +DIAGNOSTIC(20001, Error, endOfFileInLiteral, "end of file in literal") +DIAGNOSTIC(20002, Error, newlineInLiteral, "newline in literal") +DIAGNOSTIC(20003, Error, endOfFileInComment, "end of file in comment") +DIAGNOSTIC(20004, Error, expectingAHexDigit, "expecting a hex digit") +DIAGNOSTIC(20005, Error, expectingADigit, "expecting a digit") +DIAGNOSTIC(20006, Error, expectingValueName, "expecting value name [null, true, false]") + +#undef DIAGNOSTIC diff --git a/source/compiler-core/slang-json-diagnostics.cpp b/source/compiler-core/slang-json-diagnostics.cpp new file mode 100644 index 000000000..1d35e8faf --- /dev/null +++ b/source/compiler-core/slang-json-diagnostics.cpp @@ -0,0 +1,33 @@ +// slang-json-diagnostics.cpp +#include "slang-json-diagnostics.h" + +namespace Slang { + +namespace JSONDiagnostics +{ +#define DIAGNOSTIC(id, severity, name, messageFormat) const DiagnosticInfo name = { id, Severity::severity, #name, messageFormat }; +#include "slang-json-diagnostic-defs.h" +#undef DIAGNOSTIC +} + +static const DiagnosticInfo* const kJSONDiagnostics[] = +{ +#define DIAGNOSTIC(id, severity, name, messageFormat) &JSONDiagnostics::name, +#include "slang-json-diagnostic-defs.h" +#undef DIAGNOSTIC +}; + +static DiagnosticsLookup* _newJSONDiagnosticsLookup() +{ + auto lookup = new DiagnosticsLookup; + lookup->add(kJSONDiagnostics, SLANG_COUNT_OF(kJSONDiagnostics)); + return lookup; +} + +DiagnosticsLookup* getJSONDiagnosticsLookup() +{ + static RefPtr<DiagnosticsLookup> s_lookup = _newJSONDiagnosticsLookup(); + return s_lookup; +} + +} // namespace Slang diff --git a/source/compiler-core/slang-json-diagnostics.h b/source/compiler-core/slang-json-diagnostics.h new file mode 100644 index 000000000..88ec0c550 --- /dev/null +++ b/source/compiler-core/slang-json-diagnostics.h @@ -0,0 +1,26 @@ +#ifndef SLANG_JSON_DIAGNOSTICS_H +#define 
SLANG_JSON_DIAGNOSTICS_H + +#include "../core/slang-basic.h" +#include "../core/slang-writer.h" + +#include "slang-source-loc.h" +#include "slang-diagnostic-sink.h" +#include "slang-token.h" + +#include "../../slang.h" + +namespace Slang +{ + +DiagnosticsLookup* getJSONDiagnosticsLookup(); + +namespace JSONDiagnostics +{ +#define DIAGNOSTIC(id, severity, name, messageFormat) extern const DiagnosticInfo name; +#include "slang-json-diagnostic-defs.h" +} + +} + +#endif diff --git a/source/compiler-core/slang-json-lexer.cpp b/source/compiler-core/slang-json-lexer.cpp new file mode 100644 index 000000000..19a5b29a7 --- /dev/null +++ b/source/compiler-core/slang-json-lexer.cpp @@ -0,0 +1,385 @@ +// slang-json-lexer.cpp +#include "slang-json-lexer.h" + +#include "slang-json-diagnostics.h" +#include "../core/slang-char-util.h" + +/* +https://www.json.org/json-en.html +*/ + +namespace Slang { + +SlangResult JSONLexer::init(SourceView* sourceView, DiagnosticSink* sink) +{ + m_sourceView = sourceView; + m_sink = sink; + + SourceFile* sourceFile = sourceView->getSourceFile(); + + // Note that the content must be null terminated (because of other requirements) + SLANG_ASSERT(sourceFile && sourceFile->hasContent()); + + m_contentStart = sourceFile->getContent().begin(); + + m_startLoc = sourceView->getRange().begin; + + m_lexemeStart = m_contentStart; + m_cursor = m_lexemeStart; + + // We need to prime the first token + advance(); + + return SLANG_OK; +} + +SLANG_FORCE_INLINE static const char* _handleEndOfLine(char c, const char* cursor) +{ + SLANG_ASSERT(c == '\n' || c == '\r'); + const char d = *cursor; + return cursor + Index((c ^ d) == ('\n' ^ '\r')); +} + +JSONTokenType JSONLexer::_setInvalidToken() +{ + return _setToken(JSONTokenType::Invalid, m_lexemeStart); +} + +JSONTokenType JSONLexer::advance() +{ + const char* cursor = m_cursor; + + while (true) + { + m_lexemeStart = cursor; + + const char c = *cursor++; + + switch (c) + { + case 0: return 
_setToken(JSONTokenType::EndOfFile, cursor - 1); + case '"': + { + cursor = _lexString(cursor); + if (cursor == nullptr) + { + return _setInvalidToken(); + } + return _setToken(JSONTokenType::StringLiteral, cursor); + } + case '/': + { + // We allow comments + const char nextChar = *m_cursor; + + if (nextChar == '/') + { + // Line comment + cursor = _lexLineComment(cursor); + break; + } + else if (nextChar == '*') + { + cursor = _lexBlockComment(cursor); + // Can fail... + if (cursor == nullptr) + { + return _setInvalidToken(); + } + break; + } + } + case ' ': + case '\t': + case '\n': + case '\r': + { + cursor = _lexWhitespace(cursor); + break; + } + case ':': return _setToken(JSONTokenType::Colon, cursor); + case ',': return _setToken(JSONTokenType::Comma, cursor); + case '[': return _setToken(JSONTokenType::LBracket, cursor); + case ']': return _setToken(JSONTokenType::RBracket, cursor); + case '{': return _setToken(JSONTokenType::LBrace, cursor); + case '}': return _setToken(JSONTokenType::RBrace, cursor); + + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + LexResult res = _lexNumber(cursor - 1); + if (res.cursor == nullptr) + { + return _setToken(JSONTokenType::Invalid, m_lexemeStart); + } + return _setToken(res.type, res.cursor); + } + case 't': + { + if (cursor[0] == 'r' && cursor[1] == 'u' && cursor[2] == 'e') + { + return _setToken(JSONTokenType::True, cursor + 3); + } + m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName); + return _setInvalidToken(); + } + case 'f': + { + if (cursor[0] == 'a' && cursor[1] == 'l' && cursor[2] == 's' && cursor[3] == 'e') + { + return _setToken(JSONTokenType::False, cursor + 4); + } + m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName); + return _setInvalidToken(); + } + case 'n': + { + if (cursor[0] == 'u' && cursor[1] == 'l' && cursor[2] == 'l') + { + return 
_setToken(JSONTokenType::Null, cursor + 3); + } + m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName); + return _setInvalidToken(); + } + default: + { + StringBuilder buf; + if (c <= ' ' || c >= 0x7e) + { + static const char s_hex[] = "012345679abcdef"; + + char hexBuf[5] = "0x"; + + uint32_t value = c; + hexBuf[2] = s_hex[((value >> 4) & 0xf)]; + hexBuf[3] = s_hex[(value & 0xf)]; + hexBuf[4] = 0; + + buf << hexBuf; + } + else + { + buf << c; + } + + m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::unexpectedCharacter); + return _setInvalidToken(); + } + } + } +} + +JSONLexer::LexResult JSONLexer::_lexNumber(const char* cursor) +{ + JSONTokenType tokenType = JSONTokenType::IntegerLiteral; + + if (*cursor == '-') + { + cursor++; + } + + if (*cursor == '0') + { + // Can only be followed by . exponent, or nothing + cursor++; + } + else if (*cursor >= '1' && *cursor <= '9') + { + cursor++; + while (CharUtil::isDigit(*cursor)) + { + cursor++; + } + } + + // Theres a fraction + if (*cursor == '.') + { + tokenType = JSONTokenType::FloatLiteral; + // Skip the dot + cursor++; + // Must have at least one digit + if (!CharUtil::isDigit(*cursor)) + { + m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit); + return LexResult{ JSONTokenType::Invalid, nullptr }; + } + // Skip the digit + cursor++; + // Skip any more digits + while (CharUtil::isDigit(*cursor)) cursor++; + } + + // Theres an exponent + if (*cursor == 'e' || *cursor == 'E') + { + tokenType = JSONTokenType::FloatLiteral; + + // Has an exponent + cursor++; + + // Skip +/- if has one + if (*cursor == '+' || *cursor == '-') + { + cursor++; + } + + // Must have one digit + if (!CharUtil::isDigit(*cursor)) + { + m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit); + return LexResult{ JSONTokenType::Invalid, nullptr }; + } + + // Skip the digit + cursor++; + // Skip any more digits + while (CharUtil::isDigit(*cursor)) cursor++; + } + + return 
LexResult{tokenType, cursor}; +} + +const char* JSONLexer::_lexString(const char* cursor) +{ + // We've skipped the first " + while (true) + { + const char c = *cursor++; + + switch (c) + { + case 0: + { + m_sink->diagnose(_getLoc(cursor - 1), JSONDiagnostics::endOfFileInLiteral); + return nullptr; + } + case '"': + { + return cursor; + } + case '\\': + { + const char nextC = *cursor; + switch (nextC) + { + case '"': + case '\\': + case '/': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + { + ++cursor; + break; + } + case 'u': + { + cursor++; + for (Index i = 0; i < 4; ++i) + { + if (!CharUtil::isHexDigit(cursor[i])) + { + m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingAHexDigit); + return nullptr; + } + } + cursor += 4; + break; + } + } + + } + // Somewhat surprisingly it appears it's valid to have \r\n inside of quotes. + default: break; + } + } +} + +const char* JSONLexer::_lexLineComment(const char* cursor) +{ + for (;;) + { + const char c = *cursor++; + + switch (c) + { + case '\n': + case '\r': + { + // We need to skip to the next line + return _handleEndOfLine(c, cursor); + } + case 0: + { + return cursor - 1; + } + } + } +} + +const char* JSONLexer::_lexBlockComment(const char* cursor) +{ + for (;;) + { + const char c = *cursor++; + switch (c) + { + case 0: + { + m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::endOfFileInComment); + return nullptr; + } + case '*': + { + if (*cursor == '/') + { + return cursor + 1; + } + break; + } + default: break; + } + } +} + +const char* JSONLexer::_lexWhitespace(const char* cursor) +{ + while (true) + { + const char c = *cursor; + + // Might want to use CharUtil::isWhitespace... 
+ + switch (c) + { + case ' ': + case '\n': + case '\r': + case '\t': + { + cursor++; + break; + } + default: + { + // Hit non white space + return cursor; + } + } + + } +} + +} // namespace Slang diff --git a/source/compiler-core/slang-json-lexer.h b/source/compiler-core/slang-json-lexer.h new file mode 100644 index 000000000..03f16d445 --- /dev/null +++ b/source/compiler-core/slang-json-lexer.h @@ -0,0 +1,89 @@ +// slang-json-lexer.h +#ifndef SLANG_JSON_LEXER_H +#define SLANG_JSON_LEXER_H + +#include "../core/slang-basic.h" + +#include "slang-source-loc.h" +#include "slang-diagnostic-sink.h" + +namespace Slang { + +enum class JSONTokenType +{ + Invalid, + IntegerLiteral, + FloatLiteral, + StringLiteral, + LBracket, + RBracket, + LBrace, + RBrace, + Comma, + Colon, + True, + False, + Null, + EndOfFile, + CountOf, +}; + +struct JSONToken +{ + JSONTokenType type; ///< The token type + SourceLoc loc; ///< Location in the source file + uint32_t length; ///< The length of the token in bytes +}; + +class JSONLexer +{ +public: + JSONToken& peekToken() { return m_token; } + JSONTokenType peekType() { return m_token.type; } + + JSONTokenType advance(); + + SlangResult init(SourceView* sourceView, DiagnosticSink* sink); + +protected: + struct LexResult + { + JSONTokenType type; + const char* cursor; + }; + + /// Get the location of the cursor + SLANG_FORCE_INLINE SourceLoc _getLoc(const char* cursor) const { return m_startLoc + (cursor - m_contentStart); } + const char* _lexLineComment(const char* cursor); + const char* _lexBlockComment(const char* cursor); + const char* _lexWhitespace(const char* cursor); + const char* _lexString(const char* cursor); + LexResult _lexNumber(const char* cursor); + + SLANG_FORCE_INLINE JSONTokenType _setToken(JSONTokenType type, const char* cursor) + { + SLANG_ASSERT(cursor >= m_lexemeStart); + m_token.type = type; + m_token.loc = m_startLoc + (m_lexemeStart - m_contentStart); + m_token.length = uint32_t(cursor - m_lexemeStart); + m_cursor = 
cursor; + return type; + } + JSONTokenType _setInvalidToken(); + + JSONToken m_token; + + const char* m_cursor; + const char* m_lexemeStart; + + const char* m_contentStart; + + SourceLoc m_startLoc; + + SourceView* m_sourceView; + DiagnosticSink* m_sink; +}; + +} // namespace Slang + +#endif diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp index 5e4db269c..a91d88e05 100644 --- a/source/core/slang-string-escape-util.cpp +++ b/source/core/slang-string-escape-util.cpp @@ -87,6 +87,8 @@ SlangResult SpaceStringEscapeHandler::appendEscaped(const UnownedStringSlice& sl } } + + // !!!!!!!!!!!!!!!!!!!!!!!!!! CppStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! class CppStringEscapeHandler : public StringEscapeHandler @@ -445,10 +447,380 @@ SlangResult CppStringEscapeHandler::lexQuoted(const char* cursor, const char** o } } +// !!!!!!!!!!!!!!!!!!!!!!!!!! JSONStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +class JSONStringEscapeHandler : public StringEscapeHandler +{ +public: + typedef StringEscapeHandler Super; + + virtual bool isQuotingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE { SLANG_UNUSED(slice); return true; } + virtual bool isEscapingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE; + virtual SlangResult appendEscaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE; + virtual SlangResult appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE; + virtual SlangResult lexQuoted(const char* cursor, const char** outCursor) SLANG_OVERRIDE; + + JSONStringEscapeHandler() : Super('"') {} +}; + +bool JSONStringEscapeHandler::isEscapingNeeded(const UnownedStringSlice& slice) +{ + const char* cur = slice.begin(); + const char*const end = slice.end(); + + for (; cur < end; ++cur) + { + const char c = *cur; + + switch (c) + { + case '\"': + case '\\': + case '/': + { + return true; + } + default: + { + if (c < ' ' || c >= 0x7e) + { + return 
true; + } + break; + } + } + } + return false; +} + +SlangResult JSONStringEscapeHandler::lexQuoted(const char* cursor, const char** outCursor) +{ + // We've skipped the first " + while (true) + { + const char c = *cursor++; + + switch (c) + { + case 0: return SLANG_FAIL; + case '"': + { + *outCursor = cursor; + return SLANG_OK; + } + case '\\': + { + const char nextC = *cursor; + switch (nextC) + { + case '"': + case '\\': + case '/': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + { + ++cursor; + break; + } + case 'u': + { + cursor++; + for (Index i = 0; i < 4; ++i) + { + if (!CharUtil::isHexDigit(cursor[i])) + { + return SLANG_FAIL; + } + } + cursor += 4; + break; + } + } + } + // Somewhat surprisingly it appears it's valid to have \r\n inside of quotes. + default: break; + } + } +} + +static char _getJSONEscapedChar(char c) +{ + switch (c) + { + case '\b': return 'b'; + case '\f': return 'f'; + case '\n': return 'n'; + case '\r': return 'r'; + case '\t': return 't'; + case '\\': return '\\'; + case '/': return '/'; + case '"': return '"'; + default: return 0; + } +} + +static char _getJSONUnescapedChar(char c) +{ + switch (c) + { + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case '\\': return '\\'; + case '/': return '/'; + case '"': return '"'; + default: return 0; + } +} + +static const char s_hex[] = "0123456789abcdef"; + +// Outputs ioSlice with the chars remaining after utf8 encoded value +// Returns ~uint32_t(0) if can't decode +static uint32_t _getUnicodePointFromUTF8(UnownedStringSlice& ioSlice) +{ + const Index length = ioSlice.getLength(); + SLANG_ASSERT(length > 0); + const char* cur = ioSlice.begin(); + + uint32_t codePoint = 0; + unsigned int leading = cur[0]; + unsigned int mask = 0x80; + + Index count = 0; + while (leading & mask) + { + count++; + mask >>= 1; + } + + if (count > length) + { + SLANG_ASSERT(!"Can't decode"); + ioSlice = 
UnownedStringSlice(ioSlice.end(), ioSlice.end()); + return ~uint32_t(0); + } + + codePoint = (leading & (mask - 1)); + for (Index i = 1; i <= count - 1; i++) + { + codePoint <<= 6; + codePoint += (cur[i] & 0x3F); + } + + ioSlice = UnownedStringSlice(cur + count, ioSlice.end()); + return codePoint; +} + +static void _appendHex16(uint32_t value, StringBuilder& out) +{ + // Let's go with hex + char buf[] = "\\u0000"; + + buf[2] = s_hex[(value >> 12) & 0xf]; + buf[3] = s_hex[(value >> 8) & 0xf]; + buf[4] = s_hex[(value >> 4) & 0xf]; + buf[5] = s_hex[(value >> 0) & 0xf]; + + out.append(UnownedStringSlice(buf, 6)); +} + +SlangResult JSONStringEscapeHandler::appendEscaped(const UnownedStringSlice& slice, StringBuilder& out) +{ + const char* start = slice.begin(); + const char* cur = start; + const char*const end = slice.end(); + + for (; cur < end; ++cur) + { + const char c = *cur; + + const char escapedChar = _getJSONEscapedChar(c); + + if (escapedChar) + { + // Flush + if (start < cur) + { + out.append(start, cur); + } + out.appendChar('\\'); + out.appendChar(escapedChar); + + start = cur + 1; + } + else if (uint8_t(c) & 0x80) + { + // Flush + if (start < cur) + { + out.append(start, cur); + } + + // UTF8 + UnownedStringSlice remainingSlice(cur, end); + uint32_t codePoint = _getUnicodePointFromUTF8(remainingSlice); + + // We only support up to 16 bit unicode values for now... 
+ SLANG_ASSERT(codePoint < 0x10000); + + _appendHex16(codePoint, out); + + cur = remainingSlice.begin() - 1; + start = cur + 1; + } + else if (uint8_t(c) < ' ' || (c >= 0x7e)) + { + if (start < cur) + { + out.append(start, cur); + } + + _appendHex16(uint32_t(c), out); + + start = cur + 1; + } + else + { + // Can go out as it is + } + } + + // Flush at the end + if (start < end) + { + out.append(start, end); + } + return SLANG_OK; +} + +SlangResult JSONStringEscapeHandler::appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out) +{ + const char* start = slice.begin(); + const char* cur = start; + const char*const end = slice.end(); + + for (; cur < end; ++cur) + { + const char c = *cur; + + if (c == '\\') + { + // Flush + if (start < cur) + { + out.append(start, cur); + } + + /// Next + cur++; + + if (cur >= end) + { + return SLANG_FAIL; + } + + // Need to handle various escape sequence cases + switch (*cur) + { + case '\"': + case '\\': + case '/': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + { + const char unescapedChar = _getJSONUnescapedChar(*cur); + if (unescapedChar == 0) + { + // Don't know how to unescape that char + return SLANG_FAIL; + } + out.appendChar(unescapedChar); + + start = cur + 1; + break; + } + case 'u': + { + uint32_t value = 0; + cur++; + + if (cur + 4 > end) + { + return SLANG_FAIL; + } + + for (Index i = 0; i < 4; ++i) + { + const char digitC = cur[i]; + + uint32_t digitValue; + if (digitC >= '0' && digitC <= '9') + { + digitValue = digitC - '0'; + } + else if (digitC >= 'a' && digitC <= 'f') + { + digitValue = digitC -'a' + 10; + } + else if(digitC >= 'A' && digitC <= 'F') + { + digitValue = digitC - 'A' + 10; + } + else + { + return SLANG_FAIL; + } + SLANG_ASSERT(digitValue < 0x10); + value = (value << 4) | digitValue; + } + cur += 4; + + // NOTE! Strictly speaking we may want to combine 2 UTF16 surrogates to make a single + // UTF8 encoded char. 
+ + // Need to encode in UTF8 to concat + + char buf[8]; + int len = EncodeUnicodePointToUTF8(buf, value); + + out.append(buf, buf + len); + + start = cur; + cur--; + break; + } + default: + { + // Can't decode + return SLANG_FAIL; + } + } + } + } + + // Flush + if (start < end) + { + out.append(start, end); + } + + return SLANG_OK; +} + // !!!!!!!!!!!!!!!!!!!!!!!!!! StringEscapeUtil !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! static CppStringEscapeHandler g_cppHandler; static SpaceStringEscapeHandler g_spaceHandler; +static JSONStringEscapeHandler g_jsonHandler; StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style) { @@ -456,6 +828,7 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style) { case Style::Cpp: return &g_cppHandler; case Style::Space: return &g_spaceHandler; + case Style::JSON: return &g_jsonHandler; default: return nullptr; } } diff --git a/source/core/slang-string-escape-util.h b/source/core/slang-string-escape-util.h index 9dc653df3..c3039eb47 100644 --- a/source/core/slang-string-escape-util.h +++ b/source/core/slang-string-escape-util.h @@ -51,6 +51,7 @@ struct StringEscapeUtil { Cpp, ///< Cpp style quoting and escape handling Space, ///< Applies quotes if there are spaces. Does not escape. 
+ JSON, ///< Json encoding }; /// Given a style returns a handler diff --git a/tools/slang-test/unit-test-json.cpp b/tools/slang-test/unit-test-json.cpp new file mode 100644 index 000000000..fff16b136 --- /dev/null +++ b/tools/slang-test/unit-test-json.cpp @@ -0,0 +1,180 @@ + +#include "../../source/compiler-core/slang-json-lexer.h" +#include "../../source/core/slang-string-escape-util.h" + +#include "test-context.h" + +using namespace Slang; + +namespace { // anonymous + +struct Element +{ + JSONTokenType type; + const char* value; +}; + +} // anonymous + +static SlangResult _lex(const char* in, DiagnosticSink* sink, List<JSONToken>& toks) +{ + SourceManager* sourceManager = sink->getSourceManager(); + + String contents(in); + SourceFile* sourceFile = sourceManager->createSourceFileWithString(PathInfo::makeUnknown(), contents); + SourceView* sourceView = sourceManager->createSourceView(sourceFile, nullptr, SourceLoc()); + + JSONLexer lexer; + + lexer.init(sourceView, sink); + + while (lexer.peekType() != JSONTokenType::EndOfFile) + { + if (lexer.peekType() == JSONTokenType::Invalid) + { + toks.add(lexer.peekToken()); + return SLANG_FAIL; + } + + toks.add(lexer.peekToken()); + lexer.advance(); + } + + toks.add(lexer.peekToken()); + + // If we advance from end of file we should still be at EndOfFile + SLANG_ASSERT(lexer.advance() == JSONTokenType::EndOfFile); + + return SLANG_OK; +} + +static bool _areEqual(SourceManager* sourceManager, const List<JSONToken>& toks, const Element* eles, Index elesCount) +{ + if (toks.getCount() != elesCount) + { + return false; + } + + SourceView* sourceView = toks.getCount() ? sourceManager->findSourceView(toks[0].loc) : nullptr; + const char*const content = sourceView ? 
sourceView->getContent().begin() : nullptr; + + for (Index i = 0; i < toks.getCount(); ++i) + { + const JSONToken& tok = toks[i]; + const auto& ele = eles[i]; + + if (tok.type != ele.type) + { + return false; + } + + SLANG_ASSERT(sourceView->getRange().contains(tok.loc)); + + const char* start = content + sourceView->getRange().getOffset(tok.loc); + + UnownedStringSlice lexeme(start, tok.length); + + if (lexeme != ele.value) + { + return false; + } + } + + return true; +} + +static void jsonUnitTest() +{ + SourceManager sourceManager; + sourceManager.initialize(nullptr, nullptr); + DiagnosticSink sink(&sourceManager, nullptr); + + { + const char text[] = " { \"Hello\" : [ \"World\", 1, 2.0, -3.0, -435.5345435, 45e-10, 421.00e+20, 17e1] }"; + + const Element eles[] = + { + {JSONTokenType::LBrace, "{" }, + {JSONTokenType::StringLiteral, "\"Hello\""}, + {JSONTokenType::Colon, ":" }, + {JSONTokenType::LBracket, "[" }, + {JSONTokenType::StringLiteral, "\"World\"" }, + {JSONTokenType::Comma, "," }, + {JSONTokenType::IntegerLiteral, "1" }, + {JSONTokenType::Comma, "," }, + {JSONTokenType::FloatLiteral, "2.0" }, + {JSONTokenType::Comma, "," }, + {JSONTokenType::FloatLiteral, "-3.0" }, + {JSONTokenType::Comma, "," }, + {JSONTokenType::FloatLiteral, "-435.5345435" }, + {JSONTokenType::Comma, "," }, + {JSONTokenType::FloatLiteral, "45e-10" }, + {JSONTokenType::Comma, "," }, + {JSONTokenType::FloatLiteral, "421.00e+20" }, + {JSONTokenType::Comma, "," }, + {JSONTokenType::FloatLiteral, "17e1" }, + {JSONTokenType::RBracket, "]" }, + {JSONTokenType::RBrace, "}" }, + {JSONTokenType::EndOfFile, "" }, + }; + + List<JSONToken> toks; + SLANG_CHECK(SLANG_SUCCEEDED(_lex(text, &sink, toks))); + + SLANG_CHECK(_areEqual(&sourceManager, toks, eles, SLANG_COUNT_OF(eles))); + } + + { + StringEscapeHandler* handler = StringEscapeUtil::getHandler(StringEscapeUtil::Style::JSON); + + + { + const auto slice = UnownedStringSlice::fromLiteral("\n\r\b\f\t \"\\/ Some text..."); + + 
SLANG_CHECK(handler->isEscapingNeeded(slice)); + SLANG_CHECK(!handler->isEscapingNeeded(UnownedStringSlice::fromLiteral("Hello!"))); + + StringBuilder escaped; + handler->appendEscaped(slice, escaped); + + StringBuilder unescaped; + handler->appendUnescaped(escaped.getUnownedSlice(), unescaped); + + SLANG_CHECK(unescaped == slice); + } + + { + uint32_t v = 0x7f; + + StringBuilder buf; + while (v < 0x10000) + { + char work[10] = "\\u"; + + for (Int i = 0; i < 4; ++i) + { + const uint32_t digitValue = (v >> ((3 - i) * 4)) & 0xf; + + char digitC = (digitValue > 9) ? char(digitValue - 10 + 'a') : char(digitValue + '0'); + work[i + 2] = digitC; + } + + buf << UnownedStringSlice(work, 6); + + v += v; + } + + // Decode it + StringBuilder unescaped; + handler->appendUnescaped(buf.getUnownedSlice(), unescaped); + + // Encode it + StringBuilder escaped; + handler->appendEscaped(unescaped.getUnownedSlice(), escaped); + + SLANG_CHECK(escaped == buf); + } + } +} + +SLANG_UNIT_TEST("JSON", jsonUnitTest); |
