diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2021-05-25 20:58:43 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-05-25 20:58:43 -0400 |
| commit | 7d1b8ac13faf80ed56b37243480d097059da5aab (patch) | |
| tree | 6613b13983083d16b8945c6d92b1f4f1d1fb2501 /source/core/slang-string-escape-util.cpp | |
| parent | 89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff) | |
JSON Lexing and string encoding/decoding (#1858)
* #include of an absolute path didn't work, because paths were always taken to be relative.
* WIP JSON lexer.
* Check JSON Lex with unit test
* Add JSON escaping/unescaping of strings.
* Bug fix for encoding/decoding.
* Fix typo in JSON diagnostics.
* Fix typo.
* Better float testing.
Diffstat (limited to 'source/core/slang-string-escape-util.cpp')
| -rw-r--r-- | source/core/slang-string-escape-util.cpp | 373 |
1 files changed, 373 insertions, 0 deletions
diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp index 5e4db269c..a91d88e05 100644 --- a/source/core/slang-string-escape-util.cpp +++ b/source/core/slang-string-escape-util.cpp @@ -87,6 +87,8 @@ SlangResult SpaceStringEscapeHandler::appendEscaped(const UnownedStringSlice& sl } } + + // !!!!!!!!!!!!!!!!!!!!!!!!!! CppStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! class CppStringEscapeHandler : public StringEscapeHandler @@ -445,10 +447,380 @@ SlangResult CppStringEscapeHandler::lexQuoted(const char* cursor, const char** o } } +// !!!!!!!!!!!!!!!!!!!!!!!!!! JSONStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +class JSONStringEscapeHandler : public StringEscapeHandler +{ +public: + typedef StringEscapeHandler Super; + + virtual bool isQuotingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE { SLANG_UNUSED(slice); return true; } + virtual bool isEscapingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE; + virtual SlangResult appendEscaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE; + virtual SlangResult appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE; + virtual SlangResult lexQuoted(const char* cursor, const char** outCursor) SLANG_OVERRIDE; + + JSONStringEscapeHandler() : Super('"') {} +}; + +bool JSONStringEscapeHandler::isEscapingNeeded(const UnownedStringSlice& slice) +{ + const char* cur = slice.begin(); + const char*const end = slice.end(); + + for (; cur < end; ++cur) + { + const char c = *cur; + + switch (c) + { + case '\"': + case '\\': + case '/': + { + return true; + } + default: + { + if (c < ' ' || c >= 0x7e) + { + return true; + } + break; + } + } + } + return false; +} + +SlangResult JSONStringEscapeHandler::lexQuoted(const char* cursor, const char** outCursor) +{ + // We've skipped the first " + while (true) + { + const char c = *cursor++; + + switch (c) + { + case 0: return SLANG_FAIL; + case '"': + { + 
*outCursor = cursor; + return SLANG_OK; + } + case '\\': + { + const char nextC = *cursor; + switch (nextC) + { + case '"': + case '\\': + case '/': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + { + ++cursor; + break; + } + case 'u': + { + cursor++; + for (Index i = 0; i < 4; ++i) + { + if (!CharUtil::isHexDigit(cursor[i])) + { + return SLANG_FAIL; + } + } + cursor += 4; + break; + } + } + } + // Somewhat surprisingly it appears it's valid to have \r\n inside of quotes. + default: break; + } + } +} + +static char _getJSONEscapedChar(char c) +{ + switch (c) + { + case '\b': return 'b'; + case '\f': return 'f'; + case '\n': return 'n'; + case '\r': return 'r'; + case '\t': return 't'; + case '\\': return '\\'; + case '/': return '/'; + case '"': return '"'; + default: return 0; + } +} + +static char _getJSONUnescapedChar(char c) +{ + switch (c) + { + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case '\\': return '\\'; + case '/': return '/'; + case '"': return '"'; + default: return 0; + } +} + +static const char s_hex[] = "0123456789abcdef"; + +// Outputs ioSlice with the chars remaining after utf8 encoded value +// Returns ~uint32_t(0) if can't decode +static uint32_t _getUnicodePointFromUTF8(UnownedStringSlice& ioSlice) +{ + const Index length = ioSlice.getLength(); + SLANG_ASSERT(length > 0); + const char* cur = ioSlice.begin(); + + uint32_t codePoint = 0; + unsigned int leading = cur[0]; + unsigned int mask = 0x80; + + Index count = 0; + while (leading & mask) + { + count++; + mask >>= 1; + } + + if (count > length) + { + SLANG_ASSERT(!"Can't decode"); + ioSlice = UnownedStringSlice(ioSlice.end(), ioSlice.end()); + return ~uint32_t(0); + } + + codePoint = (leading & (mask - 1)); + for (Index i = 1; i <= count - 1; i++) + { + codePoint <<= 6; + codePoint += (cur[i] & 0x3F); + } + + ioSlice = UnownedStringSlice(cur + count, ioSlice.end()); + return codePoint; +} + 
+static void _appendHex16(uint32_t value, StringBuilder& out) +{ + // Let's go with hex + char buf[] = "\\u0000"; + + buf[2] = s_hex[(value >> 12) & 0xf]; + buf[3] = s_hex[(value >> 8) & 0xf]; + buf[4] = s_hex[(value >> 4) & 0xf]; + buf[5] = s_hex[(value >> 0) & 0xf]; + + out.append(UnownedStringSlice(buf, 6)); +} + +SlangResult JSONStringEscapeHandler::appendEscaped(const UnownedStringSlice& slice, StringBuilder& out) +{ + const char* start = slice.begin(); + const char* cur = start; + const char*const end = slice.end(); + + for (; cur < end; ++cur) + { + const char c = *cur; + + const char escapedChar = _getJSONEscapedChar(c); + + if (escapedChar) + { + // Flush + if (start < cur) + { + out.append(start, cur); + } + out.appendChar('\\'); + out.appendChar(escapedChar); + + start = cur + 1; + } + else if (uint8_t(c) & 0x80) + { + // Flush + if (start < cur) + { + out.append(start, cur); + } + + // UTF8 + UnownedStringSlice remainingSlice(cur, end); + uint32_t codePoint = _getUnicodePointFromUTF8(remainingSlice); + + // We only support up to 16 bit unicode values for now... 
+ SLANG_ASSERT(codePoint < 0x10000); + + _appendHex16(codePoint, out); + + cur = remainingSlice.begin() - 1; + start = cur + 1; + } + else if (uint8_t(c) < ' ' || (c >= 0x7e)) + { + if (start < cur) + { + out.append(start, cur); + } + + _appendHex16(uint32_t(c), out); + + start = cur + 1; + } + else + { + // Can go out as it is + } + } + + // Flush at the end + if (start < end) + { + out.append(start, end); + } + return SLANG_OK; +} + +SlangResult JSONStringEscapeHandler::appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out) +{ + const char* start = slice.begin(); + const char* cur = start; + const char*const end = slice.end(); + + for (; cur < end; ++cur) + { + const char c = *cur; + + if (c == '\\') + { + // Flush + if (start < cur) + { + out.append(start, cur); + } + + /// Next + cur++; + + if (cur >= end) + { + return SLANG_FAIL; + } + + // Need to handle various escape sequence cases + switch (*cur) + { + case '\"': + case '\\': + case '/': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + { + const char unescapedChar = _getJSONUnescapedChar(*cur); + if (unescapedChar == 0) + { + // Don't know how to unescape that char + return SLANG_FAIL; + } + out.appendChar(unescapedChar); + + start = cur + 1; + break; + } + case 'u': + { + uint32_t value = 0; + cur++; + + if (cur + 4 > end) + { + return SLANG_FAIL; + } + + for (Index i = 0; i < 4; ++i) + { + const char digitC = cur[i]; + + uint32_t digitValue; + if (digitC >= '0' && digitC <= '9') + { + digitValue = digitC - '0'; + } + else if (digitC >= 'a' && digitC <= 'f') + { + digitValue = digitC -'a' + 10; + } + else if(digitC >= 'A' && digitC <= 'F') + { + digitValue = digitC - 'A' + 10; + } + else + { + return SLANG_FAIL; + } + SLANG_ASSERT(digitValue < 0x10); + value = (value << 4) | digitValue; + } + cur += 4; + + // NOTE! Strictly speaking we may want to combine 2 UTF16 surrogates to make a single + // UTF8 encoded char. 
+ + // Need to encode in UTF8 to concat + + char buf[8]; + int len = EncodeUnicodePointToUTF8(buf, value); + + out.append(buf, buf + len); + + start = cur; + cur--; + break; + } + default: + { + // Can't decode + return SLANG_FAIL; + } + } + } + } + + // Flush + if (start < end) + { + out.append(start, end); + } + + return SLANG_OK; +} + // !!!!!!!!!!!!!!!!!!!!!!!!!! StringEscapeUtil !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! static CppStringEscapeHandler g_cppHandler; static SpaceStringEscapeHandler g_spaceHandler; +static JSONStringEscapeHandler g_jsonHandler; StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style) { @@ -456,6 +828,7 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style) { case Style::Cpp: return &g_cppHandler; case Style::Space: return &g_spaceHandler; + case Style::JSON: return &g_jsonHandler; default: return nullptr; } } |
