summaryrefslogtreecommitdiffstats
path: root/source/core/slang-string-escape-util.cpp
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2021-05-25 20:58:43 -0400
committerGitHub <noreply@github.com>2021-05-25 20:58:43 -0400
commit7d1b8ac13faf80ed56b37243480d097059da5aab (patch)
tree6613b13983083d16b8945c6d92b1f4f1d1fb2501 /source/core/slang-string-escape-util.cpp
parent89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff)
JSON Lexing and string encoding/decoding (#1858)
* #include an absolute path didn't work - because paths were taken to always be relative. * WIP JSON lexer. * Check JSON Lex with unit test * Add JSON escaping/unescaping of strings. * Bug fix encoding/decoding. * Fix typo in JSON diagnostics. * Fix typo. * Better float testing.
Diffstat (limited to 'source/core/slang-string-escape-util.cpp')
-rw-r--r--source/core/slang-string-escape-util.cpp373
1 files changed, 373 insertions, 0 deletions
diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp
index 5e4db269c..a91d88e05 100644
--- a/source/core/slang-string-escape-util.cpp
+++ b/source/core/slang-string-escape-util.cpp
@@ -87,6 +87,8 @@ SlangResult SpaceStringEscapeHandler::appendEscaped(const UnownedStringSlice& sl
}
}
+
+
// !!!!!!!!!!!!!!!!!!!!!!!!!! CppStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
class CppStringEscapeHandler : public StringEscapeHandler
@@ -445,10 +447,380 @@ SlangResult CppStringEscapeHandler::lexQuoted(const char* cursor, const char** o
}
}
+// !!!!!!!!!!!!!!!!!!!!!!!!!! JSONStringEscapeHandler !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+/// StringEscapeHandler for JSON string literals. The base class is constructed with '"'.
+class JSONStringEscapeHandler : public StringEscapeHandler
+{
+public:
+    typedef StringEscapeHandler Super;
+
+    JSONStringEscapeHandler() : Super('"') {}
+
+    /// Always reports true, whatever the contents of `slice`.
+    virtual bool isQuotingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE
+    {
+        SLANG_UNUSED(slice);
+        return true;
+    }
+
+    /// True if `slice` holds at least one character that has to be written as an escape.
+    virtual bool isEscapingNeeded(const UnownedStringSlice& slice) SLANG_OVERRIDE;
+
+    /// Appends `slice` to `out` with JSON escapes applied (no surrounding quotes are added).
+    virtual SlangResult appendEscaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE;
+
+    /// Appends `slice` to `out` with JSON escapes decoded. Fails on a malformed escape.
+    virtual SlangResult appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out) SLANG_OVERRIDE;
+
+    /// Finds the end of a quoted string; `cursor` points just past the opening quote.
+    virtual SlangResult lexQuoted(const char* cursor, const char** outCursor) SLANG_OVERRIDE;
+};
+
+/// Returns true as soon as `slice` is found to contain a character that cannot be emitted
+/// verbatim: '"', '\\', '/', or any character for which `c < ' ' || c >= 0x7e` holds.
+/// Returns false for an empty slice.
+bool JSONStringEscapeHandler::isEscapingNeeded(const UnownedStringSlice& slice)
+{
+    const char* const last = slice.end();
+
+    for (const char* it = slice.begin(); it < last; ++it)
+    {
+        const char ch = *it;
+
+        // Characters that have a dedicated two-character escape.
+        const bool hasShortEscape = (ch == '\"') || (ch == '\\') || (ch == '/');
+
+        // The comparison is deliberately made on plain `char`: bytes with the top bit set
+        // satisfy the first test when `char` is signed and the second when it is unsigned.
+        const bool outsidePlainRange = (ch < ' ') || (ch >= 0x7e);
+
+        if (hasShortEscape || outsidePlainRange)
+        {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/// Scans the body of a JSON string literal to find where it ends.
+///
+/// `cursor` must point just past the opening '"' and the text must be NUL terminated.
+/// On success `*outCursor` is set to the character after the closing '"'.
+///
+/// Returns SLANG_FAIL (leaving `*outCursor` untouched) if the terminating NUL is reached
+/// before a closing quote, if `\u` is not followed by four hex digits, or if a backslash is
+/// followed by a character that is not one of the JSON escapes (`" \ / b f n r t u`).
+SlangResult JSONStringEscapeHandler::lexQuoted(const char* cursor, const char** outCursor)
+{
+    // We've skipped the first "
+    while (true)
+    {
+        const char c = *cursor++;
+
+        switch (c)
+        {
+            case 0:
+            {
+                // Hit the end of the text without seeing the closing quote.
+                return SLANG_FAIL;
+            }
+            case '"':
+            {
+                *outCursor = cursor;
+                return SLANG_OK;
+            }
+            case '\\':
+            {
+                const char nextC = *cursor;
+                switch (nextC)
+                {
+                    case '"':
+                    case '\\':
+                    case '/':
+                    case 'b':
+                    case 'f':
+                    case 'n':
+                    case 'r':
+                    case 't':
+                    {
+                        ++cursor;
+                        break;
+                    }
+                    case 'u':
+                    {
+                        ++cursor;
+                        // The loop bails out on the first non-hex character, so a NUL inside
+                        // the four digits stops the scan before anything past it is read.
+                        for (Index i = 0; i < 4; ++i)
+                        {
+                            if (!CharUtil::isHexDigit(cursor[i]))
+                            {
+                                return SLANG_FAIL;
+                            }
+                        }
+                        cursor += 4;
+                        break;
+                    }
+                    default:
+                    {
+                        // Not an escape JSON defines (this also covers a NUL directly after the
+                        // backslash). appendUnescaped rejects these sequences as well, so
+                        // report the problem here instead of accepting text that cannot be
+                        // decoded later.
+                        return SLANG_FAIL;
+                    }
+                }
+                break;
+            }
+            default:
+            {
+                // Any other character, including a raw \r or \n, is accepted as part of the
+                // string.
+                break;
+            }
+        }
+    }
+}
+
+/// Maps a raw character to the letter that follows the backslash in its short JSON escape
+/// (e.g. '\n' -> 'n'). Returns 0 when the character has no short escape.
+static char _getJSONEscapedChar(char c)
+{
+    // Parallel tables: kRaw[i] is written out as '\\' followed by kCode[i].
+    static const char kRaw[] = { '\b', '\f', '\n', '\r', '\t', '\\', '/', '"' };
+    static const char kCode[] = { 'b', 'f', 'n', 'r', 't', '\\', '/', '"' };
+
+    const int entryCount = int(sizeof(kRaw) / sizeof(kRaw[0]));
+    for (int i = 0; i < entryCount; ++i)
+    {
+        if (kRaw[i] == c)
+        {
+            return kCode[i];
+        }
+    }
+    return 0;
+}
+
+/// Inverse of the short-escape mapping: given the character that follows a backslash
+/// (e.g. 'n'), returns the character it stands for ('\n'). Returns 0 when `c` is not one of
+/// the short escape letters; note that 'u' is not handled here.
+static char _getJSONUnescapedChar(char c)
+{
+    // Parallel tables: a backslash followed by kCode[i] decodes to kRaw[i].
+    static const char kCode[] = { 'b', 'f', 'n', 'r', 't', '\\', '/', '"' };
+    static const char kRaw[] = { '\b', '\f', '\n', '\r', '\t', '\\', '/', '"' };
+
+    const int entryCount = int(sizeof(kCode) / sizeof(kCode[0]));
+    for (int i = 0; i < entryCount; ++i)
+    {
+        if (kCode[i] == c)
+        {
+            return kRaw[i];
+        }
+    }
+    return 0;
+}
+
+// Lower-case hexadecimal digits indexed by nibble value (0-15); used by _appendHex16.
+static const char s_hex[] = "0123456789abcdef";
+
+// Decodes the UTF8 sequence at the start of ioSlice and returns its code point.
+//
+// ioSlice must not be empty. On return ioSlice holds the chars remaining after the decoded
+// sequence; at least one char is always consumed, so callers looping on the slice always make
+// progress.
+//
+// Returns ~uint32_t(0) if it can't decode. Malformed input is reported through the return
+// value rather than by asserting, because it is a property of the data and not a programming
+// error. In that case ioSlice is advanced as follows:
+//  * lead byte that can't start a sequence (a stray continuation byte 0x80-0xBF, or
+//    0xF8-0xFF): that single byte is skipped
+//  * sequence cut short by the end of the slice: everything remaining is skipped
+//  * byte inside the sequence that is not a continuation byte: the bytes before it are
+//    skipped and ioSlice starts at the offending byte
+//
+// Overlong encodings and the resulting code point's range are not checked here.
+static uint32_t _getUnicodePointFromUTF8(UnownedStringSlice& ioSlice)
+{
+    const uint32_t invalidCodePoint = ~uint32_t(0);
+
+    const Index length = ioSlice.getLength();
+    SLANG_ASSERT(length > 0);
+    const char* cur = ioSlice.begin();
+    const char* const end = ioSlice.end();
+
+    // Go through unsigned char so that a lead byte >= 0x80 is not sign extended when plain
+    // char is signed.
+    const unsigned int leading = static_cast<unsigned char>(cur[0]);
+
+    if (leading < 0x80)
+    {
+        // Plain ASCII is a complete one byte sequence.
+        ioSlice = UnownedStringSlice(cur + 1, end);
+        return leading;
+    }
+
+    // The number of leading one bits in the lead byte is the length of the sequence.
+    unsigned int mask = 0x80;
+    Index count = 0;
+    while (mask != 0 && (leading & mask) != 0)
+    {
+        count++;
+        mask >>= 1;
+    }
+
+    if (count < 2 || count > 4)
+    {
+        // count == 1 is a continuation byte with no lead byte before it, count > 4 is not
+        // UTF8 at all.
+        ioSlice = UnownedStringSlice(cur + 1, end);
+        return invalidCodePoint;
+    }
+
+    if (count > length)
+    {
+        // Truncated sequence
+        ioSlice = UnownedStringSlice(end, end);
+        return invalidCodePoint;
+    }
+
+    // The payload bits of the lead byte are the ones below the first zero bit.
+    uint32_t codePoint = (leading & (mask - 1));
+    for (Index i = 1; i < count; i++)
+    {
+        const unsigned int continuation = static_cast<unsigned char>(cur[i]);
+        if ((continuation & 0xC0) != 0x80)
+        {
+            ioSlice = UnownedStringSlice(cur + i, end);
+            return invalidCodePoint;
+        }
+        codePoint = (codePoint << 6) | (continuation & 0x3F);
+    }
+
+    ioSlice = UnownedStringSlice(cur + count, end);
+    return codePoint;
+}
+
+/// Appends the six characters `\uXXXX` to `out`, where XXXX is the low 16 bits of `value`
+/// written as four lower-case hex digits. Any bits of `value` above bit 15 are ignored.
+static void _appendHex16(uint32_t value, StringBuilder& out)
+{
+    char text[7] = { '\\', 'u', 0, 0, 0, 0, 0 };
+
+    // Most significant nibble first.
+    for (int digitIndex = 0; digitIndex < 4; ++digitIndex)
+    {
+        const int shift = 12 - 4 * digitIndex;
+        text[2 + digitIndex] = s_hex[(value >> shift) & 0xf];
+    }
+
+    out.append(UnownedStringSlice(text, 6));
+}
+
+/// Appends `slice` to `out` with every character that needs it replaced by a JSON escape.
+/// No surrounding quotes are written.
+///
+///  * `"`, `\`, `/`, and the control characters \b \f \n \r \t use their two character escape
+///  * bytes with the top bit set are decoded as UTF8 and written as `\uXXXX`; code points
+///    above 0xFFFF are written as a UTF16 surrogate pair (`\uD8xx\uDCxx`), which is how JSON
+///    represents them
+///  * remaining control characters, and characters >= 0x7e, are written as `\u00XX`
+///  * everything else is copied through unchanged
+///
+/// Returns SLANG_FAIL if a UTF8 sequence can't be decoded or decodes to a value above
+/// 0x10FFFF. Output appended to `out` before the failing sequence is left in place.
+SlangResult JSONStringEscapeHandler::appendEscaped(const UnownedStringSlice& slice, StringBuilder& out)
+{
+    const char* const end = slice.end();
+    // [start, cur) is the run of characters seen so far that can be copied verbatim
+    const char* start = slice.begin();
+    const char* cur = start;
+
+    while (cur < end)
+    {
+        const char c = *cur;
+
+        const char escapedChar = _getJSONEscapedChar(c);
+        const bool isUTF8Lead = (uint8_t(c) & 0x80) != 0;
+        const bool needsHexEscape = uint8_t(c) < ' ' || (c >= 0x7e);
+
+        if (!escapedChar && !isUTF8Lead && !needsHexEscape)
+        {
+            // Can go out as it is
+            ++cur;
+            continue;
+        }
+
+        // Flush
+        if (start < cur)
+        {
+            out.append(start, cur);
+        }
+
+        if (escapedChar)
+        {
+            out.appendChar('\\');
+            out.appendChar(escapedChar);
+            ++cur;
+        }
+        else if (isUTF8Lead)
+        {
+            // UTF8
+            UnownedStringSlice remainingSlice(cur, end);
+            const uint32_t codePoint = _getUnicodePointFromUTF8(remainingSlice);
+
+            if (codePoint > 0x10FFFF)
+            {
+                // Covers the ~uint32_t(0) returned when the sequence can't be decoded, as
+                // well as values outside of the Unicode range.
+                return SLANG_FAIL;
+            }
+
+            if (codePoint < 0x10000)
+            {
+                _appendHex16(codePoint, out);
+            }
+            else
+            {
+                // \u only carries 16 bits, so split into a high/low surrogate pair.
+                const uint32_t offset = codePoint - 0x10000;
+                _appendHex16(0xD800 + (offset >> 10), out);
+                _appendHex16(0xDC00 + (offset & 0x3FF), out);
+            }
+
+            // Carry on after the bytes the decoder consumed
+            cur = remainingSlice.begin();
+        }
+        else
+        {
+            _appendHex16(uint32_t(c), out);
+            ++cur;
+        }
+
+        start = cur;
+    }
+
+    // Flush at the end
+    if (start < end)
+    {
+        out.append(start, end);
+    }
+    return SLANG_OK;
+}
+
+/// Reads the four hex digits of a `\uXXXX` escape starting at `cur` (upper or lower case).
+/// Returns false, leaving `outValue` untouched, if fewer than four characters remain before
+/// `end` or any of them is not a hex digit.
+static bool _parseJSONHex4(const char* cur, const char* end, uint32_t& outValue)
+{
+    // Compare lengths rather than forming `cur + 4`, which could point past the buffer.
+    if (end - cur < 4)
+    {
+        return false;
+    }
+
+    uint32_t value = 0;
+    for (Index i = 0; i < 4; ++i)
+    {
+        const char digitC = cur[i];
+
+        uint32_t digitValue;
+        if (digitC >= '0' && digitC <= '9')
+        {
+            digitValue = digitC - '0';
+        }
+        else if (digitC >= 'a' && digitC <= 'f')
+        {
+            digitValue = digitC - 'a' + 10;
+        }
+        else if (digitC >= 'A' && digitC <= 'F')
+        {
+            digitValue = digitC - 'A' + 10;
+        }
+        else
+        {
+            return false;
+        }
+        value = (value << 4) | digitValue;
+    }
+
+    outValue = value;
+    return true;
+}
+
+/// Appends `slice` to `out` with JSON escape sequences decoded. `slice` is the contents of
+/// a string literal without the surrounding quotes.
+///
+/// The two character escapes (`\"` `\\` `\/` `\b` `\f` `\n` `\r` `\t`) become the character
+/// they stand for, and `\uXXXX` is appended UTF8 encoded. A high surrogate escape immediately
+/// followed by a low surrogate escape (`\uD8xx\uDCxx`) is combined into the single code point
+/// the pair represents. A surrogate that is not part of such a pair is encoded on its own.
+///
+/// Returns SLANG_FAIL if the slice ends in a lone backslash, if a backslash is followed by
+/// a character that isn't a JSON escape, or if `\u` is not followed by four hex digits.
+/// Output appended to `out` before the failing escape is left in place.
+SlangResult JSONStringEscapeHandler::appendUnescaped(const UnownedStringSlice& slice, StringBuilder& out)
+{
+    const char* const end = slice.end();
+    // [start, cur) is the run of characters seen so far that can be copied verbatim
+    const char* start = slice.begin();
+    const char* cur = start;
+
+    while (cur < end)
+    {
+        if (*cur != '\\')
+        {
+            ++cur;
+            continue;
+        }
+
+        // Flush
+        if (start < cur)
+        {
+            out.append(start, cur);
+        }
+
+        // Skip the backslash
+        ++cur;
+        if (cur >= end)
+        {
+            return SLANG_FAIL;
+        }
+
+        const char escapeC = *cur++;
+
+        if (escapeC == 'u')
+        {
+            uint32_t value = 0;
+            if (!_parseJSONHex4(cur, end, value))
+            {
+                return SLANG_FAIL;
+            }
+            cur += 4;
+
+            // Combine a UTF16 surrogate pair into the code point it encodes. If what follows
+            // isn't a well formed low surrogate escape it is left for the next iteration to
+            // handle (or reject).
+            if (value >= 0xD800 && value <= 0xDBFF && end - cur >= 6 && cur[0] == '\\' && cur[1] == 'u')
+            {
+                uint32_t low = 0;
+                if (_parseJSONHex4(cur + 2, end, low) && low >= 0xDC00 && low <= 0xDFFF)
+                {
+                    value = 0x10000 + ((value - 0xD800) << 10) + (low - 0xDC00);
+                    cur += 6;
+                }
+            }
+
+            // Need to encode in UTF8 to concat
+            char buf[8];
+            int len;
+            if (value < 0x10000)
+            {
+                len = EncodeUnicodePointToUTF8(buf, value);
+            }
+            else
+            {
+                // Only reachable for a combined pair, so value is in [0x10000, 0x10FFFF] and
+                // takes the four byte form.
+                buf[0] = char(0xF0 | (value >> 18));
+                buf[1] = char(0x80 | ((value >> 12) & 0x3F));
+                buf[2] = char(0x80 | ((value >> 6) & 0x3F));
+                buf[3] = char(0x80 | (value & 0x3F));
+                len = 4;
+            }
+
+            out.append(buf, buf + len);
+        }
+        else
+        {
+            const char unescapedChar = _getJSONUnescapedChar(escapeC);
+            if (unescapedChar == 0)
+            {
+                // Don't know how to unescape that char
+                return SLANG_FAIL;
+            }
+            out.appendChar(unescapedChar);
+        }
+
+        start = cur;
+    }
+
+    // Flush
+    if (start < end)
+    {
+        out.append(start, end);
+    }
+
+    return SLANG_OK;
+}
+
// !!!!!!!!!!!!!!!!!!!!!!!!!! StringEscapeUtil !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+// One file-scope handler instance per escaping style; StringEscapeUtil::getHandler returns
+// the address of the instance matching the requested Style.
static CppStringEscapeHandler g_cppHandler;
static SpaceStringEscapeHandler g_spaceHandler;
+static JSONStringEscapeHandler g_jsonHandler;
StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
{
@@ -456,6 +828,7 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
{
case Style::Cpp: return &g_cppHandler;
case Style::Space: return &g_spaceHandler;
+ case Style::JSON: return &g_jsonHandler;
default: return nullptr;
}
}