summaryrefslogtreecommitdiffstats
path: root/source/core/slang-string-escape-util.cpp
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2022-05-27 17:28:05 -0400
committerGitHub <noreply@github.com>2022-05-27 17:28:05 -0400
commit2d3392f22c894957d17dd13486e0565c4ecea89c (patch)
treece4dadbd85a59e52725fa6f92613553cd5b29859 /source/core/slang-string-escape-util.cpp
parentabb89b3e460e11e8f9a134199c2d559190bfc47e (diff)
Added NativeStringType (#2252)
* #include an absolute path didn't work - because paths were taken to always be relative. * Use TerminatedUnownedStringSlice for literals in output C++. * Remove Escape/Unescape functions used in slang-token-reader.cpp Add target type of 'host-cpp' etc to map to the target types. * Fix some corner cases around string encoding. * Added unit test for string escaping. Fixed some assorted escaping bugs. * Updated test output. * Added decode test. * Stop using hex output, to get around 'greedy' aspect. Use octal instead.
Diffstat (limited to 'source/core/slang-string-escape-util.cpp')
-rw-r--r--source/core/slang-string-escape-util.cpp195
1 files changed, 146 insertions, 49 deletions
diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp
index 513908c4c..334c1aae5 100644
--- a/source/core/slang-string-escape-util.cpp
+++ b/source/core/slang-string-escape-util.cpp
@@ -115,32 +115,6 @@ public:
CppStringEscapeHandler() : Super('"') {}
};
-static char _getHexChar(int v)
-{
- return (v <= 9) ? char(v + '0') : char(v - 10 + 'A');
-}
-
-static int _getHexDigit(char c)
-{
- if (c >= '0' && c <= '9')
- {
- return c - '0';
- }
- else if (c >= 'a' && c <= 'f')
- {
- return c - 'a' + 10;
- }
- else if (c >= 'A' && c <= 'F')
- {
- return c - 'A' + 10;
- }
- else
- {
- SLANG_ASSERT(!"Not a hex digit");
- return 0;
- }
-}
-
static char _getCppEscapedChar(char c)
{
switch (c)
@@ -177,7 +151,6 @@ static char _getCppUnescapedChar(char c)
}
}
-
bool CppStringEscapeHandler::isUnescapingNeeeded(const UnownedStringSlice& slice)
{
return slice.indexOf('\\') >= 0;
@@ -220,6 +193,9 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
const char* cur = start;
const char*const end = slice.end();
+ // TODO(JS): A cleverer implementation might support U and u prefixing for unicode characters.
+ // For now we just stick with hex if it's not 'regular' ascii.
+
for (; cur < end; ++cur)
{
const char c = *cur;
@@ -232,6 +208,7 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
{
out.append(start, cur);
}
+
out.appendChar('\\');
out.appendChar(escapedChar);
@@ -245,17 +222,56 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
out.append(start, cur);
}
- char buf[5] = "\\0x0";
+ // NOTE! There is a possible flaw around checking 'next' character (used for outputting oct and hex)
+ // If a string is constructed appended in parts, the next character is not available so the problem below can still
+ // occur.
+
+ // Another solution to this problem would be to output "", but that makes some other assumptions
+ // For example Slang doesn't support that style.
+
+ // C++ greedily consumes hex/octal digits. This is a problem if we have bytes
+ // 0, '1' as by default this will output as
+ // "\x001" which is the single character byte 1.
+
+ // Note this claims \x is followed with up to 3 hex digits
+ // https://msdn.microsoft.com/en-us/library/69ze775t.aspx
+ // But the following claims otherwise
+ // https://en.cppreference.com/w/cpp/language/string_literal
+
+ // On testing in Visual Studio hex can indeed be more than 3 digits
+
+ // There is a problem outputting values in hex, because C++ allows *any* amount of hex digits.
+ // We could work around with \u \U but they are later extensions (C++11) and have other issue
+
+ // The solution taken here is to always output as octal, because octal can be at most 3 digits.
+
+ // Special case handling of 0
+ if (c == 0 && !(cur + 1 < end && CharUtil::isOctalDigit(cur[1])))
+ {
+ // We can just output as (octal) "\0"
+ out.append("\\0");
+ }
+ else
+ {
+ // A slightly more sophisticated implementation could output less digits if needed, if not followed by an octal
+ // digit, but for now we go simple and output all 3 digits
+
+ const uint32_t v = uint32_t(c);
- buf[3] = _getHexChar((int(c) >> 4) & 0xf);
- buf[4] = _getHexChar(c & 0xf);
+ char buf[4];
+ buf[0] = '\\';
+ buf[1] = ((v >> 6) & 3) + '0';
+ buf[2] = ((v >> 3) & 7) + '0';
+ buf[3] = ((v >> 0) & 7) + '0';
- out.append(buf, buf + 4);
+ out.append(buf, buf + 4);
+ }
start = cur + 1;
}
}
+ // Flush anything remaining
if (start < end)
{
out.append(start, end);
@@ -269,16 +285,16 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
const char* cur = start;
const char*const end = slice.end();
- for (; cur < end; ++cur)
+ while (cur < end)
{
const char c = *cur;
if (c == '\\')
{
// Flush
- if (start < end)
+ if (start < cur)
{
- out.append(start, end);
+ out.append(start, cur);
}
/// Next
@@ -286,11 +302,14 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
if (cur >= end)
{
+ // Missing character following '\'
return SLANG_FAIL;
}
+ const char nextC = *cur++;
+
// Need to handle various escape sequence cases
- switch (*cur)
+ switch (nextC)
{
case '\'':
case '\"':
@@ -304,7 +323,7 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
case 't':
case 'v':
{
- const char unescapedChar = _getCppUnescapedChar(*cur);
+ const char unescapedChar = _getCppUnescapedChar(nextC);
if (unescapedChar == 0)
{
// Don't know how to unescape that char
@@ -312,14 +331,18 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
}
out.appendChar(unescapedChar);
- start = cur + 1;
+ start = cur;
break;
}
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7':
{
+ // Rewind back a character, as first digit is the 'nextC'
+ --cur;
+
+ // Don't need to check for enough characters, because there must be 1 - the nextC
+
// octal escape: up to 3 characters
- ++cur;
int value = 0;
const char* octEnd = cur + 3;
@@ -327,33 +350,99 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
for (; cur < octEnd; ++cur)
{
- const char d = *cur;
- if (d >= '0' && d <= '7')
+ const int digitValue = CharUtil::getOctalDigitValue(*cur);
+ if (digitValue < 0)
{
- value = (value << 3) | (d - '0');
+ break;
}
+ value = (value << 3) | digitValue;
}
out.appendChar(char(value));
+ // Reset start
start = cur;
break;
}
case 'x':
{
+ /// In the C++ standard we consume hex digits until we hit a non hex digit
+ uint32_t value = 0;
+ for (; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+ {
+ const int digitValue = CharUtil::getHexDigitValue(*cur);
+ if (digitValue < 0)
+ {
+ return SLANG_FAIL;
+ }
+
+ value = (value << 4) | digitValue;
+ }
+
+ // If it's ascii, just output it
+ if (value < 0x80)
+ {
+ out.appendChar(char(value));
+ }
+ else
+ {
+ // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+ // but 6 are possible, so lets go large.
+ const Index maxUtf8EncodeCount = 6;
+
+ char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+ int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+ out.appendInPlace(chars, numChars);
+ }
+
+ // Reset start
+ start = cur;
+ break;
+ }
+ case 'u':
+ case 'U':
+ {
+ // u implies 4 hex digits
+ // U implies 6.
+
+ // Work out how many digits we need
+ const Count digitCount = (nextC == 'u') ? 4 : 6;
+
+ // Do we have enough?
+ if (end - cur < digitCount)
+ {
+ return SLANG_FAIL;
+ }
+
uint32_t value = 0;
- for (++cur; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+ for (Index i = 0; i < digitCount; ++i)
{
- value = value << 4 | _getHexDigit(*cur);
+ const int digitValue = CharUtil::getHexDigitValue(cur[i]);
+ if (digitValue < 0)
+ {
+ return SLANG_FAIL;
+ }
+ value = (value << 4) | digitValue;
}
+ cur += digitCount;
- // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
- // but 6 are possible, so lets go large.
- const Index maxUtf8EncodeCount = 6;
+ // Encode to Utf8
+ // If it's ascii, just output it
+ if (value < 0x80)
+ {
+ out.appendChar(char(value));
+ }
+ else
+ {
+ // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+ // but 6 are possible, so lets go large.
+ const Index maxUtf8EncodeCount = 6;
- char* chars = out.prepareForAppend(maxUtf8EncodeCount);
- int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
- out.appendInPlace(chars, numChars);
+ char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+ int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+ out.appendInPlace(chars, numChars);
+ }
+ // Reset start
start = cur;
break;
}
@@ -363,6 +452,11 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
}
}
}
+ else
+ {
+ // Next char
+ ++cur;
+ }
}
if (start < end)
@@ -850,6 +944,9 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
case Style::Cpp: return &g_cppHandler;
case Style::Space: return &g_spaceHandler;
case Style::JSON: return &g_jsonHandler;
+ // TODO(JS): For now we make Slang language string encoding/decoding the same as C++
+ // That may not be desirable because C++ has a variety of surprising edge cases (for example around \x)
+ case Style::Slang: return &g_cppHandler;
default: return nullptr;
}
}