Added NativeStringType (#2252)

* #include of an absolute path didn't work, because paths were always taken to be relative. * Use TerminatedUnownedStringSlice for literals in output C++. * Remove Escape/Unescape functions used in slang-token-reader.cpp. Add target type of 'host-cpp' etc. to map to the target types. * Fix some corner cases around string encoding. * Added unit test for string escaping. Fixed some assorted escaping bugs. * Updated test output. * Added decode test. * Stop using hex output, to get around the 'greedy' aspect of hex escapes. Use octal instead.
author: jsmall-nvidia <jsmall@nvidia.com> 2022-05-27 17:28:05 -0400
committer: GitHub <noreply@github.com> 2022-05-27 17:28:05 -0400
commit: 2d3392f22c894957d17dd13486e0565c4ecea89c (patch)
tree: ce4dadbd85a59e52725fa6f92613553cd5b29859 /source/core/slang-string-escape-util.cpp
parent: abb89b3e460e11e8f9a134199c2d559190bfc47e (diff)
1 files changed, 146 insertions, 49 deletions
diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp
index 513908c4c..334c1aae5 100644
--- a/source/core/slang-string-escape-util.cpp
+++ b/source/core/slang-string-escape-util.cpp
@@ -115,32 +115,6 @@ public:
     CppStringEscapeHandler() : Super('"') {}
 };
 
-static char _getHexChar(int v)
-{
-    return (v <= 9) ? char(v + '0') : char(v - 10 + 'A');
-}
-
-static int _getHexDigit(char c)
-{
-    if (c >= '0' && c <= '9')
-    {
-        return c - '0';
-    }
-    else if (c >= 'a' && c <= 'f')
-    {
-        return c - 'a' + 10;
-    }
-    else if (c >= 'A' && c <= 'F')
-    {
-        return c - 'A' + 10;
-    }
-    else
-    {
-        SLANG_ASSERT(!"Not a hex digit");
-        return 0;
-    }
-}
-
 static char _getCppEscapedChar(char c)
 {
     switch (c)
@@ -177,7 +151,6 @@ static char _getCppUnescapedChar(char c)
     }
 }
 
-
 bool CppStringEscapeHandler::isUnescapingNeeeded(const UnownedStringSlice& slice)
 {
     return slice.indexOf('\\') >= 0;
@@ -220,6 +193,9 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
     const char* cur = start;
     const char*const end = slice.end();
 
+    // TODO(JS): A cleverer implementation might support U and u prefixing for unicode characters.
+    // For now we just stick with octal escapes if it's not 'regular' ascii.
+
     for (; cur < end; ++cur)
     {
         const char c = *cur;
@@ -232,6 +208,7 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
             {
                 out.append(start, cur);
             }
+
             out.appendChar('\\');
             out.appendChar(escapedChar);
 
@@ -245,17 +222,56 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
                 out.append(start, cur);
             }
 
-            char buf[5] = "\\0x0";
+            // NOTE! There is a possible flaw around checking 'next' character (used for outputting oct and hex)
+            // If a string is constructed by appending it in parts, the next character is not available so the problem below can still
+            // occur.
+
+            // Another solution to this problem would be to output "" to split the literal in two, but that assumes adjacent literals
+            // are concatenated. For example Slang doesn't support that style.
+
+            // C++ greedily consumes hex/octal digits. This is a problem if we have bytes
+            // 0, '1' as by default this will output as
+            // "\x001" which is the single character byte 1.
+
+            // Note the following page claims \x is followed by up to 3 hex digits
+            // https://msdn.microsoft.com/en-us/library/69ze775t.aspx
+            // But the following claims otherwise
+            // https://en.cppreference.com/w/cpp/language/string_literal
+
+            // On testing in Visual Studio hex can indeed be more than 3 digits
+
+            // There is a problem outputting values in hex, because C++ allows *any* number of hex digits.
+            // We could work around this with \u and \U, but they are later extensions (C++11) and have other issues
+
+            // The solution taken here is to always output as octal, because octal can be at most 3 digits.
+
+            // Special case handling of 0
+            if (c == 0 && !(cur + 1 < end && CharUtil::isOctalDigit(cur[1])))
+            {
+                // We can just output as (octal) "\0"
+                out.append("\\0");
+            }
+            else
+            {
+                // A slightly more sophisticated implementation could output fewer digits when the value is not followed
+                // by an octal digit, but for now we keep it simple and always output all 3 digits
+
+                const uint32_t v = uint32_t(c);
 
-            buf[3] = _getHexChar((int(c) >> 4) & 0xf);
-            buf[4] = _getHexChar(c & 0xf);
+                char buf[4];
+                buf[0] = '\\';
+                buf[1] = ((v >> 6) & 3) + '0';
+                buf[2] = ((v >> 3) & 7) + '0';
+                buf[3] = ((v >> 0) & 7) + '0';
 
-            out.append(buf, buf + 4);
+                out.append(buf, buf + 4);
+            }
 
             start = cur + 1;
         }
     }
 
+    // Flush anything remaining
     if (start < end)
     {
         out.append(start, end);
@@ -269,16 +285,16 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
     const char* cur = start;
     const char*const end = slice.end();
 
-    for (; cur < end; ++cur)
+    while (cur < end)
     {
         const char c = *cur;
 
         if (c == '\\')
         {
             // Flush
-            if (start < end)
+            if (start < cur)
             {
-                out.append(start, end);
+                out.append(start, cur);
             }
 
             /// Next 
@@ -286,11 +302,14 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
 
             if (cur >= end)
             {
+                // Missing character following '\'
                 return SLANG_FAIL;
             }
 
+            const char nextC = *cur++;
+
             // Need to handle various escape sequence cases
-            switch (*cur)
+            switch (nextC)
             {
                 case '\'':
                 case '\"':
@@ -304,7 +323,7 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
                 case 't':
                 case 'v':
                 {
-                    const char unescapedChar = _getCppUnescapedChar(*cur);
+                    const char unescapedChar = _getCppUnescapedChar(nextC);
                     if (unescapedChar == 0)
                     {
                         // Don't know how to unescape that char
@@ -312,14 +331,18 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
                     }
                     out.appendChar(unescapedChar);
 
-                    start = cur + 1;
+                    start = cur;
                     break;
                 }
                 case '0': case '1': case '2': case '3': case '4':
                 case '5': case '6': case '7':
                 {
+                    // Rewind back a character, as first digit is the 'nextC'
+                    --cur;
+
+                    // Don't need to check for enough characters, because there must be 1 - the nextC
+
                     // octal escape: up to 3 characters
-                    ++cur;
                     int value = 0;
 
                     const char* octEnd = cur + 3;
@@ -327,33 +350,99 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
 
                     for (; cur < octEnd; ++cur)
                     {
-                        const char d = *cur;
-                        if (d >= '0' && d <= '7')
+                        const int digitValue = CharUtil::getOctalDigitValue(*cur);
+                        if (digitValue < 0)
                         {
-                            value = (value << 3) | (d - '0');
+                            break;
                         }
+                        value = (value << 3) | digitValue; 
                     }
                     out.appendChar(char(value));
 
+                    // Reset start
                     start = cur;
                     break;
                 }
                 case 'x':
                 {
+                    /// In the C++ standard we consume hex digits until we hit a non hex digit
+                    uint32_t value = 0;
+                    for (; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+                    {
+                        const int digitValue = CharUtil::getHexDigitValue(*cur);
+                        if (digitValue < 0)
+                        {
+                            return SLANG_FAIL;
+                        }
+
+                        value = (value << 4) | digitValue;
+                    }
+
+                    // If it's ascii, just output it
+                    if (value < 0x80)
+                    {
+                        out.appendChar(char(value));
+                    }
+                    else
+                    {
+                        // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+                        // but 6 are possible, so let's go large.
+                        const Index maxUtf8EncodeCount = 6;
+
+                        char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+                        int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+                        out.appendInPlace(chars, numChars);
+                    }
+
+                    // Reset start
+                    start = cur;
+                    break;
+                }
+                case 'u':
+                case 'U':
+                {
+                    // u implies 4 hex digits
+                    // U implies 6 here. NOTE: standard C++ \U takes 8 hex digits, so this differs from the language.
+
+                    // Work out how many digits we need
+                    const Count digitCount = (nextC == 'u') ? 4 : 6;
+
+                    // Do we have enough?
+                    if (end - cur < digitCount)
+                    {
+                        return SLANG_FAIL;
+                    }
+
                     uint32_t value = 0;
-                    for (++cur; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+                    for (Index i = 0; i < digitCount; ++i)
                     {
-                        value = value << 4 | _getHexDigit(*cur);
+                        const int digitValue = CharUtil::getHexDigitValue(cur[i]);
+                        if (digitValue < 0)
+                        {
+                            return SLANG_FAIL;
+                        }
+                        value = (value << 4) | digitValue;
                     }
+                    cur += digitCount;
 
-                    // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
-                    // but 6 are possible, so lets go large.
-                    const Index maxUtf8EncodeCount = 6;
+                    // Encode to Utf8
+                    // If it's ascii, just output it
+                    if (value < 0x80)
+                    {
+                        out.appendChar(char(value));
+                    }
+                    else
+                    {
+                        // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+                        // but 6 are possible, so let's go large.
+                        const Index maxUtf8EncodeCount = 6;
 
-                    char* chars = out.prepareForAppend(maxUtf8EncodeCount);
-                    int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
-                    out.appendInPlace(chars, numChars);
+                        char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+                        int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+                        out.appendInPlace(chars, numChars);
+                    }
 
+                    // Reset start
                     start = cur;
                     break;
                 }
@@ -363,6 +452,11 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
                 }
             }
         }
+        else
+        {
+            // Next char
+            ++cur;
+        }
     }
 
     if (start < end)
@@ -850,6 +944,9 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
         case Style::Cpp:    return &g_cppHandler;
         case Style::Space:  return &g_spaceHandler;
         case Style::JSON:   return &g_jsonHandler;
+        // TODO(JS): For now we make Slang language string encoding/decoding the same as C++
+        // That may not be desirable because C++ has a variety of surprising edge cases (for example around \x)
+        case Style::Slang:  return &g_cppHandler;
         default:            return nullptr;
     }
 }
author	jsmall-nvidia <jsmall@nvidia.com>	2022-05-27 17:28:05 -0400
committer	GitHub <noreply@github.com>	2022-05-27 17:28:05 -0400
commit	2d3392f22c894957d17dd13486e0565c4ecea89c (patch)
tree	ce4dadbd85a59e52725fa6f92613553cd5b29859 /source/core/slang-string-escape-util.cpp
parent	abb89b3e460e11e8f9a134199c2d559190bfc47e (diff)