Added NativeStringType (#2252)

* #include of an absolute path didn't work, because paths were always taken to be relative. * Use UnownedTerminatedStringSlice for literals in output C++. * Remove Escape/Unescape functions used in slang-token-reader.cpp. * Add target names 'host-cpp' etc. that map to the host C++ target type. * Fix some corner cases around string encoding. * Added unit test for string escaping. Fixed some assorted escaping bugs. * Updated test output. * Added decode test. * Stop using hex output, to get around the 'greedy' aspect. Use octal instead.
author: jsmall-nvidia <jsmall@nvidia.com> 2022-05-27 17:28:05 -0400
committer: GitHub <noreply@github.com> 2022-05-27 17:28:05 -0400
commit: 2d3392f22c894957d17dd13486e0565c4ecea89c (patch)
tree: ce4dadbd85a59e52725fa6f92613553cd5b29859 /source/core
parent: abb89b3e460e11e8f9a134199c2d559190bfc47e (diff)
8 files changed, 226 insertions, 182 deletions
diff --git a/source/core/slang-char-util.h b/source/core/slang-char-util.h
index 8f7f69c90..f831f6d55 100644
--- a/source/core/slang-char-util.h
+++ b/source/core/slang-char-util.h
@@ -31,8 +31,12 @@ struct CharUtil
         /// True if it's alpha
     SLANG_FORCE_INLINE static bool isAlpha(char c) { return (getFlags(c) & (Flag::Upper | Flag::Lower)) != 0; }
 
+        /// True if the character is a valid hex character
     SLANG_FORCE_INLINE static bool isHexDigit(char c) { return (getFlags(c) & Flag::HexDigit) != 0; }
 
+        /// True if the character is an octal digit ('0' to '7')
+    SLANG_FORCE_INLINE static bool isOctalDigit(char c) { return c >= '0' && c <= '7'; }
+
         /// For a given character get the associated flags
     SLANG_FORCE_INLINE static Flags getFlags(char c) { return g_charFlagMap.flags[size_t(c)]; }
 
@@ -41,7 +45,14 @@ struct CharUtil
         /// Given a character return the upper case equivalent
     SLANG_FORCE_INLINE static char toUpper(char c) { return (c >= 'a' && c <= 'z') ? (c -'a' + 'A') : c; }
 
-
+        /// Returns the value of c interpreted as a hex digit (0-15).
+        /// If c is not a valid hex digit returns -1
+    inline static int getHexDigitValue(char c);
+    
+        /// Returns the value of c interpreted as an octal digit (0-7).
+        /// If c is not a valid octal digit returns -1
+    inline static int getOctalDigitValue(char c) { return isOctalDigit(c) ? (c - '0') : -1; }
+    
     struct CharFlagMap
     {
         Flags flags[0x100];
@@ -57,6 +68,24 @@ struct CharUtil
     static const CharFlagMap g_charFlagMap;
 };
     
+// ------------------------------------------------------------------------------------
+inline /* static */int CharUtil::getHexDigitValue(char c) // Value of hex digit c (0-15), or -1 if c is not a hex digit
+{
+    if (c >= '0' && c <= '9')
+    {
+        return c - '0'; // '0'-'9' map to 0-9
+    }
+    else if (c >= 'a' && c <= 'f')
+    {
+        return c - 'a' + 10; // 'a'-'f' map to 10-15
+    }
+    else if (c >= 'A' && c <= 'F')
+    {
+        return c - 'A' + 10; // 'A'-'F' map to 10-15
+    }
+    return -1; // Not a hex digit
+}
+
 } // namespace Slang
 
 #endif // SLANG_CHAR_UTIL_H
diff --git a/source/core/slang-hex-dump-util.cpp b/source/core/slang-hex-dump-util.cpp
index b493141a1..1279dc237 100644
--- a/source/core/slang-hex-dump-util.cpp
+++ b/source/core/slang-hex-dump-util.cpp
@@ -5,6 +5,8 @@
 #include "slang-string-util.h"
 #include "slang-writer.h"
 
+#include "slang-char-util.h"
+
 #include "../../slang-com-helper.h"
 #include "slang-hash.h"
 
@@ -152,23 +154,6 @@ SlangResult HexDumpUtil::dumpSourceBytes(const uint8_t* data, size_t dataCount,
     return SLANG_OK;
 }
 
-static int _parseHexDigit(char c)
-{
-    if (c >= '0' && c <= '9')
-    {
-        return c -'0';
-    }
-    else if (c >= 'a' && c <= 'f')
-    {
-        return c - 'a' + 10;
-    }
-    else if (c >= 'A' && c <= 'F')
-    {
-        return c - 'A' + 10;
-    }
-    return -1;
-}
-
 /* static */SlangResult HexDumpUtil::parse(const UnownedStringSlice& lines, List<uint8_t>& outBytes)
 {
     outBytes.clear();
@@ -188,8 +173,8 @@ static int _parseHexDigit(char c)
                 break;
             }
 
-            const int hi = _parseHexDigit(c);
-            const int lo = _parseHexDigit(cur[1]);
+            const int hi = CharUtil::getHexDigitValue(c);
+            const int lo = CharUtil::getHexDigitValue(cur[1]);
             cur += 2;
 
             if (hi < 0 || lo < 0)
diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp
index 513908c4c..334c1aae5 100644
--- a/source/core/slang-string-escape-util.cpp
+++ b/source/core/slang-string-escape-util.cpp
@@ -115,32 +115,6 @@ public:
     CppStringEscapeHandler() : Super('"') {}
 };
 
-static char _getHexChar(int v)
-{
-    return (v <= 9) ? char(v + '0') : char(v - 10 + 'A');
-}
-
-static int _getHexDigit(char c)
-{
-    if (c >= '0' && c <= '9')
-    {
-        return c - '0';
-    }
-    else if (c >= 'a' && c <= 'f')
-    {
-        return c - 'a' + 10;
-    }
-    else if (c >= 'A' && c <= 'F')
-    {
-        return c - 'A' + 10;
-    }
-    else
-    {
-        SLANG_ASSERT(!"Not a hex digit");
-        return 0;
-    }
-}
-
 static char _getCppEscapedChar(char c)
 {
     switch (c)
@@ -177,7 +151,6 @@ static char _getCppUnescapedChar(char c)
     }
 }
 
-
 bool CppStringEscapeHandler::isUnescapingNeeeded(const UnownedStringSlice& slice)
 {
     return slice.indexOf('\\') >= 0;
@@ -220,6 +193,9 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
     const char* cur = start;
     const char*const end = slice.end();
 
+    // TODO(JS): A cleverer implementation might support U and u prefixing for unicode characters.
+    // For now we just stick with hex if it's not 'regular' ascii.
+
     for (; cur < end; ++cur)
     {
         const char c = *cur;
@@ -232,6 +208,7 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
             {
                 out.append(start, cur);
             }
+
             out.appendChar('\\');
             out.appendChar(escapedChar);
 
@@ -245,17 +222,56 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
                 out.append(start, cur);
             }
 
-            char buf[5] = "\\0x0";
+            // NOTE! There is a possible flaw around checking the 'next' character (used for outputting oct and hex).
+            // If a string is constructed by appending it in parts, the next character is not available, so the problem
+            // described below can still occur.
+
+            // Another solution to this problem would be to output "", but that makes some other assumptions
+            // For example Slang doesn't support that style.
+
+            // C++ greedily consumes hex/octal digits. This is a problem if we have bytes
+            // 0, '1' as by default this will output as
+            // "\x001" which is the single character byte 1.
+
+            // Note this claims \x is followed with up to 3 hex digits
+            // https://msdn.microsoft.com/en-us/library/69ze775t.aspx
+            // But the following claims otherwise
+            // https://en.cppreference.com/w/cpp/language/string_literal
+
+            // On testing in Visual Studio hex can indeed be more than 3 digits
+
+            // There is a problem outputting values in hex, because C++ allows *any* number of hex digits.
+            // We could work around with \u \U but they are later extensions (C++11) and have other issues
+
+            // The solution taken here is to always output as octal, because octal can be at most 3 digits.
+
+            // Special case handling of 0
+            if (c == 0 && !(cur + 1 < end && CharUtil::isOctalDigit(cur[1])))
+            {
+                // We can just output as (octal) "\0"
+                out.append("\\0");
+            }
+            else
+            {
+                // A slightly more sophisticated implementation could output fewer digits when the escape is not followed
+                // by an octal digit, but for now we keep it simple and always output all 3 digits
+
+                const uint32_t v = uint32_t(c);
 
-            buf[3] = _getHexChar((int(c) >> 4) & 0xf);
-            buf[4] = _getHexChar(c & 0xf);
+                char buf[4];
+                buf[0] = '\\';
+                buf[1] = ((v >> 6) & 3) + '0';
+                buf[2] = ((v >> 3) & 7) + '0';
+                buf[3] = ((v >> 0) & 7) + '0';
 
-            out.append(buf, buf + 4);
+                out.append(buf, buf + 4);
+            }
 
             start = cur + 1;
         }
     }
 
+    // Flush anything remaining
     if (start < end)
     {
         out.append(start, end);
@@ -269,16 +285,16 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
     const char* cur = start;
     const char*const end = slice.end();
 
-    for (; cur < end; ++cur)
+    while (cur < end)
     {
         const char c = *cur;
 
         if (c == '\\')
         {
             // Flush
-            if (start < end)
+            if (start < cur)
             {
-                out.append(start, end);
+                out.append(start, cur);
             }
 
             /// Next 
@@ -286,11 +302,14 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
 
             if (cur >= end)
             {
+                // Missing character following '\'
                 return SLANG_FAIL;
             }
 
+            const char nextC = *cur++;
+
             // Need to handle various escape sequence cases
-            switch (*cur)
+            switch (nextC)
             {
                 case '\'':
                 case '\"':
@@ -304,7 +323,7 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
                 case 't':
                 case 'v':
                 {
-                    const char unescapedChar = _getCppUnescapedChar(*cur);
+                    const char unescapedChar = _getCppUnescapedChar(nextC);
                     if (unescapedChar == 0)
                     {
                         // Don't know how to unescape that char
@@ -312,14 +331,18 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
                     }
                     out.appendChar(unescapedChar);
 
-                    start = cur + 1;
+                    start = cur;
                     break;
                 }
                 case '0': case '1': case '2': case '3': case '4':
                 case '5': case '6': case '7':
                 {
+                    // Rewind back a character, as first digit is the 'nextC'
+                    --cur;
+
+                    // Don't need to check for enough characters, because there must be 1 - the nextC
+
                     // octal escape: up to 3 characters
-                    ++cur;
                     int value = 0;
 
                     const char* octEnd = cur + 3;
@@ -327,33 +350,99 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
 
                     for (; cur < octEnd; ++cur)
                     {
-                        const char d = *cur;
-                        if (d >= '0' && d <= '7')
+                        const int digitValue = CharUtil::getOctalDigitValue(*cur);
+                        if (digitValue < 0)
                         {
-                            value = (value << 3) | (d - '0');
+                            break;
                         }
+                        value = (value << 3) | digitValue; 
                     }
                     out.appendChar(char(value));
 
+                    // Reset start
                     start = cur;
                     break;
                 }
                 case 'x':
                 {
+                    /// In the C++ standard we consume hex digits until we hit a non hex digit
+                    uint32_t value = 0;
+                    for (; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+                    {
+                        const int digitValue = CharUtil::getHexDigitValue(*cur);
+                        if (digitValue < 0)
+                        {
+                            return SLANG_FAIL;
+                        }
+
+                        value = (value << 4) | digitValue;
+                    }
+
+                    // If it's ascii, just output it
+                    if (value < 0x80)
+                    {
+                        out.appendChar(char(value));
+                    }
+                    else
+                    {
+                        // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+                        // but 6 are possible, so lets go large.
+                        const Index maxUtf8EncodeCount = 6;
+
+                        char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+                        int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+                        out.appendInPlace(chars, numChars);
+                    }
+
+                    // Reset start
+                    start = cur;
+                    break;
+                }
+                case 'u':
+                case 'U':
+                {
+                    // u implies 4 hex digits
+                    // U implies 6 here. NOTE(review): standard C++ \U takes 8 hex digits - confirm 6 is intended.
+
+                    // Work out how many digits we need
+                    const Count digitCount = (nextC == 'u') ? 4 : 6;
+
+                    // Do we have enough?
+                    if (end - cur < digitCount)
+                    {
+                        return SLANG_FAIL;
+                    }
+
                     uint32_t value = 0;
-                    for (++cur; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+                    for (Index i = 0; i < digitCount; ++i)
                     {
-                        value = value << 4 | _getHexDigit(*cur);
+                        const int digitValue = CharUtil::getHexDigitValue(cur[i]);
+                        if (digitValue < 0)
+                        {
+                            return SLANG_FAIL;
+                        }
+                        value = (value << 4) | digitValue;
                     }
+                    cur += digitCount;
 
-                    // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
-                    // but 6 are possible, so lets go large.
-                    const Index maxUtf8EncodeCount = 6;
+                    // Encode to Utf8
+                    // If it's ascii, just output it
+                    if (value < 0x80)
+                    {
+                        out.appendChar(char(value));
+                    }
+                    else
+                    {
+                        // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+                        // but 6 are possible, so lets go large.
+                        const Index maxUtf8EncodeCount = 6;
 
-                    char* chars = out.prepareForAppend(maxUtf8EncodeCount);
-                    int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
-                    out.appendInPlace(chars, numChars);
+                        char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+                        int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+                        out.appendInPlace(chars, numChars);
+                    }
 
+                    // Reset start
                     start = cur;
                     break;
                 }
@@ -363,6 +452,11 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
                 }
             }
         }
+        else
+        {
+            // Next char
+            ++cur;
+        }
     }
 
     if (start < end)
@@ -850,6 +944,9 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
         case Style::Cpp:    return &g_cppHandler;
         case Style::Space:  return &g_spaceHandler;
         case Style::JSON:   return &g_jsonHandler;
+        // TODO(JS): For now we make Slang language string encoding/decoding the same as C++
+        // That may not be desirable because C++ has a variety of surprising edge cases (for example around \x)
+        case Style::Slang:  return &g_cppHandler;
         default:            return nullptr;
     }
 }
diff --git a/source/core/slang-string-escape-util.h b/source/core/slang-string-escape-util.h
index 5f749c423..c3a43b623 100644
--- a/source/core/slang-string-escape-util.h
+++ b/source/core/slang-string-escape-util.h
@@ -55,6 +55,8 @@ struct StringEscapeUtil
         Cpp,            ///< Cpp style quoting and escape handling
         Space,          ///< Applies quotes if there are spaces. Does not escape.
         JSON,           ///< Json encoding
+        Slang,          ///< Slang style string encoding (For now same as Cpp but that may change in the future)
+        CountOf,
     };
 
         /// Given a style returns a handler
diff --git a/source/core/slang-string.h b/source/core/slang-string.h
index 6dc3275eb..5119dac8f 100644
--- a/source/core/slang-string.h
+++ b/source/core/slang-string.h
@@ -62,6 +62,10 @@ namespace Slang
         return (((unsigned char)ch) & 0xC0) == 0x80;
     }
 
+    /* A string slice that doesn't own the contained characters.
+    It is the responsibility of code using the type to keep the memory backing 
+    the slice in scope.
+    A slice is generally *not* zero terminated. */
     struct SLANG_RT_API UnownedStringSlice
     {
     public:
@@ -85,15 +89,9 @@ namespace Slang
             , m_end(b + len)
         {}
 
-        char const* begin() const
-        {
-            return m_begin;
-        }
+        SLANG_FORCE_INLINE char const* begin() const { return m_begin; }
 
-        char const* end() const
-        {
-            return m_end;
-        }
+        SLANG_FORCE_INLINE char const* end() const { return m_end; }
 
             /// True if slice is strictly contained in memory.
         bool isMemoryContained(const UnownedStringSlice& slice) const
@@ -105,10 +103,8 @@ namespace Slang
             return pos >= m_begin && pos <= m_end;
         }
 
-        Index getLength() const
-        {
-            return Index(m_end - m_begin);
-        }
+            /// Get the length in *bytes* (the distance from m_begin to m_end)
+        Count getLength() const { return Count(m_end - m_begin); }
 
             /// Finds first index of char 'c'. If not found returns -1.
         Index indexOf(char c) const;
@@ -179,7 +175,7 @@ namespace Slang
         template <size_t SIZE> 
         SLANG_FORCE_INLINE static UnownedStringSlice fromLiteral(const char (&in)[SIZE]) { return UnownedStringSlice(in, SIZE - 1); }
 
-    private:
+    protected:
         char const* m_begin;
         char const* m_end;
     };
@@ -188,6 +184,40 @@ namespace Slang
     template <size_t SIZE>
     SLANG_FORCE_INLINE UnownedStringSlice toSlice(const char (&in)[SIZE]) { return UnownedStringSlice(in, SIZE - 1); }
 
+    /// Same as UnownedStringSlice, but the characters must be followed by a zero terminator.
+    /// The zero terminator is *not* included in the length.
+    struct SLANG_RT_API UnownedTerminatedStringSlice : public UnownedStringSlice
+    {
+    public:
+        typedef UnownedStringSlice Super;
+        typedef UnownedTerminatedStringSlice ThisType;
+
+            /// Implicitly converts to a regular zero terminated string (the start of the slice)
+        SLANG_FORCE_INLINE operator const char*() const { return m_begin; }
+
+            /// Returns the start of the slice. Exists to match the equivalent function in String.
+        SLANG_FORCE_INLINE char const* getBuffer() const { return m_begin; }
+
+            /// Construct from a literal directly. SIZE counts the literal's terminating zero, so the length is SIZE - 1.
+        template <size_t SIZE>
+        SLANG_FORCE_INLINE static ThisType fromLiteral(const char(&in)[SIZE]) { return ThisType(in, SIZE - 1); }
+
+            /// Construct from a zero terminated string; the length is measured with strlen.
+            /// Note, b cannot be null because if it were then the string would not be null terminated
+        UnownedTerminatedStringSlice(char const* b)
+            : Super(b, b + strlen(b))
+        {}
+        UnownedTerminatedStringSlice(char const* b, size_t len)
+            : Super(b, len)
+        {
+            // b must be valid and the terminating zero must be at b[len] (len excludes the terminator)
+            SLANG_ASSERT(b && b[len] == 0);
+        }
+    };
+
+    // A more convenient way to make terminated slices from *string literals* (length is SIZE - 1, excluding the terminating zero)
+    template <size_t SIZE>
+    SLANG_FORCE_INLINE UnownedTerminatedStringSlice toTerminatedSlice(const char(&in)[SIZE]) { return UnownedTerminatedStringSlice(in, SIZE - 1); }
+
     // A `StringRepresentation` provides the backing storage for
     // all reference-counted string-related types.
     class SLANG_RT_API StringRepresentation : public RefObject
@@ -284,16 +314,6 @@ namespace Slang
 
     class String;
 
-
-
-    struct SLANG_RT_API UnownedTerminatedStringSlice : public UnownedStringSlice
-    {
-    public:
-        UnownedTerminatedStringSlice(char const* b)
-            : UnownedStringSlice(b, b + (b?strlen(b):0))
-        {}
-    };
-
     struct SLANG_RT_API StringSlice
     {
     public:
diff --git a/source/core/slang-token-reader.cpp b/source/core/slang-token-reader.cpp
index 7ffbc12fa..5acc1736c 100644
--- a/source/core/slang-token-reader.cpp
+++ b/source/core/slang-token-reader.cpp
@@ -671,93 +671,6 @@ namespace Misc {
         return TokenizeText("", text);
     }
 
-    String EscapeStringLiteral(String str)
-    {
-        StringBuilder sb;
-        sb << "\"";
-        const Index length = str.getLength();
-        const char*const data = str.getBuffer();
-        for (Index i = 0; i < length; i++)
-        {
-            switch (data[i])
-            {
-            case '\n':
-                sb << "\\n";
-                break;
-            case '\r':
-                sb << "\\r";
-                break;
-            case '\t':
-                sb << "\\t";
-                break;
-            case '\v':
-                sb << "\\v";
-                break;
-            case '\'':
-                sb << "\\\'";
-                break;
-            case '\"':
-                sb << "\\\"";
-                break;
-            case '\\':
-                sb << "\\\\";
-                break;
-            default:
-                sb << data[i];
-                break;
-            }
-        }
-        sb << "\"";
-        return sb.ProduceString();
-    }
-
-    String UnescapeStringLiteral(String str)
-    {
-        StringBuilder sb;
-        const Index length = str.getLength();
-        const char*const data = str.getBuffer();
-        for (Index i = 0; i < length; i++)
-        {
-            if (data[i] == '\\' && i < length - 1)
-            {
-                switch (data[i + 1])
-                {
-                case 's':
-                    sb << " ";
-                    break;
-                case 't':
-                    sb << '\t';
-                    break;
-                case 'n':
-                    sb << '\n';
-                    break;
-                case 'r':
-                    sb << '\r';
-                    break;
-                case 'v':
-                    sb << '\v';
-                    break;
-                case '\'':
-                    sb << '\'';
-                    break;
-                case '\"':
-                    sb << "\"";
-                    break;
-                case '\\':
-                    sb << "\\";
-                    break;
-                default:
-                    i = i - 1;
-                    sb << data[i];
-                }
-                i++;
-            }
-            else
-                sb << data[i];
-        }
-        return sb.ProduceString();
-    }
-
     TokenReader::TokenReader(String text)
     {
         this->tokens = TokenizeText("", text);
diff --git a/source/core/slang-token-reader.h b/source/core/slang-token-reader.h
index bf5ca4cdc..26539732c 100644
--- a/source/core/slang-token-reader.h
+++ b/source/core/slang-token-reader.h
@@ -295,8 +295,6 @@ namespace Misc {
         return result;
     }
 
-    String EscapeStringLiteral(String str);
-    String UnescapeStringLiteral(String str);
 } // namespace Misc
 } // namespace Slang
 
diff --git a/source/core/slang-type-text-util.cpp b/source/core/slang-type-text-util.cpp
index 13bf439ce..454ca4258 100644
--- a/source/core/slang-type-text-util.cpp
+++ b/source/core/slang-type-text-util.cpp
@@ -68,7 +68,7 @@ static const CompileTargetInfo s_compileTargetInfos[] =
     { SLANG_SPIRV_ASM,      "spv.asm",                                          "spirv-asm,spirv-assembly" },
     { SLANG_C_SOURCE,       "c",                                                "c" },
     { SLANG_CPP_SOURCE,     "cpp,c++,cxx",                                      "cpp,c++,cxx" },
-    { SLANG_HOST_CPP_SOURCE, "cpp,c++,cxx",                                     "cpp,c++,cxx"},
+    { SLANG_HOST_CPP_SOURCE, "cpp,c++,cxx",                                     "host-cpp,host-c++,host-cxx"},
     { SLANG_HOST_EXECUTABLE,"exe",                                              "exe,executable" },
     { SLANG_SHADER_SHARED_LIBRARY, "dll,so",                                    "sharedlib,sharedlibrary,dll" },
     { SLANG_CUDA_SOURCE,    "cu",                                               "cuda,cu"  },
author	jsmall-nvidia <jsmall@nvidia.com>	2022-05-27 17:28:05 -0400
committer	GitHub <noreply@github.com>	2022-05-27 17:28:05 -0400
commit	2d3392f22c894957d17dd13486e0565c4ecea89c (patch)
tree	ce4dadbd85a59e52725fa6f92613553cd5b29859 /source/core
parent	abb89b3e460e11e8f9a134199c2d559190bfc47e (diff)