Added NativeStringType (#2252)

* #include of an absolute path didn't work, because paths were always taken to be relative.
* Use UnownedTerminatedStringSlice for literals in output C++. Remove Escape/Unescape functions used in slang-token-reader.cpp. Add target type names 'host-cpp' etc. to map to the target types.
* Fix some corner cases around string encoding.
* Added unit test for string escaping. Fixed some assorted escaping bugs.
* Updated test output.
* Added decode test.
* Stop using hex output, to get around 'greedy' aspect. Use octal instead.
author: jsmall-nvidia <jsmall@nvidia.com> 2022-05-27 17:28:05 -0400
committer: GitHub <noreply@github.com> 2022-05-27 17:28:05 -0400
commit: 2d3392f22c894957d17dd13486e0565c4ecea89c (patch)
tree: ce4dadbd85a59e52725fa6f92613553cd5b29859
parent: abb89b3e460e11e8f9a134199c2d559190bfc47e (diff)
26 files changed, 415 insertions, 293 deletions
diff --git a/build/visual-studio/slang-unit-test-tool/slang-unit-test-tool.vcxproj b/build/visual-studio/slang-unit-test-tool/slang-unit-test-tool.vcxproj
index 9da4294c9..87cd8e9ec 100644
--- a/build/visual-studio/slang-unit-test-tool/slang-unit-test-tool.vcxproj
+++ b/build/visual-studio/slang-unit-test-tool/slang-unit-test-tool.vcxproj
@@ -271,7 +271,6 @@
     <ClInclude Include="..\..\..\tools\unit-test\slang-unit-test.h" />
   </ItemGroup>
   <ItemGroup>
-    <ClCompile Include="..\..\..\tools\slang-unit-test\unit-offset-container.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-byte-encode.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-chunked-list.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-command-line-args.cpp" />
@@ -281,11 +280,13 @@
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-json-native.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-json.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-memory-arena.cpp" />
+    <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-offset-container.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-path.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-process.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-riff.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-rtti.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-short-list.cpp" />
+    <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-string-escape.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-string.cpp" />
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-translation-unit-import.cpp" />
     <ClCompile Include="..\..\..\tools\unit-test\slang-unit-test.cpp" />
diff --git a/build/visual-studio/slang-unit-test-tool/slang-unit-test-tool.vcxproj.filters b/build/visual-studio/slang-unit-test-tool/slang-unit-test-tool.vcxproj.filters
index 5f935e3f7..4a4e7bce9 100644
--- a/build/visual-studio/slang-unit-test-tool/slang-unit-test-tool.vcxproj.filters
+++ b/build/visual-studio/slang-unit-test-tool/slang-unit-test-tool.vcxproj.filters
@@ -14,9 +14,6 @@
     </ClInclude>
   </ItemGroup>
   <ItemGroup>
-    <ClCompile Include="..\..\..\tools\slang-unit-test\unit-offset-container.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-byte-encode.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -44,6 +41,9 @@
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-memory-arena.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-offset-container.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-path.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -59,6 +59,9 @@
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-short-list.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-string-escape.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\tools\slang-unit-test\unit-test-string.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/source/core/slang-char-util.h b/source/core/slang-char-util.h
index 8f7f69c90..f831f6d55 100644
--- a/source/core/slang-char-util.h
+++ b/source/core/slang-char-util.h
@@ -31,8 +31,12 @@ struct CharUtil
         /// True if it's alpha
     SLANG_FORCE_INLINE static bool isAlpha(char c) { return (getFlags(c) & (Flag::Upper | Flag::Lower)) != 0; }
 
+        /// True if the character is a valid hex character
     SLANG_FORCE_INLINE static bool isHexDigit(char c) { return (getFlags(c) & Flag::HexDigit) != 0; }
 
+        /// True if the character is an octal digit
+    SLANG_FORCE_INLINE static bool isOctalDigit(char c) { return c >= '0' && c <= '7'; }
+
         /// For a given character get the associated flags
     SLANG_FORCE_INLINE static Flags getFlags(char c) { return g_charFlagMap.flags[size_t(c)]; }
 
@@ -41,7 +45,14 @@ struct CharUtil
         /// Given a character return the upper case equivalent
     SLANG_FORCE_INLINE static char toUpper(char c) { return (c >= 'a' && c <= 'z') ? (c -'a' + 'A') : c; }
 
-
+        /// Returns the value of c interpreted as a hex digit
+        /// If c is not a valid hex digit returns -1
+    inline static int getHexDigitValue(char c);
+    
+        /// Returns the value of c interpreted as an octal digit
+        /// If c is not a valid octal digit returns -1
+    inline static int getOctalDigitValue(char c) { return isOctalDigit(c) ? (c - '0') : -1; }
+    
     struct CharFlagMap
     {
         Flags flags[0x100];
@@ -57,6 +68,24 @@ struct CharUtil
     static const CharFlagMap g_charFlagMap;
 };
     
+// ------------------------------------------------------------------------------------
+inline /* static */int CharUtil::getHexDigitValue(char c)
+{
+    if (c >= '0' && c <= '9')
+    {
+        return c - '0';         // '0'..'9' -> 0..9
+    }
+    else if (c >= 'a' && c <= 'f')
+    {
+        return c - 'a' + 10;    // 'a'..'f' -> 10..15
+    }
+    else if (c >= 'A' && c <= 'F')
+    {
+        return c - 'A' + 10;    // 'A'..'F' -> 10..15
+    }
+    return -1;                  // Not a hex digit
+}
+
 } // namespace Slang
 
 #endif // SLANG_CHAR_UTIL_H
diff --git a/source/core/slang-hex-dump-util.cpp b/source/core/slang-hex-dump-util.cpp
index b493141a1..1279dc237 100644
--- a/source/core/slang-hex-dump-util.cpp
+++ b/source/core/slang-hex-dump-util.cpp
@@ -5,6 +5,8 @@
 #include "slang-string-util.h"
 #include "slang-writer.h"
 
+#include "slang-char-util.h"
+
 #include "../../slang-com-helper.h"
 #include "slang-hash.h"
 
@@ -152,23 +154,6 @@ SlangResult HexDumpUtil::dumpSourceBytes(const uint8_t* data, size_t dataCount,
     return SLANG_OK;
 }
 
-static int _parseHexDigit(char c)
-{
-    if (c >= '0' && c <= '9')
-    {
-        return c -'0';
-    }
-    else if (c >= 'a' && c <= 'f')
-    {
-        return c - 'a' + 10;
-    }
-    else if (c >= 'A' && c <= 'F')
-    {
-        return c - 'A' + 10;
-    }
-    return -1;
-}
-
 /* static */SlangResult HexDumpUtil::parse(const UnownedStringSlice& lines, List<uint8_t>& outBytes)
 {
     outBytes.clear();
@@ -188,8 +173,8 @@ static int _parseHexDigit(char c)
                 break;
             }
 
-            const int hi = _parseHexDigit(c);
-            const int lo = _parseHexDigit(cur[1]);
+            const int hi = CharUtil::getHexDigitValue(c);
+            const int lo = CharUtil::getHexDigitValue(cur[1]);
             cur += 2;
 
             if (hi < 0 || lo < 0)
diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp
index 513908c4c..334c1aae5 100644
--- a/source/core/slang-string-escape-util.cpp
+++ b/source/core/slang-string-escape-util.cpp
@@ -115,32 +115,6 @@ public:
     CppStringEscapeHandler() : Super('"') {}
 };
 
-static char _getHexChar(int v)
-{
-    return (v <= 9) ? char(v + '0') : char(v - 10 + 'A');
-}
-
-static int _getHexDigit(char c)
-{
-    if (c >= '0' && c <= '9')
-    {
-        return c - '0';
-    }
-    else if (c >= 'a' && c <= 'f')
-    {
-        return c - 'a' + 10;
-    }
-    else if (c >= 'A' && c <= 'F')
-    {
-        return c - 'A' + 10;
-    }
-    else
-    {
-        SLANG_ASSERT(!"Not a hex digit");
-        return 0;
-    }
-}
-
 static char _getCppEscapedChar(char c)
 {
     switch (c)
@@ -177,7 +151,6 @@ static char _getCppUnescapedChar(char c)
     }
 }
 
-
 bool CppStringEscapeHandler::isUnescapingNeeeded(const UnownedStringSlice& slice)
 {
     return slice.indexOf('\\') >= 0;
@@ -220,6 +193,9 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
     const char* cur = start;
     const char*const end = slice.end();
 
+    // TODO(JS): A cleverer implementation might support U and u prefixing for unicode characters.
+    // For now we just stick with octal escapes if it's not 'regular' ascii.
+
     for (; cur < end; ++cur)
     {
         const char c = *cur;
@@ -232,6 +208,7 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
             {
                 out.append(start, cur);
             }
+
             out.appendChar('\\');
             out.appendChar(escapedChar);
 
@@ -245,17 +222,56 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
                 out.append(start, cur);
             }
 
-            char buf[5] = "\\0x0";
+            // NOTE! There is a possible flaw around checking the 'next' character (used when outputting octal and hex).
+            // If a string is constructed by appending it in parts, the next character is not available, so the problem
+            // below can still occur.
+
+            // Another solution to this problem would be to output "", but that makes some other assumptions
+            // For example Slang doesn't support that style.
+
+            // C++ greedily consumes hex/octal digits. This is a problem if we have bytes
+            // 0, '1' as by default this will output as
+            // "\x001" which is the single character byte 1.
+
+            // Note this claims \x is followed with up to 3 hex digits
+            // https://msdn.microsoft.com/en-us/library/69ze775t.aspx
+            // But the following claims otherwise
+            // https://en.cppreference.com/w/cpp/language/string_literal
+
+            // On testing in Visual Studio hex can indeed be more than 3 digits
+
+            // There is a problem outputting values in hex, because C++ allows *any* number of hex digits.
+            // We could work around this with \u \U, but they are later extensions (C++11) and have other issues.
+
+            // The solution taken here is to always output as octal, because octal can be at most 3 digits.
+
+            // Special case handling of 0
+            if (c == 0 && !(cur + 1 < end && CharUtil::isOctalDigit(cur[1])))
+            {
+                // We can just output as (octal) "\0"
+                out.append("\\0");
+            }
+            else
+            {
+                // A slightly more sophisticated implementation could output fewer digits when the escape is not
+                // followed by an octal digit, but for now we go simple and output all 3 digits
+
+                const uint32_t v = uint32_t(c);
 
-            buf[3] = _getHexChar((int(c) >> 4) & 0xf);
-            buf[4] = _getHexChar(c & 0xf);
+                char buf[4];
+                buf[0] = '\\';
+                buf[1] = ((v >> 6) & 3) + '0';
+                buf[2] = ((v >> 3) & 7) + '0';
+                buf[3] = ((v >> 0) & 7) + '0';
 
-            out.append(buf, buf + 4);
+                out.append(buf, buf + 4);
+            }
 
             start = cur + 1;
         }
     }
 
+    // Flush anything remaining
     if (start < end)
     {
         out.append(start, end);
@@ -269,16 +285,16 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
     const char* cur = start;
     const char*const end = slice.end();
 
-    for (; cur < end; ++cur)
+    while (cur < end)
     {
         const char c = *cur;
 
         if (c == '\\')
         {
             // Flush
-            if (start < end)
+            if (start < cur)
             {
-                out.append(start, end);
+                out.append(start, cur);
             }
 
             /// Next 
@@ -286,11 +302,14 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
 
             if (cur >= end)
             {
+                // Missing character following '\'
                 return SLANG_FAIL;
             }
 
+            const char nextC = *cur++;
+
             // Need to handle various escape sequence cases
-            switch (*cur)
+            switch (nextC)
             {
                 case '\'':
                 case '\"':
@@ -304,7 +323,7 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
                 case 't':
                 case 'v':
                 {
-                    const char unescapedChar = _getCppUnescapedChar(*cur);
+                    const char unescapedChar = _getCppUnescapedChar(nextC);
                     if (unescapedChar == 0)
                     {
                         // Don't know how to unescape that char
@@ -312,14 +331,18 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
                     }
                     out.appendChar(unescapedChar);
 
-                    start = cur + 1;
+                    start = cur;
                     break;
                 }
                 case '0': case '1': case '2': case '3': case '4':
                 case '5': case '6': case '7':
                 {
+                    // Rewind back a character, as first digit is the 'nextC'
+                    --cur;
+
+                    // Don't need to check for enough characters, because there must be 1 - the nextC
+
                     // octal escape: up to 3 characters
-                    ++cur;
                     int value = 0;
 
                     const char* octEnd = cur + 3;
@@ -327,33 +350,99 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
 
                     for (; cur < octEnd; ++cur)
                     {
-                        const char d = *cur;
-                        if (d >= '0' && d <= '7')
+                        const int digitValue = CharUtil::getOctalDigitValue(*cur);
+                        if (digitValue < 0)
                         {
-                            value = (value << 3) | (d - '0');
+                            break;
                         }
+                        value = (value << 3) | digitValue; 
                     }
                     out.appendChar(char(value));
 
+                    // Reset start
                     start = cur;
                     break;
                 }
                 case 'x':
                 {
+                    /// In the C++ standard we consume hex digits until we hit a non hex digit
+                    uint32_t value = 0;
+                    for (; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+                    {
+                        const int digitValue = CharUtil::getHexDigitValue(*cur);
+                        if (digitValue < 0)
+                        {
+                            return SLANG_FAIL;
+                        }
+
+                        value = (value << 4) | digitValue;
+                    }
+
+                    // If it's ascii, just output it
+                    if (value < 0x80)
+                    {
+                        out.appendChar(char(value));
+                    }
+                    else
+                    {
+                        // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+                        // but 6 are possible, so lets go large.
+                        const Index maxUtf8EncodeCount = 6;
+
+                        char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+                        int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+                        out.appendInPlace(chars, numChars);
+                    }
+
+                    // Reset start
+                    start = cur;
+                    break;
+                }
+                case 'u':
+                case 'U':
+                {
+                    // u implies 4 hex digits
+                    // U implies 6 here. NOTE(review): standard C++ \U takes 8 hex digits - confirm 6 is intended.
+
+                    // Work out how many digits we need
+                    const Count digitCount = (nextC == 'u') ? 4 : 6;
+
+                    // Do we have enough?
+                    if (end - cur < digitCount)
+                    {
+                        return SLANG_FAIL;
+                    }
+
                     uint32_t value = 0;
-                    for (++cur; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+                    for (Index i = 0; i < digitCount; ++i)
                     {
-                        value = value << 4 | _getHexDigit(*cur);
+                        const int digitValue = CharUtil::getHexDigitValue(cur[i]);
+                        if (digitValue < 0)
+                        {
+                            return SLANG_FAIL;
+                        }
+                        value = (value << 4) | digitValue;
                     }
+                    cur += digitCount;
 
-                    // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
-                    // but 6 are possible, so lets go large.
-                    const Index maxUtf8EncodeCount = 6;
+                    // Encode to Utf8
+                    // If it's ascii, just output it
+                    if (value < 0x80)
+                    {
+                        out.appendChar(char(value));
+                    }
+                    else
+                    {
+                        // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+                        // but 6 are possible, so lets go large.
+                        const Index maxUtf8EncodeCount = 6;
 
-                    char* chars = out.prepareForAppend(maxUtf8EncodeCount);
-                    int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
-                    out.appendInPlace(chars, numChars);
+                        char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+                        int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+                        out.appendInPlace(chars, numChars);
+                    }
 
+                    // Reset start
                     start = cur;
                     break;
                 }
@@ -363,6 +452,11 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
                 }
             }
         }
+        else
+        {
+            // Next char
+            ++cur;
+        }
     }
 
     if (start < end)
@@ -850,6 +944,9 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
         case Style::Cpp:    return &g_cppHandler;
         case Style::Space:  return &g_spaceHandler;
         case Style::JSON:   return &g_jsonHandler;
+        // TODO(JS): For now we make Slang language string encoding/decoding the same as C++
+        // That may not be desirable because C++ has a variety of surprising edge cases (for example around \x)
+        case Style::Slang:  return &g_cppHandler;
         default:            return nullptr;
     }
 }
diff --git a/source/core/slang-string-escape-util.h b/source/core/slang-string-escape-util.h
index 5f749c423..c3a43b623 100644
--- a/source/core/slang-string-escape-util.h
+++ b/source/core/slang-string-escape-util.h
@@ -55,6 +55,8 @@ struct StringEscapeUtil
         Cpp,            ///< Cpp style quoting and escape handling
         Space,          ///< Applies quotes if there are spaces. Does not escape.
         JSON,           ///< Json encoding
+        Slang,          ///< Slang style string encoding (For now same as Cpp but that may change in the future)
+        CountOf,
     };
 
         /// Given a style returns a handler
diff --git a/source/core/slang-string.h b/source/core/slang-string.h
index 6dc3275eb..5119dac8f 100644
--- a/source/core/slang-string.h
+++ b/source/core/slang-string.h
@@ -62,6 +62,10 @@ namespace Slang
         return (((unsigned char)ch) & 0xC0) == 0x80;
     }
 
+    /* A string slice that doesn't own the contained characters.
+    It is the responsibility of code using the type to keep the memory backing 
+    the slice in scope.
+    A slice is generally *not* zero terminated. */
     struct SLANG_RT_API UnownedStringSlice
     {
     public:
@@ -85,15 +89,9 @@ namespace Slang
             , m_end(b + len)
         {}
 
-        char const* begin() const
-        {
-            return m_begin;
-        }
+        SLANG_FORCE_INLINE char const* begin() const { return m_begin; }
 
-        char const* end() const
-        {
-            return m_end;
-        }
+        SLANG_FORCE_INLINE char const* end() const { return m_end; }
 
             /// True if slice is strictly contained in memory.
         bool isMemoryContained(const UnownedStringSlice& slice) const
@@ -105,10 +103,8 @@ namespace Slang
             return pos >= m_begin && pos <= m_end;
         }
 
-        Index getLength() const
-        {
-            return Index(m_end - m_begin);
-        }
+            /// Get the length in *bytes*
+        Count getLength() const { return Index(m_end - m_begin); }
 
             /// Finds first index of char 'c'. If not found returns -1.
         Index indexOf(char c) const;
@@ -179,7 +175,7 @@ namespace Slang
         template <size_t SIZE> 
         SLANG_FORCE_INLINE static UnownedStringSlice fromLiteral(const char (&in)[SIZE]) { return UnownedStringSlice(in, SIZE - 1); }
 
-    private:
+    protected:
         char const* m_begin;
         char const* m_end;
     };
@@ -188,6 +184,40 @@ namespace Slang
     template <size_t SIZE>
     SLANG_FORCE_INLINE UnownedStringSlice toSlice(const char (&in)[SIZE]) { return UnownedStringSlice(in, SIZE - 1); }
 
+    /// A non-owning string slice that, unlike UnownedStringSlice, must be zero terminated.
+    /// The terminating zero is *not* included in the length.
+    struct SLANG_RT_API UnownedTerminatedStringSlice : public UnownedStringSlice
+    {
+    public:
+        typedef UnownedStringSlice Super;
+        typedef UnownedTerminatedStringSlice ThisType;
+
+            /// Implicitly converts to a regular zero terminated C string (the start of the slice)
+        SLANG_FORCE_INLINE operator const char*() const { return m_begin; }
+
+            /// Exists to match the equivalent function in String.
+        SLANG_FORCE_INLINE char const* getBuffer() const { return m_begin; }
+
+            /// Construct from a string literal. SIZE counts the terminating zero, so the length is SIZE - 1.
+        template <size_t SIZE>
+        SLANG_FORCE_INLINE static ThisType fromLiteral(const char(&in)[SIZE]) { return ThisType(in, SIZE - 1); }
+
+            /// Construct from a C string, measured with strlen. b must not be null (it is passed to strlen unchecked).
+        UnownedTerminatedStringSlice(char const* b)
+            : Super(b, b + strlen(b))
+        {}
+        UnownedTerminatedStringSlice(char const* b, size_t len) // len excludes the terminating zero
+            : Super(b, len)
+        {
+            // b must be non-null and b[len] must be the terminating zero
+            SLANG_ASSERT(b && b[len] == 0);
+        }
+    };
+
+    // A more convenient way to make terminated slices from *string literals*
+    template <size_t SIZE>
+    SLANG_FORCE_INLINE UnownedTerminatedStringSlice toTerminatedSlice(const char(&in)[SIZE]) { return UnownedTerminatedStringSlice(in, SIZE - 1); }
+
     // A `StringRepresentation` provides the backing storage for
     // all reference-counted string-related types.
     class SLANG_RT_API StringRepresentation : public RefObject
@@ -284,16 +314,6 @@ namespace Slang
 
     class String;
 
-
-
-    struct SLANG_RT_API UnownedTerminatedStringSlice : public UnownedStringSlice
-    {
-    public:
-        UnownedTerminatedStringSlice(char const* b)
-            : UnownedStringSlice(b, b + (b?strlen(b):0))
-        {}
-    };
-
     struct SLANG_RT_API StringSlice
     {
     public:
diff --git a/source/core/slang-token-reader.cpp b/source/core/slang-token-reader.cpp
index 7ffbc12fa..5acc1736c 100644
--- a/source/core/slang-token-reader.cpp
+++ b/source/core/slang-token-reader.cpp
@@ -671,93 +671,6 @@ namespace Misc {
         return TokenizeText("", text);
     }
 
-    String EscapeStringLiteral(String str)
-    {
-        StringBuilder sb;
-        sb << "\"";
-        const Index length = str.getLength();
-        const char*const data = str.getBuffer();
-        for (Index i = 0; i < length; i++)
-        {
-            switch (data[i])
-            {
-            case '\n':
-                sb << "\\n";
-                break;
-            case '\r':
-                sb << "\\r";
-                break;
-            case '\t':
-                sb << "\\t";
-                break;
-            case '\v':
-                sb << "\\v";
-                break;
-            case '\'':
-                sb << "\\\'";
-                break;
-            case '\"':
-                sb << "\\\"";
-                break;
-            case '\\':
-                sb << "\\\\";
-                break;
-            default:
-                sb << data[i];
-                break;
-            }
-        }
-        sb << "\"";
-        return sb.ProduceString();
-    }
-
-    String UnescapeStringLiteral(String str)
-    {
-        StringBuilder sb;
-        const Index length = str.getLength();
-        const char*const data = str.getBuffer();
-        for (Index i = 0; i < length; i++)
-        {
-            if (data[i] == '\\' && i < length - 1)
-            {
-                switch (data[i + 1])
-                {
-                case 's':
-                    sb << " ";
-                    break;
-                case 't':
-                    sb << '\t';
-                    break;
-                case 'n':
-                    sb << '\n';
-                    break;
-                case 'r':
-                    sb << '\r';
-                    break;
-                case 'v':
-                    sb << '\v';
-                    break;
-                case '\'':
-                    sb << '\'';
-                    break;
-                case '\"':
-                    sb << "\"";
-                    break;
-                case '\\':
-                    sb << "\\";
-                    break;
-                default:
-                    i = i - 1;
-                    sb << data[i];
-                }
-                i++;
-            }
-            else
-                sb << data[i];
-        }
-        return sb.ProduceString();
-    }
-
     TokenReader::TokenReader(String text)
     {
         this->tokens = TokenizeText("", text);
diff --git a/source/core/slang-token-reader.h b/source/core/slang-token-reader.h
index bf5ca4cdc..26539732c 100644
--- a/source/core/slang-token-reader.h
+++ b/source/core/slang-token-reader.h
@@ -295,8 +295,6 @@ namespace Misc {
         return result;
     }
 
-    String EscapeStringLiteral(String str);
-    String UnescapeStringLiteral(String str);
 } // namespace Misc
 } // namespace Slang
 
diff --git a/source/core/slang-type-text-util.cpp b/source/core/slang-type-text-util.cpp
index 13bf439ce..454ca4258 100644
--- a/source/core/slang-type-text-util.cpp
+++ b/source/core/slang-type-text-util.cpp
@@ -68,7 +68,7 @@ static const CompileTargetInfo s_compileTargetInfos[] =
     { SLANG_SPIRV_ASM,      "spv.asm",                                          "spirv-asm,spirv-assembly" },
     { SLANG_C_SOURCE,       "c",                                                "c" },
     { SLANG_CPP_SOURCE,     "cpp,c++,cxx",                                      "cpp,c++,cxx" },
-    { SLANG_HOST_CPP_SOURCE, "cpp,c++,cxx",                                     "cpp,c++,cxx"},
+    { SLANG_HOST_CPP_SOURCE, "cpp,c++,cxx",                                     "host-cpp,host-c++,host-cxx"},
     { SLANG_HOST_EXECUTABLE,"exe",                                              "exe,executable" },
     { SLANG_SHADER_SHARED_LIBRARY, "dll,so",                                    "sharedlib,sharedlibrary,dll" },
     { SLANG_CUDA_SOURCE,    "cu",                                               "cuda,cu"  },
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 476e88e3f..41cfea6af 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -378,6 +378,11 @@ __intrinsic_type($(kIROp_StringType))
 struct String
 {};
 
+__magic_type(NativeStringType)
+__intrinsic_type($(kIROp_NativeStringType))
+struct NativeString
+{};
+
 __magic_type(DynamicType)
 __intrinsic_type($(kIROp_DynamicType))
 struct __Dynamic
diff --git a/source/slang/slang-ast-builder.cpp b/source/slang/slang-ast-builder.cpp
index caf4c020d..7ac039187 100644
--- a/source/slang/slang-ast-builder.cpp
+++ b/source/slang/slang-ast-builder.cpp
@@ -90,6 +90,16 @@ Type* SharedASTBuilder::getStringType()
     return m_stringType;
 }
 
+Type* SharedASTBuilder::getNativeStringType()
+{
+    if (!m_nativeStringType) // Built on first request; later calls return the cached m_nativeStringType
+    {
+        auto nativeStringTypeDecl = findMagicDecl("NativeStringType"); // presumably the decl tagged __magic_type(NativeStringType) - confirm
+        m_nativeStringType = DeclRefType::create(m_astBuilder, makeDeclRef<Decl>(nativeStringTypeDecl));
+    }
+    return m_nativeStringType;
+}
+
 Type* SharedASTBuilder::getEnumTypeType()
 {
     if (!m_enumTypeType)
diff --git a/source/slang/slang-ast-builder.h b/source/slang/slang-ast-builder.h
index 0642455c3..97aefd118 100644
--- a/source/slang/slang-ast-builder.h
+++ b/source/slang/slang-ast-builder.h
@@ -23,6 +23,10 @@ public:
 
         /// Get the string type
     Type* getStringType();
+
+        /// Get the native string type
+    Type* getNativeStringType();
+
         /// Get the enum type type
     Type* getEnumTypeType();
         /// Get the __Dynamic type
@@ -65,6 +69,7 @@ protected:
     // TODO(tfoley): These should really belong to the compilation context!
     //
     Type* m_stringType = nullptr;
+    Type* m_nativeStringType = nullptr;
     Type* m_enumTypeType = nullptr;
     Type* m_dynamicType = nullptr;
     Type* m_nullPtrType = nullptr;
diff --git a/source/slang/slang-ast-type.h b/source/slang/slang-ast-type.h
index fee7f7cac..7aa1a36ab 100644
--- a/source/slang/slang-ast-type.h
+++ b/source/slang/slang-ast-type.h
@@ -460,12 +460,24 @@ private:
     Type* rowType = nullptr;
 };
 
-// The built-in `String` type
-class StringType : public BuiltinType 
+// Base class for built in string types
+class StringTypeBase : public BuiltinType
+{
+    SLANG_AST_CLASS(StringTypeBase)
+};
+
+// The regular built-in `String` type
+class StringType : public StringTypeBase
 {
     SLANG_AST_CLASS(StringType)
 };
 
+// The string type native to the target
+class NativeStringType : public StringTypeBase
+{
+    SLANG_AST_CLASS(NativeStringType)
+};
+
 // The built-in `__Dynamic` type
 class DynamicType : public BuiltinType
 {
diff --git a/source/slang/slang-check-conversion.cpp b/source/slang/slang-check-conversion.cpp
index a1935d65c..44bb8a610 100644
--- a/source/slang/slang-check-conversion.cpp
+++ b/source/slang/slang-check-conversion.cpp
@@ -639,6 +639,16 @@ namespace Slang
             return true;
         }
 
+        // If both are string types we assume they are convertible in both directions
+        if (as<StringTypeBase>(fromType) && as<StringTypeBase>(toType))
+        {
+            if (outToExpr)
+                *outToExpr = fromExpr;
+            if (outCost)
+                *outCost = kConversionCost_None;
+            return true;
+        }
+
         // Another important case is when either the "to" or "from" type
         // represents an error. In such a case we must have already
         // reporeted the error, so it is better to allow the conversion
diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp
index 9887f1ba6..482ada394 100644
--- a/source/slang/slang-emit-cpp.cpp
+++ b/source/slang/slang-emit-cpp.cpp
@@ -532,6 +532,11 @@ SlangResult CPPSourceEmitter::calcTypeName(IRType* type, CodeGenTarget target, S
             out << "TypeInfo*";
             return SLANG_OK;
         }
+        case kIROp_NativeStringType:
+        {
+            out << "const char*";
+            return SLANG_OK;
+        }
         case kIROp_StringType:
         {
             out << "String";
@@ -2411,8 +2416,15 @@ bool CPPSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOut
         }
         case kIROp_StringLit:
         {
-            m_writer->emit("String(");
-            m_writer->emit(Slang::Misc::EscapeStringLiteral(as<IRStringLit>(inst)->getStringSlice()));
+            m_writer->emit("toTerminatedSlice(");
+
+            auto handler = StringEscapeUtil::getHandler(StringEscapeUtil::Style::Cpp);
+            
+            StringBuilder buf;
+            const auto slice = as<IRStringLit>(inst)->getStringSlice();
+            StringEscapeUtil::appendQuoted(handler, slice, buf);
+            m_writer->emit(buf);
+
             m_writer->emit(")");
             return true;
         }
diff --git a/source/slang/slang-emit-glsl.cpp b/source/slang/slang-emit-glsl.cpp
index c1bbf813b..b0c10fdc2 100644
--- a/source/slang/slang-emit-glsl.cpp
+++ b/source/slang/slang-emit-glsl.cpp
@@ -1967,7 +1967,12 @@ void GLSLSourceEmitter::emitSimpleTypeImpl(IRType* type)
             }
             return;
         }
-        case kIROp_StringType: m_writer->emit("int"); return;
+        case kIROp_NativeStringType:
+        case kIROp_StringType: 
+        {
+            m_writer->emit("int"); 
+            return;
+        }
         default: break;
     }
 
diff --git a/source/slang/slang-emit-hlsl.cpp b/source/slang/slang-emit-hlsl.cpp
index 2d42aef83..48fe86fff 100644
--- a/source/slang/slang-emit-hlsl.cpp
+++ b/source/slang/slang-emit-hlsl.cpp
@@ -853,7 +853,12 @@ void HLSLSourceEmitter::emitSimpleTypeImpl(IRType* type)
             }
             return;
         }
-        case kIROp_StringType: m_writer->emit("int"); return;
+        case kIROp_NativeStringType:
+        case kIROp_StringType: 
+        {
+            m_writer->emit("int"); 
+            return;
+        }
         default: break;
     }
 
diff --git a/source/slang/slang-ir-collect-global-uniforms.cpp b/source/slang/slang-ir-collect-global-uniforms.cpp
index 87b21c819..ca5e56b53 100644
--- a/source/slang/slang-ir-collect-global-uniforms.cpp
+++ b/source/slang/slang-ir-collect-global-uniforms.cpp
@@ -69,6 +69,11 @@ struct CollectGlobalUniformParametersContext
     //
     void processModule()
     {
+        if (!globalScopeVarLayout)
+        {
+            return;
+        }
+
         // We start by looking at the layout that was computed for the global-scope
         // parameters to determine how the parameters are supposed to be pacakged.
         //
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index f9e0a5f34..c617a0218 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -24,7 +24,10 @@ INST(Nop, nop, 0, 0)
 
     INST_RANGE(BasicType, VoidType, AfterBaseType)
 
-    INST(StringType, String, 0, 0)
+    /* StringTypeBase */
+        INST(StringType, String, 0, 0)
+        INST(NativeStringType, NativeString, 0, 0)
+    INST_RANGE(StringTypeBase, StringType, NativeStringType)
 
     INST(CapabilitySetType, CapabilitySet, 0, 0)
 
diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h
index 0e54802e5..77b3eabc0 100644
--- a/source/slang/slang-ir-insts.h
+++ b/source/slang/slang-ir-insts.h
@@ -2080,6 +2080,7 @@ public:
     IRBasicType* getUInt64Type();
     IRBasicType* getCharType();
     IRStringType* getStringType();
+    IRNativeStringType* getNativeStringType();
 
     IRType* getCapabilitySetType();
 
diff --git a/source/slang/slang-ir-link.cpp b/source/slang/slang-ir-link.cpp
index b67d95abf..7984c5037 100644
--- a/source/slang/slang-ir-link.cpp
+++ b/source/slang/slang-ir-link.cpp
@@ -1477,10 +1477,13 @@ LinkedIR linkIR(
     // need to operate on all the global parameters can do so.
     //
     IRVarLayout* irGlobalScopeVarLayout = nullptr;
-    if( auto irGlobalScopeLayoutDecoration = irModuleForLayout->getModuleInst()->findDecoration<IRLayoutDecoration>() )
+    if (irModuleForLayout)
     {
-        auto irOriginalGlobalScopeVarLayout = irGlobalScopeLayoutDecoration->getLayout();
-        irGlobalScopeVarLayout = cast<IRVarLayout>(cloneValue(context, irOriginalGlobalScopeVarLayout));
+        if( auto irGlobalScopeLayoutDecoration = irModuleForLayout->getModuleInst()->findDecoration<IRLayoutDecoration>() )
+        {
+            auto irOriginalGlobalScopeVarLayout = irGlobalScopeLayoutDecoration->getLayout();
+            irGlobalScopeVarLayout = cast<IRVarLayout>(cloneValue(context, irOriginalGlobalScopeVarLayout));
+        }
     }
 
     // Bindings for global generic parameters are currently represented
diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp
index 9de2f5b4f..d454333e6 100644
--- a/source/slang/slang-ir.cpp
+++ b/source/slang/slang-ir.cpp
@@ -2508,6 +2508,12 @@ namespace Slang
         return (IRStringType*)getType(kIROp_StringType);
     }
 
+    IRNativeStringType* IRBuilder::getNativeStringType() // Returns the IR type for the kIROp_NativeStringType opcode
+    {
+        return (IRNativeStringType*)getType(kIROp_NativeStringType);
+    }
+
+
     IRType* IRBuilder::getCapabilitySetType()
     {
         return getType(kIROp_CapabilitySetType);
@@ -4676,106 +4682,13 @@ namespace Slang
         dumpDebugID(context, inst);
     }
     
-    struct StringEncoder
-    {
-        static char getHexChar(int v)
-        {
-            return (v <= 9) ? char(v + '0') : char(v - 10 + 'A');
-        }
-
-        void flush(const char* pos)
-        {
-            if (pos > m_runStart)
-            {
-                m_builder->append(m_runStart, pos);
-            }
-            m_runStart = pos + 1;
-        }
-
-        void appendEscapedChar(const char* pos, char encodeChar)
-        {
-            flush(pos);
-            const char chars[] = { '\\', encodeChar };
-            m_builder->Append(chars, 2);
-        }
-        
-        void appendAsHex(const char* pos)
-        {
-            flush(pos);
-
-            const int v = *(const uint8_t*)pos;
-
-            char buf[5];
-            buf[0] = '\\';
-            buf[1] = 'x';
-            buf[2] = '0';
-
-            buf[3] = getHexChar(v >> 4);
-            buf[4] = getHexChar(v & 0xf);
-
-            m_builder->Append(buf, 5);
-        }
-
-        StringEncoder(StringBuilder* builder, const char* start):
-            m_runStart(start),
-            m_builder(builder)
-        {}
-
-        StringBuilder* m_builder;
-        const char* m_runStart;
-    };
-
     static void dumpEncodeString(
         IRDumpContext*  context, 
         const UnownedStringSlice& slice)
     {
-        // https://msdn.microsoft.com/en-us/library/69ze775t.aspx
-
+        auto handler = StringEscapeUtil::getHandler(StringEscapeUtil::Style::Slang);
         StringBuilder& builder = *context->builder;
-        builder.Append('"');
-        
-        {
-            const char* cur = slice.begin();
-            StringEncoder encoder(&builder, cur);
-            const char* end = slice.end();
-
-            for (; cur < end; cur++)
-            {
-                const int8_t c = uint8_t(*cur);
-                switch (c)
-                {
-                    case '\\':
-                        encoder.appendEscapedChar(cur, '\\');
-                        break;
-                    case '"':
-                        encoder.appendEscapedChar(cur, '"');
-                        break;
-                    case '\n': 
-                        encoder.appendEscapedChar(cur, 'n');
-                        break;
-                    case '\t':
-                        encoder.appendEscapedChar(cur, 't');
-                        break;
-                    case '\r':
-                        encoder.appendEscapedChar(cur, 'r');
-                        break;
-                    case '\0':
-                        encoder.appendEscapedChar(cur, '0');
-                        break;
-                    default:
-                    {
-                        if (c < 32)
-                        {
-                            encoder.appendAsHex(cur);
-                        }
-                        break;
-                    }
-                }
-            }
-            encoder.flush(end);
-        }
-        
-        builder.Append('"');
+        StringEscapeUtil::appendQuoted(handler, slice, builder);
     }
 
     static void dumpType(
diff --git a/source/slang/slang-ir.h b/source/slang/slang-ir.h
index 7a1a0b8aa..6c766542f 100644
--- a/source/slang/slang-ir.h
+++ b/source/slang/slang-ir.h
@@ -786,7 +786,13 @@ struct IRBoolType : IRBasicType
     IR_LEAF_ISA(BoolType)
 };
 
-SIMPLE_IR_TYPE(StringType, Type)
+struct IRStringTypeBase : IRType // Parent for the StringType / NativeStringType IR types declared just below
+{
+    IR_PARENT_ISA(StringTypeBase)
+};
+
+SIMPLE_IR_TYPE(StringType, StringTypeBase)
+SIMPLE_IR_TYPE(NativeStringType, StringTypeBase)
 
 SIMPLE_IR_TYPE(DynamicType, Type)
 
diff --git a/tools/slang-unit-test/unit-offset-container.cpp b/tools/slang-unit-test/unit-test-offset-container.cpp
index 6a179c319..9d8e3a9ff 100644
--- a/tools/slang-unit-test/unit-offset-container.cpp
+++ b/tools/slang-unit-test/unit-test-offset-container.cpp
@@ -1,4 +1,4 @@
-// unit-test-path.cpp
+// unit-test-offset-container.cpp
 
 #include "../../source/core/slang-offset-container.h"
 
diff --git a/tools/slang-unit-test/unit-test-string-escape.cpp b/tools/slang-unit-test/unit-test-string-escape.cpp
new file mode 100644
index 000000000..337573081
--- /dev/null
+++ b/tools/slang-unit-test/unit-test-string-escape.cpp
@@ -0,0 +1,85 @@
+// unit-test-string-escape.cpp
+
+#include "../../source/core/slang-string-escape-util.h"
+
+#include "tools/unit-test/slang-unit-test.h"
+
+using namespace Slang;
+
+// Round-trip check: escape `check` with `handler`, unescape the result with the
+// same handler, and report whether the original text was recovered.
+static bool _checkConversion(StringEscapeHandler* handler, const UnownedStringSlice& check)
+{
+	StringBuilder buf;
+	handler->appendEscaped(check, buf);
+
+	StringBuilder decode;
+	handler->appendUnescaped(buf.getUnownedSlice(), decode);
+
+	return decode == check;
+}
+
+// Decode check: run `appendUnquoted` on `encoded` with the C++ style handler and
+// report whether the output equals `decoded`.
+static bool _checkDecode(const UnownedStringSlice& encoded, const UnownedStringSlice& decoded)
+{
+	auto handler = StringEscapeUtil::getHandler(StringEscapeUtil::Style::Cpp);
+
+	StringBuilder buf;
+	StringEscapeUtil::appendUnquoted(handler, encoded, buf);
+	return buf == decoded;
+}
+
+// Declares `encoded` (the literal as spelled in source, quotes included, via
+// stringification) and `decoded` (the value the host compiler produced for it).
+#define SLANG_ENCODED_DECODED(x) \
+		const auto encoded = toSlice(#x); \
+		const auto decoded = toSlice(x);
+
+SLANG_UNIT_TEST(StringEscape)
+{
+	// Check that the host compiler treats hex escapes greedily
+	{
+		// \x can have any number of hex digits
+		const char text[] = "\x000001";
+		SLANG_CHECK(SLANG_COUNT_OF(text) == 2 && text[0] == 1);
+	}
+
+	// Check that the host compiler ends an octal escape after at most 3 digits
+	{
+		// A backslash followed by up to 3 octal digits
+		const char text[] = "\0011";
+		SLANG_CHECK(SLANG_COUNT_OF(text) == 3 && text[0] == 1 && text[1] == '1');
+
+		const char text2[] = "\78";
+		SLANG_CHECK(SLANG_COUNT_OF(text2) == 3 && text2[0] == 7 && text2[1] == '8');
+	}
+
+	// Escaping then unescaping must round-trip, including NUL and a digit directly following an escape
+	{
+		auto handler = StringEscapeUtil::getHandler(StringEscapeUtil::Style::Cpp);
+
+		SLANG_CHECK(_checkConversion(handler, toSlice("\0\1\2""2")));
+	}
+
+	// Decoding the quoted source spelling must give the same bytes the host compiler produced
+	{
+		// We can't just use '\uxxxx', because it has to be translatable into an output character in MSVC (not into utf8)
+		// Can make work perhaps with something like
+		// #pragma execution_character_set("utf-8")
+		// But for now we don't worry
+		//
+		// Visual Studio does not appear to support '\U' by default, presumably because wchar_t is 16 bits
+
+		{
+			SLANG_ENCODED_DECODED("\a\b\0hey~\u0023\n\0");
+			SLANG_CHECK(_checkDecode(encoded, decoded));
+		}
+
+		{
+			SLANG_ENCODED_DECODED("\n\v\b\t\1\02\003\x5z\x00007f\0");
+			SLANG_CHECK(_checkDecode(encoded, decoded));
+		}
+	}
+}
+
author	jsmall-nvidia <jsmall@nvidia.com>	2022-05-27 17:28:05 -0400
committer	GitHub <noreply@github.com>	2022-05-27 17:28:05 -0400
commit	2d3392f22c894957d17dd13486e0565c4ecea89c (patch)
tree	ce4dadbd85a59e52725fa6f92613553cd5b29859
parent	abb89b3e460e11e8f9a134199c2d559190bfc47e (diff)