Handle unexpected character in mangled names (#1919)

This change is related to a case where a user saw a `/` character printed as part of a structure field name in output HLSL (which obviously broke downstream compilation). The root cause in that case was that a module that was `import`ed was found via a file system directory path, and thus the module name that the Slang compiler formed included a `/`. Due to other factors the structure field was emitted based on its mangled name (missing name hint), and that meant the bare `/` escaped into the output. That situation implies some other things maybe are questionable: * Could we have avoided a `/` ending up in the module name to begin with? Maybe, but we kind of want to support non-ASCII characters in names in the long run. * Could we have fixed whatever was causing the name hint to be dropped? Maybe, but we don't ever want name hints to be *required* for valid compilation (hence why they are "hints"). Even if we investigate these other issues, it is important that the name mangling scheme only ever produce output that uses a constrained subset of ASCII that we expect most compilation targets can support (GLSL being an annoying outlier because of its rules around `_`). This change adds a path where when mangling a `Name` as part of a mangled symbol name we check for the presence of bytes that aren't allowed and then produce an escaped form instead if needed. Note that the code is still byte-oriented for now aothough in the long-run it might want to be oriented around Unicode code points or extended grapheme clusters.
author: Theresa Foley <tfoleyNV@users.noreply.github.com> 2021-08-11 12:24:43 -0700
committer: GitHub <noreply@github.com> 2021-08-11 12:24:43 -0700
commit: 389d21d982da34815b65b10cae63088c397eecc8 (patch)
tree: 82b1d11a139074845b53bf2971cc0e1c5a2b1f1f /source
parent: 4323a15708cafa6411ec10ef8fefb736f8bda82c (diff)
2 files changed, 83 insertions, 14 deletions
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index 47c584251..84c369d40 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -540,19 +540,28 @@ void CLikeSourceEmitter::appendScrubbedName(const UnownedStringSlice& name, Stri
 
     for(auto c : name)
     {
-        // We will treat a dot character just like an underscore
-        // for the purposes of producing a scrubbed name, so
-        // that we translate `SomeType.someMethod` into
-        // `SomeType_someMethod`.
+        // We will treat a dot character or any path separator
+        // just like an underscore for the purposes of producing
+        // a scrubbed name, so that we translate `SomeType.someMethod`
+        // into `SomeType_someMethod`. This increases the readability
+        // of output code when the input used lots of nesting of
+        // code under types/namespaces/etc.
         //
         // By handling this case at the top of this loop, we
         // ensure that a `.`-turned-`_` is handled just like
         // a `_` in the original name, and will be properly
         // scrubbed for GLSL output.
         //
-        if(c == '.')
+        switch(c)
         {
+        default:
+            break;
+
+        case '.':
+        case '\\':
+        case '/':
             c = '_';
+            break;
         }
 
         if(((c >= 'a') && (c <= 'z'))
diff --git a/source/slang/slang-mangle.cpp b/source/slang/slang-mangle.cpp
index ed38a1261..3b65d196d 100644
--- a/source/slang/slang-mangle.cpp
+++ b/source/slang/slang-mangle.cpp
@@ -41,19 +41,79 @@ namespace Slang
         Name*               name)
     {
         String str = getText(name);
+        Index length = str.getLength();
 
         // If the name consists of only traditional "identifer characters"
-        // (`[a-zA-Z_]`), then we wnat to emit it more or less directly.
+        // (`[a-zA-Z_]`), then we want to emit it more or less directly.
         //
-        // If it contains code points outside that range, we'll need to
-        // do something to encode them. I don't want to deal with
-        // that right now, so I'm going to ignore it.
+        bool allAllowed = true;
+        for (auto c : str)
+        {
+            if (('a' <= c) && (c <= 'z'))   continue;
+            if (('A' <= c) && (c <= 'Z'))   continue;
+            if (('0' <= c) && (c <= '9'))   continue;
+            if (c == '_')                   continue;
 
-        // We prefix the string with its byte length, so that
-        // decoding doesn't have to worry about finding a terminator.
-        Index length = str.getLength();
-        emit(context, length);
-        context->sb.append(str);
+            allAllowed = false;
+            break;
+        }
+        if (allAllowed)
+        {
+            // We prefix the string with its byte length, so that
+            // decoding doesn't have to worry about finding a terminator.
+            //
+            // Note: in this case `length` is the same as the number of
+            // code points and the number of extended grapheme clusters,
+            // since the entire name is within the ASCII subset.
+            //
+            emit(context, length);
+            context->sb.append(str);
+        }
+        else
+        {
+            // Other names that aren't pure ASCII require escaping. We
+            // will use a rather simple escaping scheme where the basic
+            // ASCII alphanumeric code points go through unmodified,
+            // and we use `_` as a kind of escape character.
+            //
+            StringBuilder encoded;
+
+            // TODO: This loop probalby ought to be over code points
+            // rather than bytes.
+            //
+            for (auto c : str)
+            {
+                if (('a' <= c) && (c <= 'z'))   { encoded.append(c); }
+                if (('A' <= c) && (c <= 'Z'))   { encoded.append(c); }
+                if (('0' <= c) && (c <= '9'))   { encoded.append(c); }
+
+                if (c == '_')
+                {
+                    encoded.append("_u");
+                }
+                else
+                {
+                    // Any byte that isn't within the allowed ranges
+                    // we be turned into hex, prefixed with `_` and
+                    // suffixed with `x`.
+                    //
+                    encoded.append("_");
+                    encoded.append(uint32_t((unsigned char)c), 16);
+                    encoded.appendChar('x');
+                }
+            }
+
+            context->sb.append("R");
+            emit(context, encoded.getLength());
+            context->sb.append(encoded);
+        }
+
+        // TODO: This logic does not rule out consecutive underscores,
+        // even though the GLSL target does not support consecutive underscores
+        // (or leading underscores, IIRC) in user identifiers.
+        //
+        // Realistically, that is best dealt with as a quirk of tha particular
+        // target, rather than adding complexity here.
     }
 
     void emitVal(
author	Theresa Foley <tfoleyNV@users.noreply.github.com>	2021-08-11 12:24:43 -0700
committer	GitHub <noreply@github.com>	2021-08-11 12:24:43 -0700
commit	389d21d982da34815b65b10cae63088c397eecc8 (patch)
tree	82b1d11a139074845b53bf2971cc0e1c5a2b1f1f /source
parent	4323a15708cafa6411ec10ef8fefb736f8bda82c (diff)