From 291b4cd82cebeed39d8c06c8208fc415dfa32a48 Mon Sep 17 00:00:00 2001
From: cheneym2 <acheney@nvidia.com>
Date: Tue, 14 May 2024 14:05:58 -0400
Subject: Slang: Support UTF-8 with Byte Order Markers (#4135)

Slang APIs are documented as taking UTF-8 encoded shader source,
though it's not explicitly documented whether the source is allowed
to include a BOM (Byte Order Mark). This change adds support for a
UTF-8 BOM by detecting and discarding it on input. As a bonus,
UTF-16 input which can cleanly decode to UTF-8 is now also
accepted.

Throwing out the BOM on input is done by leveraging existing
functionality in "determineEncoding()". However, a bug existed
there for null-terminated single-character input, where the null
byte caused a heuristic to guess UTF-16, even though the null byte
isn't part of the string. The bug in "determineEncoding()" is fixed
by only guessing when the input is at least 2 bytes long and by not
looking past the end of the buffer. The 'implicit-cast' test was
mistakenly relying on the bug to pass, as its expected file was
being read as UTF-16 and cropped to zero length due to the bug. The
expected output of implicit-cast is updated to pass with the bug fix
in place.

The decoding of UTF-16 to UTF-8 is done through an existing
'decode' method. This change fixes a bug in that method where
UTF16-LE input was decoded as if it were big-endian.

Adds 3 small tests to ensure the compiler doesn't choke on source
files in UTF-8 (with BOM), UTF16-LE, or UTF16-BE.

Bonus: Fixes a bug in JSON lexer diagnostic reporting where the hex
digit table was missing '8', so hex values were incorrectly
translated to text, leading to incorrect, possibly truncated
strings.

Fixes #4046

Co-authored-by: Yong He <yonghe@outlook.com>
---
 source/compiler-core/slang-json-lexer.cpp   |   2 +-
 source/compiler-core/slang-source-loc.cpp   |  31 ++++++++++++++++++++++------
 source/core/slang-char-encode.cpp           |  25 +++++++++++-----------
 tests/bugs/implicit-cast.slang.expected.txt |   5 ++++-
 tests/preprocessor/utf16_be_bom_crlf.slang  | Bin 0 -> 70 bytes
 tests/preprocessor/utf16_le_bom_crlf.slang  | Bin 0 -> 70 bytes
 tests/preprocessor/utf8_bom_crlf.slang      |   4 ++++
 7 files changed, 46 insertions(+), 21 deletions(-)
 create mode 100644 tests/preprocessor/utf16_be_bom_crlf.slang
 create mode 100644 tests/preprocessor/utf16_le_bom_crlf.slang
 create mode 100644 tests/preprocessor/utf8_bom_crlf.slang

diff --git a/source/compiler-core/slang-json-lexer.cpp b/source/compiler-core/slang-json-lexer.cpp
index 0476ca37a..a335403e1 100644
--- a/source/compiler-core/slang-json-lexer.cpp
+++ b/source/compiler-core/slang-json-lexer.cpp
@@ -238,7 +238,7 @@ JSONTokenType JSONLexer::advance()
                 StringBuilder buf;
                 if (c <= ' ' || c >= 0x7e)
                 {
-                    static const char s_hex[] = "012345679abcdef";
+                    static const char s_hex[] = "0123456789abcdef";
 
                     char hexBuf[5] = "0x";
       
diff --git a/source/compiler-core/slang-source-loc.cpp b/source/compiler-core/slang-source-loc.cpp
index 872c40f0d..75601b815 100644
--- a/source/compiler-core/slang-source-loc.cpp
+++ b/source/compiler-core/slang-source-loc.cpp
@@ -573,15 +573,34 @@ int SourceFile::calcColumnIndex(int lineIndex, int offset, int tabSize)
 
 void SourceFile::setContents(ISlangBlob* blob)
 {
-    const UInt contentSize = blob->getBufferSize();
+    const UInt rawContentSize = blob->getBufferSize(); // size in bytes of the blob as supplied, before any decoding
 
-    SLANG_ASSERT(contentSize == m_contentSize);
+    SLANG_ASSERT(rawContentSize == m_contentSize);
 
-    char const* contentBegin = (char const*)blob->getBufferPointer();
-    char const* contentEnd = contentBegin + contentSize;
+    Byte* rawContentBegin = (Byte*)blob->getBufferPointer();
 
-    m_contentBlob = blob;
-    m_content = UnownedStringSlice(contentBegin, contentEnd);
+    // Determine the encoding of the raw bytes; `offset` receives the number of leading bytes (the Unicode byte order mark) to skip before decoding
+    size_t offset;
+    auto type = CharEncoding::determineEncoding(
+        rawContentBegin,
+        rawContentSize,
+        offset);
+    SLANG_ASSERT(rawContentSize >= offset); // the skipped prefix must lie within the buffer
+
+    List<char> decodedBuffer;
+    CharEncoding::getEncoding(type)->decode(
+        rawContentBegin + offset,
+        int(rawContentSize - offset),
+        decodedBuffer);
+
+    m_contentBlob = RawBlob::create(decodedBuffer.getBuffer(), decodedBuffer.getCount()); // the stored blob is now built from the decoded buffer, not the caller's `blob`
+
+    char const* decodedContentBegin = (char const*)m_contentBlob->getBufferPointer();
+    const UInt decodedContentSize = m_contentBlob->getBufferSize();
+    assert(decodedContentSize <= rawContentSize); // NOTE(review): UTF-16 text can grow when re-encoded as UTF-8 (U+0800..U+FFFF: 2 -> 3 bytes) - confirm this bound holds for all accepted input
+    char const* decodedContentEnd = decodedContentBegin + decodedContentSize;
+
+    m_content = UnownedStringSlice(decodedContentBegin, decodedContentEnd); // view into m_contentBlob's buffer
 }
 
 void SourceFile::setContents(const String& content)
diff --git a/source/core/slang-char-encode.cpp b/source/core/slang-char-encode.cpp
index 687040fa2..105cfac7f 100644
--- a/source/core/slang-char-encode.cpp
+++ b/source/core/slang-char-encode.cpp
@@ -92,17 +92,17 @@ public:
 		Index index = 0;
 		while (index < length)
 		{
-			const Char32 codePoint = getUnicodePointFromUTF16([&]() -> Byte
+			auto readByte = [&]() -> Byte
 			{
-                if (index < length)
-                    return bytes[index++];
-                else
-                    return Byte(0);
-			});
+				return (index < length) ? bytes[index++] : Byte(0);
+			};
+			const Char32 codePoint = m_reverseOrder ?
+				getUnicodePointFromUTF16Reversed(readByte) :
+				getUnicodePointFromUTF16(readByte);
 
 			char buf[5];
 			int count = encodeUnicodePointToUTF8(codePoint, buf);
-            ioBuffer.addRange((const char*)buf, count);
+			ioBuffer.addRange((const char*)buf, count);
 		}
 	}
 
@@ -134,11 +134,10 @@ private:
             outOffset = 2;
             return CharEncodeType::UTF16Reversed;
         }
-    }
-    else
-    {
-        // If we don't have a 'mark' byte then we are bit stumped. We'll look for a null bytes and assume they mean we have a 16 bit encoding
-        for (size_t i = 0; i < bytesCount; i += 2)
+
+        // If we don't have a 'mark' byte then we are bit stumped. We'll look for
+        // null (non-terminator) bytes and assume they mean we have a 16-bit encoding
+        for(size_t i = 0; i < (bytesCount-1); i += 2)
         {
 #if SLANG_LITTLE_ENDIAN
             const auto low = bytes[i];
@@ -146,7 +145,7 @@ private:
 #else
             const auto low = bytes[i + 1];
             const auto high = bytes[i];
-#endif 
+#endif
             if ((low == 0) ^ (high == 0))
             {
                 outOffset = 2;
diff --git a/tests/bugs/implicit-cast.slang.expected.txt b/tests/bugs/implicit-cast.slang.expected.txt
index d8263ee98..5a4d7b6ab 100644
--- a/tests/bugs/implicit-cast.slang.expected.txt
+++ b/tests/bugs/implicit-cast.slang.expected.txt
@@ -1 +1,4 @@
-2
\ No newline at end of file
+0
+1
+0
+0
diff --git a/tests/preprocessor/utf16_be_bom_crlf.slang b/tests/preprocessor/utf16_be_bom_crlf.slang
new file mode 100644
index 000000000..eba197eff
Binary files /dev/null and b/tests/preprocessor/utf16_be_bom_crlf.slang differ
diff --git a/tests/preprocessor/utf16_le_bom_crlf.slang b/tests/preprocessor/utf16_le_bom_crlf.slang
new file mode 100644
index 000000000..e0005e1e7
Binary files /dev/null and b/tests/preprocessor/utf16_le_bom_crlf.slang differ
diff --git a/tests/preprocessor/utf8_bom_crlf.slang b/tests/preprocessor/utf8_bom_crlf.slang
new file mode 100644
index 000000000..bc8a6b12b
--- /dev/null
+++ b/tests/preprocessor/utf8_bom_crlf.slang
@@ -0,0 +1,4 @@
+﻿void main()
+{
+}
+//TEST:SIMPLE:
-- 
cgit v1.2.3