summaryrefslogtreecommitdiffstats
path: root/source/compiler-core/slang-source-loc.cpp
diff options
context:
space:
mode:
authorcheneym2 <acheney@nvidia.com>2024-05-14 14:05:58 -0400
committerGitHub <noreply@github.com>2024-05-14 11:05:58 -0700
commit291b4cd82cebeed39d8c06c8208fc415dfa32a48 (patch)
treeaa3f61e2fbfa760e61f00e015800939579961dd4 /source/compiler-core/slang-source-loc.cpp
parent9ab24cfa4ef2a5b4572a28eff5a65266416b0a88 (diff)
Slang: Support UTF-8 with Byte Order Markers (#4135)
Slang APIs are documented as taking UTF-8 encoded shader source, though it's not explicitly documented whether the source is allowed to include a BOM (Byte Order Mark). This change adds support for a UTF-8 BOM by discarding the BOM bytes on input. As a bonus, UTF-16 input which can cleanly decode to UTF-8 is now also accepted. Throwing out the BOM on input is done by leveraging existing functionality in "determineEncoding()"; however, a bug exists there for null-terminated single-character input, where the null byte caused a heuristic to guess UTF-16, even though the null byte isn't part of the string. The bug in "determineEncoding()" is fixed by only guessing when the buffer contains at least 2 bytes and by not looking past the end of the buffer. The 'implicit-cast' test was mistakenly relying on the bug to pass, as its expected file was being read as UTF-16 and cropped to zero length due to the bug. The expected output of 'implicit-cast' is updated to pass with the bug fix in place. The decoding of UTF-16 to UTF-8 is done through an existing 'decode' method. This change fixes a bug in the UTF-16LE 'decode' where input was decoded as if it were big-endian. Adds 3 small tests to ensure the compiler doesn't choke on source files in UTF-8 (with BOM), UTF-16LE, or UTF-16BE. Bonus: Fixes a bug in diagnostic reporting where hex values were incorrectly translated to text, leading to incorrect, possibly truncated strings. Fixes #4046 Co-authored-by: Yong He <yonghe@outlook.com>
Diffstat (limited to 'source/compiler-core/slang-source-loc.cpp')
-rw-r--r--source/compiler-core/slang-source-loc.cpp31
1 files changed, 25 insertions, 6 deletions
diff --git a/source/compiler-core/slang-source-loc.cpp b/source/compiler-core/slang-source-loc.cpp
index 872c40f0d..75601b815 100644
--- a/source/compiler-core/slang-source-loc.cpp
+++ b/source/compiler-core/slang-source-loc.cpp
@@ -573,15 +573,34 @@ int SourceFile::calcColumnIndex(int lineIndex, int offset, int tabSize)
void SourceFile::setContents(ISlangBlob* blob)
{
- const UInt contentSize = blob->getBufferSize();
+ const UInt rawContentSize = blob->getBufferSize();
- SLANG_ASSERT(contentSize == m_contentSize);
+ SLANG_ASSERT(rawContentSize == m_contentSize);
- char const* contentBegin = (char const*)blob->getBufferPointer();
- char const* contentEnd = contentBegin + contentSize;
+ Byte* rawContentBegin = (Byte*)blob->getBufferPointer();
- m_contentBlob = blob;
- m_content = UnownedStringSlice(contentBegin, contentEnd);
+ // Query the encoding type; `offset` is the count of leading Byte-Order-Mark bytes to skip before decoding
+ size_t offset = 0;
+ auto type = CharEncoding::determineEncoding(
+ rawContentBegin,
+ rawContentSize,
+ offset);
+ SLANG_ASSERT(rawContentSize >= offset);
+ // Decode everything after the BOM into a scratch buffer using the decoder for the detected encoding
+ List<char> decodedBuffer;
+ CharEncoding::getEncoding(type)->decode(
+ rawContentBegin + offset,
+ int(rawContentSize - offset),
+ decodedBuffer);
+ // Keep the decoded bytes in m_contentBlob; m_content below views this blob, not the caller's `blob`
+ m_contentBlob = RawBlob::create(decodedBuffer.getBuffer(), decodedBuffer.getCount());
+
+ char const* decodedContentBegin = (char const*)m_contentBlob->getBufferPointer();
+ const UInt decodedContentSize = m_contentBlob->getBufferSize();
+ // No `decoded <= raw` size assert: a 2-byte UTF-16 code unit can become 3 UTF-8 bytes, so decoded text may be larger
+ char const* decodedContentEnd = decodedContentBegin + decodedContentSize;
+
+ m_content = UnownedStringSlice(decodedContentBegin, decodedContentEnd);
}
void SourceFile::setContents(const String& content)