summaryrefslogtreecommitdiffstats
path: root/source/core/slang-char-encode.cpp
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2021-10-04 14:15:51 -0400
committerGitHub <noreply@github.com>2021-10-04 14:15:51 -0400
commit97bb82ebcdf8f1391b9d93b5a8d7b1dfc4e88e52 (patch)
treef120ba282cbea96d23ed179737984a4610d3b520 /source/core/slang-char-encode.cpp
parentb3dfe383c6d31ff3dbd76dcfb32de8d536382f3e (diff)
Removing exceptions from core/compiler-core (#1953)
* #include an absolute path didn't work - because paths were taken to always be relative. * Refactor Stream. Working on all tests. * Split out CharEncode. * Make method names lower camel. m_prefix in Writer/Reader * Tidy up around CharEncode interface. * Small improvements around encode/decode. * Better use of types. * Remove readLine from TextReader. * Remove exceptions from Stream/Text handling. * Fix some typos. * Fix tabbing. * Fix missing override. * Remove remaining exception throw/catch via using signal mechanism. * Remove exceptions that are not used anymore. * Document the Stream interface. * Remove index for decoding 'get byte' function. * Fix CharReader -> ByteReader.
Diffstat (limited to 'source/core/slang-char-encode.cpp')
-rw-r--r--source/core/slang-char-encode.cpp181
1 files changed, 181 insertions, 0 deletions
diff --git a/source/core/slang-char-encode.cpp b/source/core/slang-char-encode.cpp
new file mode 100644
index 000000000..d061e34ba
--- /dev/null
+++ b/source/core/slang-char-encode.cpp
@@ -0,0 +1,181 @@
+#include "slang-char-encode.h"
+
+namespace Slang
+{
+
+/// CharEncoding registered as CharEncodeType::UTF8.
+/// Both directions append the input bytes to the output list unchanged.
+class Utf8CharEncoding : public CharEncoding
+{
+public:
+    typedef CharEncoding Super;
+
+    Utf8CharEncoding()
+        : Super(CharEncodeType::UTF8)
+    {
+    }
+
+    /// Appends the bytes of `slice`, unmodified, to the end of `ioBuffer`.
+    virtual void encode(const UnownedStringSlice& slice, List<Byte>& ioBuffer) override
+    {
+        const Byte* const sourceBytes = (const Byte*)slice.begin();
+        ioBuffer.addRange(sourceBytes, slice.getLength());
+    }
+
+    /// Appends `length` bytes starting at `bytes`, viewed as chars, to the end of `ioChars`.
+    virtual void decode(const Byte* bytes, int length, List<char>& ioChars) override
+    {
+        const char* const sourceChars = (const char*)bytes;
+        ioChars.addRange(sourceChars, length);
+    }
+};
+
+/// CharEncoding registered as CharEncodeType::UTF32.
+/// Code points are stored as 4 byte values in the byte order of the host architecture.
+class Utf32CharEncoding : public CharEncoding
+{
+public:
+    typedef CharEncoding Super;
+
+    /// Reads code points from the UTF8 text in `slice` and appends each one to `ioBuffer`
+    /// as 4 bytes.
+    virtual void encode(const UnownedStringSlice& slice, List<Byte>& ioBuffer) override
+    {
+        Index ptr = 0;
+        while (ptr < slice.getLength())
+        {
+            // The reader yields 0 once the slice is exhausted, so a truncated multi-byte
+            // sequence at the end of the slice cannot read past the end.
+            const Char32 codePoint = getUnicodePointFromUTF8([&]() -> Byte
+            {
+                if (ptr < slice.getLength())
+                    return slice[ptr++];
+                else
+                    return '\0';
+            });
+            // Note: Assumes byte order is same as arch byte order
+            ioBuffer.addRange((const Byte*)&codePoint, 4);
+        }
+    }
+
+    /// Converts the code points stored in `bytes` to UTF8 and appends the result to `ioBuffer`.
+    /// `length` is in bytes; any trailing bytes that do not make up a whole code point are ignored.
+    virtual void decode(const Byte* bytes, int length, List<char>& ioBuffer) override
+    {
+        const int codePointCount = length >> 2;
+        for (int i = 0; i < codePointCount; i++)
+        {
+            // Copy the value out rather than dereferencing a Char32*, such that `bytes`
+            // does not have to be Char32 aligned.
+            Char32 codePoint;
+            ::memcpy(&codePoint, bytes + size_t(i) * sizeof(Char32), sizeof(Char32));
+
+            char buf[5];
+            const int count = encodeUnicodePointToUTF8(codePoint, buf);
+            // Append the encoded sequence exactly once. (Previously this was wrapped in a
+            // loop over `count`, which appended every multi-byte sequence `count` times.)
+            ioBuffer.addRange(buf, count);
+        }
+    }
+
+    Utf32CharEncoding() : Super(CharEncodeType::UTF32) {}
+};
+
+/// CharEncoding for UTF16. One instance is constructed per byte order: `reverseOrder`
+/// selects between CharEncodeType::UTF16 and CharEncodeType::UTF16Reversed.
+class Utf16CharEncoding : public CharEncoding
+{
+public:
+    typedef CharEncoding Super;
+
+    Utf16CharEncoding(bool reverseOrder)
+        : Super(reverseOrder ? CharEncodeType::UTF16Reversed : CharEncodeType::UTF16)
+        , m_reverseOrder(reverseOrder)
+    {
+    }
+
+    /// Reads code points from the UTF8 text in `slice` and appends their UTF16 encoding
+    /// (one or two 16 bit units per code point) to `ioBuffer`.
+    virtual void encode(const UnownedStringSlice& slice, List<Byte>& ioBuffer) override
+    {
+        Index cursor = 0;
+
+        // Hands out the next byte of the slice, or 0 once the slice is exhausted.
+        auto nextByte = [&]() -> Byte
+        {
+            if (cursor >= slice.getLength())
+                return '\0';
+            return slice[cursor++];
+        };
+
+        while (cursor < slice.getLength())
+        {
+            const Char32 codePoint = getUnicodePointFromUTF8(nextByte);
+
+            Char16 units[2];
+            const int unitCount = m_reverseOrder
+                ? encodeUnicodePointToUTF16Reversed(codePoint, units)
+                : encodeUnicodePointToUTF16(codePoint, units);
+
+            // Each unit occupies two bytes in the output
+            ioBuffer.addRange((const Byte*)units, unitCount * 2);
+        }
+    }
+
+    /// Reads code points from the UTF16 data in `bytes` (`length` is in bytes) and appends
+    /// their UTF8 encoding to `ioBuffer`.
+    virtual void decode(const Byte* bytes, int length, List<char>& ioBuffer) override
+    {
+        Index cursor = 0;
+
+        // Hands out the next input byte, or 0 once the input is exhausted.
+        auto nextByte = [&]() -> Byte
+        {
+            if (cursor >= length)
+                return Byte(0);
+            return bytes[cursor++];
+        };
+
+        while (cursor < length)
+        {
+            const Char32 codePoint = getUnicodePointFromUTF16(nextByte);
+
+            char utf8[5];
+            const int utf8Count = encodeUnicodePointToUTF8(codePoint, utf8);
+            ioBuffer.addRange((const char*)utf8, utf8Count);
+        }
+    }
+
+private:
+    bool m_reverseOrder = false;
+};
+
+/// Guesses the character encoding of the `bytesCount` bytes starting at `bytes`.
+///
+/// A byte order mark at the start of the data is looked for first. If one is found,
+/// `outOffset` is set to its size in bytes so that the caller can skip it. Otherwise the
+/// data is scanned for 16 bit units in which exactly one of the two bytes is zero, which is
+/// taken to mean a 16 bit encoding. If nothing indicates otherwise UTF8 is returned.
+///
+/// @param bytes       Start of the data to examine. No particular alignment is needed.
+/// @param bytesCount  Number of bytes available at `bytes`. May be 0.
+/// @param outOffset   Set to the number of leading bytes taken up by a byte order mark,
+///                    or 0 if there is none.
+/// @return The encoding determined.
+/* static */CharEncodeType CharEncoding::determineEncoding(const Byte* bytes, size_t bytesCount, size_t& outOffset)
+{
+    // UTF8 byte order mark
+    if (bytesCount >= 3 && bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf)
+    {
+        outOffset = 3;
+        return CharEncodeType::UTF8;
+    }
+
+    // UTF16 byte order mark, in either byte order
+    if (bytesCount >= 2)
+    {
+        // Copied out, such that `bytes` doesn't have to be Char16 aligned
+        Char16 c;
+        ::memcpy(&c, bytes, 2);
+
+        if (c == kUTF16Header)
+        {
+            outOffset = 2;
+            return CharEncodeType::UTF16;
+        }
+        else if (c == kUTF16ReversedHeader)
+        {
+            outOffset = 2;
+            return CharEncodeType::UTF16Reversed;
+        }
+    }
+
+    // If we don't have a 'mark' then we are a bit stumped. We'll look for null bytes and
+    // assume they mean we have a 16 bit encoding.
+    //
+    // This has to run whenever no mark matched. Previously it was attached as the `else` of
+    // the `bytesCount >= 2` test, so it only ran for fewer than 2 bytes, where it could
+    // also read past the end of the data.
+    // The `i + 1 < bytesCount` bound only ever examines complete 2 byte units.
+    for (size_t i = 0; i + 1 < bytesCount; i += 2)
+    {
+#if SLANG_LITTLE_ENDIAN
+        const auto low = bytes[i];
+        const auto high = bytes[i + 1];
+#else
+        const auto low = bytes[i + 1];
+        const auto high = bytes[i];
+#endif
+        if ((low == 0) ^ (high == 0))
+        {
+            // There is no mark to skip over, so the content starts at the first byte
+            outOffset = 0;
+            return (high == 0) ? CharEncodeType::UTF16 : CharEncodeType::UTF16Reversed;
+        }
+    }
+
+    // Assume it's UTF8 or 7 bit ascii which UTF8 is a superset of
+    outOffset = 0;
+    return CharEncodeType::UTF8;
+}
+
+// The one instance of each encoding implementation. These are file local, and are
+// reached from outside via the CharEncoding statics defined below.
+static Utf8CharEncoding _utf8Encoding{};
+static Utf16CharEncoding _utf16Encoding{false};
+static Utf16CharEncoding _utf16EncodingReversed{true};
+static Utf32CharEncoding _utf32Encoding{};
+
+// Table holding one encoding per CharEncodeType value.
+// NOTE(review): the entry order is presumably required to match the order of the
+// CharEncodeType enumerators - confirm against the enum before reordering.
+/* static */CharEncoding* const CharEncoding::g_encoding[Index(CharEncodeType::CountOf)] =
+{
+    &_utf8Encoding,             // UTF8
+    &_utf16Encoding,            // UTF16
+    &_utf16EncodingReversed,    // UTF16Reversed
+    &_utf32Encoding,            // UTF32
+};
+
+// Named access to the same instances
+/* static */CharEncoding* CharEncoding::UTF8 = &_utf8Encoding;
+/* static */CharEncoding* CharEncoding::UTF16 = &_utf16Encoding;
+/* static */CharEncoding* CharEncoding::UTF16Reversed = &_utf16EncodingReversed;
+/* static */CharEncoding* CharEncoding::UTF32 = &_utf32Encoding;
+
+} // namespace Slang