diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2021-10-04 14:15:51 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-10-04 14:15:51 -0400 |
| commit | 97bb82ebcdf8f1391b9d93b5a8d7b1dfc4e88e52 (patch) | |
| tree | f120ba282cbea96d23ed179737984a4610d3b520 /source/core/slang-char-encode.cpp | |
| parent | b3dfe383c6d31ff3dbd76dcfb32de8d536382f3e (diff) | |
Removing exceptions from core/compiler-core (#1953)
* #include of an absolute path didn't work, because paths were always taken to be relative.
* Refactor Stream. Working on all tests.
* Split out CharEncode.
* Make method names lower camel.
Use m_ prefix in Writer/Reader
* Tidy up around CharEncode interface.
* Small improvements around encode/decode.
* Better use of types.
* Remove readLine from TextReader.
* Remove exceptions from Stream/Text handling.
* Fix some typos.
* Fix tabbing.
* Fix missing override.
* Remove remaining exception throw/catch via using signal mechanism.
* Remove exceptions that are not used anymore.
* Document the Stream interface.
* Remove index for decoding 'get byte' function.
* Fix CharReader -> ByteReader.
Diffstat (limited to 'source/core/slang-char-encode.cpp')
| -rw-r--r-- | source/core/slang-char-encode.cpp | 181 |
1 files changed, 181 insertions, 0 deletions
diff --git a/source/core/slang-char-encode.cpp b/source/core/slang-char-encode.cpp new file mode 100644 index 000000000..d061e34ba --- /dev/null +++ b/source/core/slang-char-encode.cpp @@ -0,0 +1,181 @@ +#include "slang-char-encode.h" + +namespace Slang +{ + +class Utf8CharEncoding : public CharEncoding +{ +public: + typedef CharEncoding Super; + + virtual void encode(const UnownedStringSlice& slice, List<Byte>& ioBuffer) override + { + ioBuffer.addRange((const Byte*)slice.begin(), slice.getLength()); + } + virtual void decode(const Byte* bytes, int length, List<char>& ioChars) override + { + ioChars.addRange((const char*)bytes, length); + } + Utf8CharEncoding() : Super(CharEncodeType::UTF8) {} +}; + +class Utf32CharEncoding : public CharEncoding +{ +public: + typedef CharEncoding Super; + + virtual void encode(const UnownedStringSlice& slice, List<Byte>& ioBuffer) override + { + Index ptr = 0; + while (ptr < slice.getLength()) + { + const Char32 codePoint = getUnicodePointFromUTF8([&]() -> Byte + { + if (ptr < slice.getLength()) + return slice[ptr++]; + else + return '\0'; + }); + // Note: Assumes byte order is same as arch byte order + ioBuffer.addRange((const Byte*)&codePoint, 4); + } + } + virtual void decode(const Byte* bytes, int length, List<char>& ioBuffer) override + { + // Note: Assumes bytes is Char32 aligned + SLANG_ASSERT((size_t(bytes) & 3) == 0); + const Char32* content = (const Char32*)bytes; + for (int i = 0; i < (length >> 2); i++) + { + char buf[5]; + int count = encodeUnicodePointToUTF8(content[i], buf); + for (int j = 0; j < count; j++) + ioBuffer.addRange(buf, count); + } + } + + Utf32CharEncoding() : Super(CharEncodeType::UTF32) {} +}; + +class Utf16CharEncoding : public CharEncoding //UTF16 +{ +public: + typedef CharEncoding Super; + Utf16CharEncoding(bool reverseOrder): + Super(reverseOrder ? 
CharEncodeType::UTF16Reversed : CharEncodeType::UTF16), + m_reverseOrder(reverseOrder) + {} + virtual void encode(const UnownedStringSlice& slice, List<Byte>& ioBuffer) override + { + Index index = 0; + while (index < slice.getLength()) + { + const Char32 codePoint = getUnicodePointFromUTF8([&]() -> Byte + { + if (index < slice.getLength()) + return slice[index++]; + else + return '\0'; + }); + + Char16 buffer[2]; + int count; + if (!m_reverseOrder) + count = encodeUnicodePointToUTF16(codePoint, buffer); + else + count = encodeUnicodePointToUTF16Reversed(codePoint, buffer); + ioBuffer.addRange((const Byte*)buffer, count * 2); + } + } + virtual void decode(const Byte* bytes, int length, List<char>& ioBuffer) override + { + Index index = 0; + while (index < length) + { + const Char32 codePoint = getUnicodePointFromUTF16([&]() -> Byte + { + if (index < length) + return bytes[index++]; + else + return Byte(0); + }); + + char buf[5]; + int count = encodeUnicodePointToUTF8(codePoint, buf); + ioBuffer.addRange((const char*)buf, count); + } + } + +private: + bool m_reverseOrder = false; +}; + +/* static */CharEncodeType CharEncoding::determineEncoding(const Byte* bytes, size_t bytesCount, size_t& outOffset) +{ + // TODO(JS): Assumes the bytes are suitably aligned + + if (bytesCount >= 3 && bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf) + { + outOffset = 3; + return CharEncodeType::UTF8; + } + else if (bytesCount >= 2) + { + Char16 c; + ::memcpy(&c, bytes, 2); + + if (c == kUTF16Header) + { + outOffset = 2; + return CharEncodeType::UTF16; + } + else if (c == kUTF16ReversedHeader) + { + outOffset = 2; + return CharEncodeType::UTF16Reversed; + } + } + else + { + // If we don't have a 'mark' byte then we are bit stumped. 
We'll look for a null bytes and assume they mean we have a 16 bit encoding + for (size_t i = 0; i < bytesCount; i += 2) + { +#if SLANG_LITTLE_ENDIAN + const auto low = bytes[i]; + const auto high = bytes[i + 1]; +#else + const auto low = bytes[i + 1]; + const auto high = bytes[i]; +#endif + if ((low == 0) ^ (high == 0)) + { + outOffset = 2; + return (high == 0) ? CharEncodeType::UTF16 : CharEncodeType::UTF16Reversed; + } + } + } + + // Assume it's UTF8 or 7 bit ascii which UTF8 is a superset of + outOffset = 0; + return CharEncodeType::UTF8; +} + +static Utf8CharEncoding _utf8Encoding; +static Utf16CharEncoding _utf16Encoding(false); +static Utf16CharEncoding _utf16EncodingReversed(true); +static Utf32CharEncoding _utf32Encoding; + +/* static */CharEncoding* const CharEncoding::g_encoding[Index(CharEncodeType::CountOf)] +{ + &_utf8Encoding, // UTF8, + &_utf16Encoding, // UTF16, + &_utf16EncodingReversed, // UTF16Reversed, + &_utf32Encoding, // UTF32, +}; + +CharEncoding* CharEncoding::UTF8 = &_utf8Encoding; +CharEncoding* CharEncoding::UTF16 = &_utf16Encoding; +CharEncoding* CharEncoding::UTF16Reversed = &_utf16EncodingReversed; +CharEncoding* CharEncoding::UTF32 = &_utf32Encoding; + +} // namespace Slang |
