diff options
Diffstat (limited to 'source/core/slang-char-encode.h')
| -rw-r--r-- | source/core/slang-char-encode.h | 200 |
1 files changed, 200 insertions, 0 deletions
diff --git a/source/core/slang-char-encode.h b/source/core/slang-char-encode.h new file mode 100644 index 000000000..a778cc3c9 --- /dev/null +++ b/source/core/slang-char-encode.h @@ -0,0 +1,200 @@ +#ifndef SLANG_CORE_CHAR_ENCODE_H +#define SLANG_CORE_CHAR_ENCODE_H + +#include "slang-secure-crt.h" +#include "slang-basic.h" + +namespace Slang +{ + +// NOTE! Order must be kept the same to match up with +enum class CharEncodeType +{ + UTF8, + UTF16, + UTF16Reversed, + UTF32, + CountOf, +}; + +template <typename ReadByteFunc> +Char32 getUnicodePointFromUTF8(const ReadByteFunc& readByte) +{ + Char32 codePoint = 0; + uint32_t leading = Byte(readByte()); + uint32_t mask = 0x80; + Index count = 0; + while (leading & mask) + { + count++; + mask >>= 1; + } + codePoint = (leading & (mask - 1)); + for (Index i = 1; i <= count - 1; i++) + { + codePoint <<= 6; + codePoint += (readByte() & 0x3F); + } + return codePoint; +} + +template <typename ReadByteFunc> +Char32 getUnicodePointFromUTF16(const ReadByteFunc& readByte) +{ + uint32_t byte0 = Byte(readByte()); + uint32_t byte1 = Byte(readByte()); + uint32_t word0 = byte0 + (byte1 << 8); + if (word0 >= 0xD800 && word0 <= 0xDFFF) + { + uint32_t byte2 = Byte(readByte()); + uint32_t byte3 = Byte(readByte()); + uint32_t word1 = byte2 + (byte3 << 8); + return Char32(((word0 & 0x3FF) << 10) + (word1 & 0x3FF) + 0x10000); + } + else + return Char32(word0); +} + +template <typename ReadByteFunc> +Char32 getUnicodePointFromUTF16Reversed(const ReadByteFunc& readByte) +{ + uint32_t byte0 = Byte(readByte()); + uint32_t byte1 = Byte(readByte()); + uint32_t word0 = (byte0 << 8) + byte1; + if (word0 >= 0xD800 && word0 <= 0xDFFF) + { + uint32_t byte2 = Byte(readByte()); + uint32_t byte3 = Byte(readByte()); + uint32_t word1 = (byte2 << 8) + byte3; + return Char32(((word0 & 0x3FF) << 10) + (word1 & 0x3FF)); + } + else + return Char32(word0); +} + +template <typename ReadByteFunc> +Char32 getUnicodePointFromUTF32(const ReadByteFunc& readByte) +{ + uint32_t byte0 = Byte(readByte()); + uint32_t byte1 = Byte(readByte()); + uint32_t byte2 = Byte(readByte()); + uint32_t byte3 = Byte(readByte()); + return Char32(byte0 + (byte1 << 8) + (byte2 << 16) + (byte3 << 24)); +} + +// Encode functions return the amount of elements output to the buffer +inline int encodeUnicodePointToUTF8(Char32 codePoint, char* outBuffer) +{ + char* const dst = outBuffer; + // TODO(JS): This supports 4 + 6 * 3 = 22 bits. + // The standard allows up to 0x10FFFF. + if (codePoint <= 0x7F) + { + dst[0] = char(codePoint); + return 1; + } + else if (codePoint <= 0x7FF) + { + dst[0] = char(0xC0 + (codePoint >> 6)); + dst[1] = char(0x80 + (codePoint & 0x3F)); + return 2; + } + else if (codePoint <= 0xFFFF) + { + dst[0] = char(0xE0 + (codePoint >> 12)); + dst[1] = char(0x80 + ((codePoint >> 6) & (0x3F))); + dst[2] = char(0x80 + (codePoint & 0x3F)); + return 3; + } + else + { + dst[0] = char(0xF0 + (codePoint >> 18)); + dst[1] = char(0x80 + ((codePoint >> 12) & 0x3F)); + dst[2] = char(0x80 + ((codePoint >> 6) & 0x3F)); + dst[3] = char(0x80 + (codePoint & 0x3F)); + return 4; + } +} + +inline int encodeUnicodePointToUTF16(Char32 codePoint, Char16* outBuffer) +{ + Char16* const dst = outBuffer; + if (codePoint <= 0xD7FF || (codePoint >= 0xE000 && codePoint <= 0xFFFF)) + { + dst[0] = Char16(codePoint); + return 1; + } + else + { + const uint32_t sub = codePoint - 0x10000; + dst[0] = Char16((sub >> 10) + 0xD800); + dst[1] = Char16((sub & 0x3FF) + 0xDC00); + return 2; + } +} + +SLANG_FORCE_INLINE Char16 reverseByteOrder(Char16 val) +{ + return (val >> 8) | (val << 8); +} + +inline int encodeUnicodePointToUTF16Reversed(Char32 codePoint, Char16* outBuffer) +{ + Char16* const dst = outBuffer; + if (codePoint <= 0xD7FF || (codePoint >= 0xE000 && codePoint <= 0xFFFF)) + { + dst[0] = reverseByteOrder(Char16(codePoint)); + return 1; + } + else + { + const uint32_t sub = codePoint - 0x10000; + const uint32_t high = (sub >> 10) + 0xD800; + const uint32_t low = (sub & 0x3FF) + 0xDC00; + dst[0] = reverseByteOrder(Char16(high)); + dst[1] = reverseByteOrder(Char16(low)); + return 2; + } +} + +static const Char16 kUTF16Header = 0xFEFF; +static const Char16 kUTF16ReversedHeader = 0xFFFE; + +class CharEncoding +{ +public: + static CharEncoding* UTF8,* UTF16,* UTF16Reversed,* UTF32; + + /// Encode Utf8 held in slice append into ioBuffer + virtual void encode(const UnownedStringSlice& str, List<Byte>& ioBuffer) = 0; + /// Decode buffer into Utf8 held in ioBuffer + virtual void decode(const Byte* buffer, int length, List<char>& ioBuffer) = 0; + + virtual ~CharEncoding() {} + + /// Get the encoding type + CharEncodeType getEncodingType() const { return m_encodingType; } + + /// Given some bytes determines a character encoding type, based on the initial bytes. + /// If can't be determined will assume UTF8. + /// Outputs the offset to the first non mark in outOffset + static CharEncodeType determineEncoding(const Byte* bytes, size_t bytesCount, size_t& outOffset); + + /// Get the + static CharEncoding* getEncoding(CharEncodeType type) { return g_encoding[Index(type)]; } + + CharEncoding(CharEncodeType encodingType) : + m_encodingType(encodingType) + { + } + +protected: + + CharEncodeType m_encodingType; + + static CharEncoding*const g_encoding[Index(CharEncodeType::CountOf)]; +}; + +} + +#endif |
