summaryrefslogtreecommitdiffstats
path: root/source/core/slang-char-encode.h
diff options
context:
space:
mode:
Diffstat (limited to 'source/core/slang-char-encode.h')
-rw-r--r--source/core/slang-char-encode.h200
1 files changed, 200 insertions, 0 deletions
diff --git a/source/core/slang-char-encode.h b/source/core/slang-char-encode.h
new file mode 100644
index 000000000..a778cc3c9
--- /dev/null
+++ b/source/core/slang-char-encode.h
@@ -0,0 +1,200 @@
+#ifndef SLANG_CORE_CHAR_ENCODE_H
+#define SLANG_CORE_CHAR_ENCODE_H
+
+#include "slang-secure-crt.h"
+#include "slang-basic.h"
+
+namespace Slang
+{
+
+// NOTE! Order must be kept the same to match up with
+enum class CharEncodeType
+{
+ UTF8,
+ UTF16,
+ UTF16Reversed,
+ UTF32,
+ CountOf,
+};
+
+template <typename ReadByteFunc>
+Char32 getUnicodePointFromUTF8(const ReadByteFunc& readByte)
+{
+ Char32 codePoint = 0;
+ uint32_t leading = Byte(readByte());
+ uint32_t mask = 0x80;
+ Index count = 0;
+ while (leading & mask)
+ {
+ count++;
+ mask >>= 1;
+ }
+ codePoint = (leading & (mask - 1));
+ for (Index i = 1; i <= count - 1; i++)
+ {
+ codePoint <<= 6;
+ codePoint += (readByte() & 0x3F);
+ }
+ return codePoint;
+}
+
+template <typename ReadByteFunc>
+Char32 getUnicodePointFromUTF16(const ReadByteFunc& readByte)
+{
+ uint32_t byte0 = Byte(readByte());
+ uint32_t byte1 = Byte(readByte());
+ uint32_t word0 = byte0 + (byte1 << 8);
+ if (word0 >= 0xD800 && word0 <= 0xDFFF)
+ {
+ uint32_t byte2 = Byte(readByte());
+ uint32_t byte3 = Byte(readByte());
+ uint32_t word1 = byte2 + (byte3 << 8);
+ return Char32(((word0 & 0x3FF) << 10) + (word1 & 0x3FF) + 0x10000);
+ }
+ else
+ return Char32(word0);
+}
+
+template <typename ReadByteFunc>
+Char32 getUnicodePointFromUTF16Reversed(const ReadByteFunc& readByte)
+{
+ uint32_t byte0 = Byte(readByte());
+ uint32_t byte1 = Byte(readByte());
+ uint32_t word0 = (byte0 << 8) + byte1;
+ if (word0 >= 0xD800 && word0 <= 0xDFFF)
+ {
+ uint32_t byte2 = Byte(readByte());
+ uint32_t byte3 = Byte(readByte());
+ uint32_t word1 = (byte2 << 8) + byte3;
+ return Char32(((word0 & 0x3FF) << 10) + (word1 & 0x3FF));
+ }
+ else
+ return Char32(word0);
+}
+
+template <typename ReadByteFunc>
+Char32 getUnicodePointFromUTF32(const ReadByteFunc& readByte)
+{
+ uint32_t byte0 = Byte(readByte());
+ uint32_t byte1 = Byte(readByte());
+ uint32_t byte2 = Byte(readByte());
+ uint32_t byte3 = Byte(readByte());
+ return Char32(byte0 + (byte1 << 8) + (byte2 << 16) + (byte3 << 24));
+}
+
+// Encode functions return the amount of elements output to the buffer
+inline int encodeUnicodePointToUTF8(Char32 codePoint, char* outBuffer)
+{
+ char* const dst = outBuffer;
+ // TODO(JS): This supports 4 + 6 * 3 = 22 bits.
+ // The standard allows up to 0x10FFFF.
+ if (codePoint <= 0x7F)
+ {
+ dst[0] = char(codePoint);
+ return 1;
+ }
+ else if (codePoint <= 0x7FF)
+ {
+ dst[0] = char(0xC0 + (codePoint >> 6));
+ dst[1] = char(0x80 + (codePoint & 0x3F));
+ return 2;
+ }
+ else if (codePoint <= 0xFFFF)
+ {
+ dst[0] = char(0xE0 + (codePoint >> 12));
+ dst[1] = char(0x80 + ((codePoint >> 6) & (0x3F)));
+ dst[2] = char(0x80 + (codePoint & 0x3F));
+ return 3;
+ }
+ else
+ {
+ dst[0] = char(0xF0 + (codePoint >> 18));
+ dst[1] = char(0x80 + ((codePoint >> 12) & 0x3F));
+ dst[2] = char(0x80 + ((codePoint >> 6) & 0x3F));
+ dst[3] = char(0x80 + (codePoint & 0x3F));
+ return 4;
+ }
+}
+
+inline int encodeUnicodePointToUTF16(Char32 codePoint, Char16* outBuffer)
+{
+ Char16* const dst = outBuffer;
+ if (codePoint <= 0xD7FF || (codePoint >= 0xE000 && codePoint <= 0xFFFF))
+ {
+ dst[0] = Char16(codePoint);
+ return 1;
+ }
+ else
+ {
+ const uint32_t sub = codePoint - 0x10000;
+ dst[0] = Char16((sub >> 10) + 0xD800);
+ dst[1] = Char16((sub & 0x3FF) + 0xDC00);
+ return 2;
+ }
+}
+
+SLANG_FORCE_INLINE Char16 reverseByteOrder(Char16 val)
+{
+ return (val >> 8) | (val << 8);
+}
+
+inline int encodeUnicodePointToUTF16Reversed(Char32 codePoint, Char16* outBuffer)
+{
+ Char16* const dst = outBuffer;
+ if (codePoint <= 0xD7FF || (codePoint >= 0xE000 && codePoint <= 0xFFFF))
+ {
+ dst[0] = reverseByteOrder(Char16(codePoint));
+ return 1;
+ }
+ else
+ {
+ const uint32_t sub = codePoint - 0x10000;
+ const uint32_t high = (sub >> 10) + 0xD800;
+ const uint32_t low = (sub & 0x3FF) + 0xDC00;
+ dst[0] = reverseByteOrder(Char16(high));
+ dst[1] = reverseByteOrder(Char16(low));
+ return 2;
+ }
+}
+
+static const Char16 kUTF16Header = 0xFEFF;
+static const Char16 kUTF16ReversedHeader = 0xFFFE;
+
+class CharEncoding
+{
+public:
+ static CharEncoding* UTF8,* UTF16,* UTF16Reversed,* UTF32;
+
+ /// Encode Utf8 held in slice append into ioBuffer
+ virtual void encode(const UnownedStringSlice& str, List<Byte>& ioBuffer) = 0;
+ /// Decode buffer into Utf8 held in ioBuffer
+ virtual void decode(const Byte* buffer, int length, List<char>& ioBuffer) = 0;
+
+ virtual ~CharEncoding() {}
+
+ /// Get the encoding type
+ CharEncodeType getEncodingType() const { return m_encodingType; }
+
+ /// Given some bytes determines a character encoding type, based on the initial bytes.
+ /// If can't be determined will assume UTF8.
+ /// Outputs the offset to the first non mark in outOffset
+ static CharEncodeType determineEncoding(const Byte* bytes, size_t bytesCount, size_t& outOffset);
+
+ /// Get the
+ static CharEncoding* getEncoding(CharEncodeType type) { return g_encoding[Index(type)]; }
+
+ CharEncoding(CharEncodeType encodingType) :
+ m_encodingType(encodingType)
+ {
+ }
+
+protected:
+
+ CharEncodeType m_encodingType;
+
+ static CharEncoding*const g_encoding[Index(CharEncodeType::CountOf)];
+};
+
+}
+
+#endif