source/core/slang-char-encode.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181

#include "slang-char-encode.h"

namespace Slang
{

class Utf8CharEncoding : public CharEncoding 
{
public:
    typedef CharEncoding Super;

	virtual void encode(const UnownedStringSlice& slice, List<Byte>& ioBuffer) override
	{
        ioBuffer.addRange((const Byte*)slice.begin(), slice.getLength());
	}
	virtual void decode(const Byte* bytes, int length, List<char>& ioChars) override
	{
        ioChars.addRange((const char*)bytes, length);
	}
    Utf8CharEncoding() : Super(CharEncodeType::UTF8) {}
};

class Utf32CharEncoding : public CharEncoding
{
public:
    typedef CharEncoding Super;

	virtual void encode(const UnownedStringSlice& slice, List<Byte>& ioBuffer) override
	{
		Index ptr = 0;
		while (ptr < slice.getLength())
		{
            const Char32 codePoint = getUnicodePointFromUTF8([&]() -> Byte
			{
				if (ptr < slice.getLength())
					return slice[ptr++];
				else
					return '\0';
			});
            // Note: Assumes byte order is same as arch byte order
            ioBuffer.addRange((const Byte*)&codePoint, 4);
		}
	}
	virtual void decode(const Byte* bytes, int length, List<char>& ioBuffer) override
	{
        // Note: Assumes bytes is Char32 aligned
        SLANG_ASSERT((size_t(bytes) & 3) == 0);
		const Char32* content = (const Char32*)bytes;
		for (int i = 0; i < (length >> 2); i++)
		{
			char buf[5];
			int count = encodeUnicodePointToUTF8(content[i], buf);
            for (int j = 0; j < count; j++)
                ioBuffer.addRange(buf, count);
		}
	}

    Utf32CharEncoding() : Super(CharEncodeType::UTF32) {}
};

class Utf16CharEncoding : public CharEncoding //UTF16
{
public:
    typedef CharEncoding Super;
	Utf16CharEncoding(bool reverseOrder):
        Super(reverseOrder ? CharEncodeType::UTF16Reversed : CharEncodeType::UTF16),
		m_reverseOrder(reverseOrder)
	{}
	virtual void encode(const UnownedStringSlice& slice, List<Byte>& ioBuffer) override
	{
		Index index = 0;
		while (index < slice.getLength())
		{
            const Char32 codePoint = getUnicodePointFromUTF8([&]() -> Byte
			{
				if (index < slice.getLength())
					return slice[index++];
				else
					return '\0';
			});

			Char16 buffer[2];
			int count;
			if (!m_reverseOrder)
				count = encodeUnicodePointToUTF16(codePoint, buffer);
			else
				count = encodeUnicodePointToUTF16Reversed(codePoint, buffer);
            ioBuffer.addRange((const Byte*)buffer, count * 2);
		}
	}
	virtual void decode(const Byte* bytes, int length, List<char>& ioBuffer) override
	{
		Index index = 0;
		while (index < length)
		{
			const Char32 codePoint = getUnicodePointFromUTF16([&]() -> Byte
			{
                if (index < length)
                    return bytes[index++];
                else
                    return Byte(0);
			});

			char buf[5];
			int count = encodeUnicodePointToUTF8(codePoint, buf);
            ioBuffer.addRange((const char*)buf, count);
		}
	}

private:
    bool m_reverseOrder = false;
};

/* static */CharEncodeType CharEncoding::determineEncoding(const Byte* bytes, size_t bytesCount, size_t& outOffset)
{
    // TODO(JS): Assumes the bytes are suitably aligned

    if (bytesCount >= 3 && bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf)
    {
        outOffset = 3;
        return CharEncodeType::UTF8;
    }
    else if (bytesCount >= 2)
    {
        Char16 c;
        ::memcpy(&c, bytes, 2);

        if (c == kUTF16Header)
        {
            outOffset = 2;
            return CharEncodeType::UTF16;
        }
        else if (c == kUTF16ReversedHeader)
        {
            outOffset = 2;
            return CharEncodeType::UTF16Reversed;
        }
    }
    else
    {
        // If we don't have a 'mark' byte then we are bit stumped. We'll look for a null bytes and assume they mean we have a 16 bit encoding
        for (size_t i = 0; i < bytesCount; i += 2)
        {
#if SLANG_LITTLE_ENDIAN
            const auto low = bytes[i];
            const auto high = bytes[i + 1];
#else
            const auto low = bytes[i + 1];
            const auto high = bytes[i];
#endif 
            if ((low == 0) ^ (high == 0))
            {
                outOffset = 2;
                return (high == 0) ? CharEncodeType::UTF16 : CharEncodeType::UTF16Reversed;
            }
        }
    }

    // Assume it's UTF8 or 7 bit ascii which UTF8 is a superset of
    outOffset = 0;
    return CharEncodeType::UTF8;
}

static Utf8CharEncoding _utf8Encoding;
static Utf16CharEncoding _utf16Encoding(false);
static Utf16CharEncoding _utf16EncodingReversed(true);
static Utf32CharEncoding _utf32Encoding;

/* static */CharEncoding* const CharEncoding::g_encoding[Index(CharEncodeType::CountOf)]
{
    &_utf8Encoding,             // UTF8,
    &_utf16Encoding,            // UTF16,
    &_utf16EncodingReversed,    // UTF16Reversed,
    &_utf32Encoding,            // UTF32,
};

CharEncoding* CharEncoding::UTF8 = &_utf8Encoding;
CharEncoding* CharEncoding::UTF16 = &_utf16Encoding;
CharEncoding* CharEncoding::UTF16Reversed = &_utf16EncodingReversed;
CharEncoding* CharEncoding::UTF32 = &_utf32Encoding;
	
} // namespace Slang