summaryrefslogtreecommitdiffstats
path: root/source/core
diff options
context:
space:
mode:
Diffstat (limited to 'source/core')
-rw-r--r-- source/core/slang-char-encode.cpp 34
-rw-r--r-- source/core/slang-char-encode.h 10
-rw-r--r-- source/core/slang-char-util.h 3
3 files changed, 47 insertions, 0 deletions
diff --git a/source/core/slang-char-encode.cpp b/source/core/slang-char-encode.cpp
index d061e34ba..687040fa2 100644
--- a/source/core/slang-char-encode.cpp
+++ b/source/core/slang-char-encode.cpp
@@ -178,4 +178,38 @@ CharEncoding* CharEncoding::UTF16 = &_utf16Encoding;
CharEncoding* CharEncoding::UTF16Reversed = &_utf16EncodingReversed;
CharEncoding* CharEncoding::UTF32 = &_utf32Encoding;
+/* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! UTF8Util !!!!!!!!!!!!!!!!!!!!!!!!! */
+
+/* static */Index UTF8Util::calcCodePointCount(const UnownedStringSlice& in)
+{
+    // Walk the slice as unsigned bytes so the UTF-8 marker bits can be tested with plain masks.
+    const unsigned char* pos = reinterpret_cast<const unsigned char*>(in.begin());
+    const unsigned char* const limit = reinterpret_cast<const unsigned char*>(in.end());
+
+    Index numCodePoints = 0;
+
+    while (pos < limit)
+    {
+        // Every byte reached at this point is treated as the start of a code point.
+        // No validation of the encoding is performed.
+        const unsigned char lead = *pos;
+        ++pos;
+        ++numCodePoints;
+
+        // Top bit clear means a single byte code point, so there is nothing further to consume.
+        if ((lead & 0x80) == 0)
+        {
+            continue;
+        }
+
+        // Top bit set means the code point spans multiple bytes.
+        // https://en.wikipedia.org/wiki/UTF-8
+        // Continuation bytes hold six bits of the code point in their low order bits and have
+        // 10 in their two high order bits (10xxxxxx). Skip every continuation byte that follows.
+        while (pos < limit && (*pos & 0xc0) == 0x80)
+        {
+            ++pos;
+        }
+    }
+
+    return numCodePoints;
+}
+
} // namespace Slang
diff --git a/source/core/slang-char-encode.h b/source/core/slang-char-encode.h
index a778cc3c9..2bb4cba29 100644
--- a/source/core/slang-char-encode.h
+++ b/source/core/slang-char-encode.h
@@ -195,6 +195,16 @@ protected:
static CharEncoding*const g_encoding[Index(CharEncodeType::CountOf)];
};
+struct UTF8Util
+{
+ /// Given a slice holding UTF-8 text, calculate the number of code points (unicode chars)
+ ///
+ /// NOTE! This doesn't check the *validity* of code points/encoding.
+ /// Invalid UTF-8 input, or a slice that starts or ends part way through a character,
+ /// will produce undefined results without error.
+ static Index calcCodePointCount(const UnownedStringSlice& in);
+};
+
}
#endif
diff --git a/source/core/slang-char-util.h b/source/core/slang-char-util.h
index 40abee602..1ed8f7f73 100644
--- a/source/core/slang-char-util.h
+++ b/source/core/slang-char-util.h
@@ -47,6 +47,9 @@ struct CharUtil
/// Given a character return the upper case equivalent
SLANG_FORCE_INLINE static char toUpper(char c) { return (c >= 'a' && c <= 'z') ? (c -'a' + 'A') : c; }
+    /// Converts a value in the range 0-15 inclusive into its hex digit character.
+    /// Digits above 9 are produced in lower case ('a'-'f').
+    SLANG_FORCE_INLINE static char getHexChar(Index i)
+    {
+        // Only the low four bits may be set.
+        SLANG_ASSERT((i & ~Index(0xf)) == 0);
+
+        if (i >= 10)
+        {
+            return char(i - 10 + 'a');
+        }
+        return char(i + '0');
+    }
+
/// Returns the value if c interpretted as a hex digit
/// If c is not a valid hex returns -1
inline static int getHexDigitValue(char c);