Support unicode identifier names. (#4772)

* Support unicode identifier names. * Fix. * Fix language server. * Fix build errors. * Fix. * Fix offset translation in language server.
author: Yong He <yonghe@outlook.com> 2024-08-12 20:53:03 -0700
committer: GitHub <noreply@github.com> 2024-08-12 20:53:03 -0700
commit: b390566b55700582321b09b72c726b8dff9bd819 (patch)
tree: a2fd8e50fcbde29dd2651e08a78021f2ae9d72de /source
parent: 20bd48659d0009de5477380c335e2419f4c66f8b (diff)
9 files changed, 159 insertions, 57 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp
index 7d84ed938..366af9114 100644
--- a/source/compiler-core/slang-lexer.cpp
+++ b/source/compiler-core/slang-lexer.cpp
@@ -5,9 +5,9 @@
 // input bytes and turning it into semantically useful tokens.
 //
 
+#include "core/slang-char-encode.h"
 #include "slang-name.h"
 #include "slang-source-loc.h"
-
 #include "slang-core-diagnostics.h"
 
 namespace Slang
@@ -205,7 +205,6 @@ namespace Slang
                         c = e;
                     continue;
                 }
-
                 default:
                     break;
                 }
@@ -214,8 +213,12 @@ namespace Slang
                 // some newlines
                 break;
             }
-            // TODO: handle UTF-8 encoding for non-ASCII code points here
-
+            if (isUtf8LeadingByte((Byte)c))
+            {
+                // Decode the whole UTF-8 sequence that starts at this byte into one code point.
+                pos--;
+                c = getUnicodePointFromUTF8([&]() {return lexer->m_cursor[pos++]; });
+            }
             // Default case is to just hand along the byte we read as an ASCII code point.
         } while (offset--);
 
@@ -262,7 +265,12 @@ namespace Slang
                 }
             }
 
-            // TODO: Need to handle non-ASCII code points.
+            // Decode a multi-byte UTF-8 sequence into a single code point.
+            if (isUtf8LeadingByte((Byte)c))
+            {
+                lexer->m_cursor--;
+                c = getUnicodePointFromUTF8([&]() {return *lexer->m_cursor++; });
+            }
 
             // Default case is to return the raw byte we saw.
             return c;
@@ -340,6 +348,11 @@ namespace Slang
         }
     }
 
+    static bool isNonAsciiCodePoint(unsigned int codePoint)
+    {
+        return !(codePoint < 0x80 || codePoint == 0xFFFFFFFF); // true for 0x80..0xFFFFFFFE only
+    }
+
     static void _lexIdentifier(Lexer* lexer)
     {
         for(;;)
@@ -348,12 +361,12 @@ namespace Slang
             if(('a' <= c ) && (c <= 'z')
                 || ('A' <= c) && (c <= 'Z')
                 || ('0' <= c) && (c <= '9')
-                || (c == '_'))
+                || (c == '_')
+                || isNonAsciiCodePoint((unsigned int)c))
             {
                 _advance(lexer);
                 continue;
             }
-
             return;
         }
     }
@@ -1052,7 +1065,8 @@ namespace Slang
 
     static TokenType _lexTokenImpl(Lexer* lexer)
     {
-        switch(_peek(lexer))
+        int nextCodePoint = _peek(lexer);
+        switch(nextCodePoint)
         {
         default:
             break;
@@ -1358,10 +1372,12 @@ namespace Slang
 
         }
 
-        // TODO(tfoley): If we ever wanted to support proper Unicode
-        // in identifiers, etc., then this would be the right place
-        // to perform a more expensive dispatch based on the actual
-        // code point (and not just the first byte).
+        // We treat every non-ASCII code point as part of an identifier.
+        if (isNonAsciiCodePoint(nextCodePoint))
+        {
+            _lexIdentifier(lexer);
+            return TokenType::Identifier;
+        }
 
         {
             // If none of the above cases matched, then we have an
diff --git a/source/core/slang-char-encode.cpp b/source/core/slang-char-encode.cpp
index 105cfac7f..526c6c923 100644
--- a/source/core/slang-char-encode.cpp
+++ b/source/core/slang-char-encode.cpp
@@ -211,4 +211,27 @@ CharEncoding* CharEncoding::UTF32 = &_utf32Encoding;
     return count;
 }
 
+Index UTF8Util::calcUTF16CharCount(const UnownedStringSlice& in)
+{
+    // Walk the slice one decoded code point at a time and add up the number of
+    // UTF-16 code units that each code point encodes to.
+    const Index byteCount = in.getLength();
+    Index cursor = 0;
+    Index total = 0;
+    // Byte source handed to the decoder: yields 0 once the slice is exhausted.
+    auto nextByte = [&]() -> Byte { return Byte(cursor < byteCount ? in[cursor++] : 0); };
+    do
+    {
+        const int codePoint = getUnicodePointFromUTF8(nextByte);
+        // A decoded value of 0 (exhausted input or an embedded NUL) stops the
+        // count, so an empty slice yields 0.
+        if (codePoint == 0)
+            break;
+        Char16 scratch[2];
+        total += encodeUnicodePointToUTF16(codePoint, scratch);
+        // Loop again only while undecoded bytes remain.
+    } while (cursor < byteCount);
+    return total;
+}
+
 } // namespace Slang
diff --git a/source/core/slang-char-encode.h b/source/core/slang-char-encode.h
index 2bb4cba29..a7cd501ab 100644
--- a/source/core/slang-char-encode.h
+++ b/source/core/slang-char-encode.h
@@ -203,6 +203,10 @@ struct UTF8Util
         /// Non valid utf8 input or ending starting in partial characters, will produce 
         /// undefined results without error.
     static Index calcCodePointCount(const UnownedStringSlice& in);
+
+
+        /// Given a slice in UTF-8, calculate the number of UTF-16 code units needed to represent the string.
+    static Index calcUTF16CharCount(const UnownedStringSlice& in);
 };
 
 }
diff --git a/source/core/slang-char-util.h b/source/core/slang-char-util.h
index c65f676c4..88af24426 100644
--- a/source/core/slang-char-util.h
+++ b/source/core/slang-char-util.h
@@ -61,7 +61,7 @@ struct CharUtil
         /// Returns the value if c interpretted as a octal digit
         /// If c is not a valid octal returns -1
     inline static int getOctalDigitValue(char c) { return isOctalDigit(c) ? (c - '0') : -1; }
-    
+
     struct CharFlagMap
     {
         Flags flags[0x100];
diff --git a/source/core/slang-std-writers.cpp b/source/core/slang-std-writers.cpp
index a23d878fb..264f37c98 100644
--- a/source/core/slang-std-writers.cpp
+++ b/source/core/slang-std-writers.cpp
@@ -1,6 +1,10 @@
 
 #include "slang-std-writers.h"
 
+#if SLANG_WINDOWS_FAMILY
+#include <Windows.h>
+#endif
+
 namespace Slang
 {
 
@@ -8,8 +12,11 @@ namespace Slang
 
 /* static */RefPtr<StdWriters> StdWriters::createDefault()
 {
+#if SLANG_WINDOWS_FAMILY
+    SetConsoleCP(CP_UTF8);
+    SetConsoleOutputCP(CP_UTF8);
+#endif
     RefPtr<StdWriters> stdWriters(new StdWriters);
-
     RefPtr<FileWriter> stdError(new FileWriter(stderr, WriterFlag::AutoFlush | WriterFlag::IsUnowned));
     RefPtr<FileWriter> stdOut(new FileWriter(stdout, WriterFlag::AutoFlush | WriterFlag::IsUnowned));
 
diff --git a/source/slang/slang-language-server-document-symbols.cpp b/source/slang/slang-language-server-document-symbols.cpp
index ec9b434eb..45a76e97b 100644
--- a/source/slang/slang-language-server-document-symbols.cpp
+++ b/source/slang/slang-language-server-document-symbols.cpp
@@ -167,7 +167,7 @@ namespace Slang
                 sym.selectionRange.start.line = (int)line;
                 sym.selectionRange.start.character = (int)col;
                 sym.selectionRange.end.line = (int)line;
-                sym.selectionRange.end.character = (int)(col + nameLoc.name->text.getLength());
+                sym.selectionRange.end.character = (int)(col + (int)UTF8Util::calcUTF16CharCount(nameLoc.name->text.getUnownedSlice()));
                 sym.range.start.line = (int)line;
                 sym.range.start.character = 0;
                 sym.range.end.line = (int)line;
diff --git a/source/slang/slang-language-server.cpp b/source/slang/slang-language-server.cpp
index 09b14932c..ed03a5dc4 100644
--- a/source/slang/slang-language-server.cpp
+++ b/source/slang/slang-language-server.cpp
@@ -654,9 +654,12 @@ SlangResult LanguageServer::hover(
             maybeAppendAdditionalOverloadsHint();
             auto nodeHumaneLoc =
                 version->linkage->getSourceManager()->getHumaneLoc(leafNode->loc);
-            hover.range.start.line = int(nodeHumaneLoc.line - 1);
-            hover.range.end.line = int(nodeHumaneLoc.line - 1);
-            hover.range.start.character = int(nodeHumaneLoc.column - 1);
+            doc->oneBasedUTF8LocToZeroBasedUTF16Loc(
+                nodeHumaneLoc.line,
+                nodeHumaneLoc.column,
+                hover.range.start.line,
+                hover.range.start.character);
+            hover.range.end = hover.range.start;
             auto name = declRef.getName();
             if (auto ctorDecl = declRef.as<ConstructorDecl>())
             {
@@ -668,17 +671,19 @@ SlangResult LanguageServer::hover(
             }
             if (name)
             {
-                hover.range.end.character = int(nodeHumaneLoc.column + name->text.getLength() - 1);
+                hover.range.end.character = hover.range.start.character + (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice());
             }
         }
     };
     auto fillLoc = [&](SourceLoc loc)
     {
         auto humaneLoc = version->linkage->getSourceManager()->getHumaneLoc(loc, SourceLocType::Actual);
-        hover.range.start.line = int(humaneLoc.line - 1);
-        hover.range.end.line = int(humaneLoc.line - 1);
-        hover.range.start.character = int(humaneLoc.column - 1);
-        hover.range.end.character = hover.range.start.character + int(doc->getTokenLength(humaneLoc.line, humaneLoc.column));
+        doc->oneBasedUTF8LocToZeroBasedUTF16Loc(humaneLoc.line, humaneLoc.column, hover.range.start.line, hover.range.start.character);
+        doc->oneBasedUTF8LocToZeroBasedUTF16Loc(
+            humaneLoc.line,
+            humaneLoc.column + doc->getTokenLength(humaneLoc.line, humaneLoc.column),
+            hover.range.end.line,
+            hover.range.end.character);
     };
     auto fillExprHoverInfo = [&](Expr* expr)
     {
@@ -851,7 +856,7 @@ SlangResult LanguageServer::gotoDefinition(
                                                                 : declRefExpr->declRef.getLoc(),
                     SourceLocType::Actual);
             auto name = declRefExpr->declRef.getName();
-            locations.add(LocationResult{location, name ? (int)name->text.getLength() : 0});
+            locations.add(LocationResult{location, name ? (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()) : 0});
         }
     }
     else if (auto overloadedExpr = as<OverloadedExpr>(leafNode))
@@ -863,7 +868,7 @@ SlangResult LanguageServer::gotoDefinition(
                 auto location = version->linkage->getSourceManager()->getHumaneLoc(
                     item.declRef.getNameLoc(), SourceLocType::Actual);
                 auto name = item.declRef.getName();
-                locations.add(LocationResult{location, name ? (int)name->text.getLength() : 0});
+                locations.add(LocationResult{location, name ? (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()) : 0});
             }
         }
         else 
@@ -874,7 +879,7 @@ SlangResult LanguageServer::gotoDefinition(
                 auto location = version->linkage->getSourceManager()->getHumaneLoc(
                     item.declRef.getNameLoc(), SourceLocType::Actual);
                 auto name = item.declRef.getName();
-                locations.add(LocationResult{location, name ? (int)name->text.getLength() : 0});
+                locations.add(LocationResult{location, name ? (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()) : 0});
             }
         }
     }
@@ -909,8 +914,11 @@ SlangResult LanguageServer::gotoDefinition(
             {
                 result.uri =
                     URI::fromLocalFilePath(loc.loc.pathInfo.foundPath.getUnownedSlice()).uri;
-                result.range.start.line = int(loc.loc.line - 1);
-                result.range.start.character = int(loc.loc.column - 1);
+                doc->oneBasedUTF8LocToZeroBasedUTF16Loc(
+                    loc.loc.line,
+                    loc.loc.column,
+                    result.range.start.line,
+                    result.range.start.character);
                 result.range.end = result.range.start;
                 result.range.end.character += loc.length;
                 results.add(result);
diff --git a/source/slang/slang-workspace-version.cpp b/source/slang/slang-workspace-version.cpp
index a07ef75cc..d85724328 100644
--- a/source/slang/slang-workspace-version.cpp
+++ b/source/slang/slang-workspace-version.cpp
@@ -396,42 +396,66 @@ void DocumentVersion::setText(const String& newText)
 {
     text = newText;
     StringUtil::calcLines(text.getUnownedSlice(), lines);
-    utf16CharStarts.clear();
+    mapUTF16CharIndexToCodePointIndex.clear();
+    mapCodePointIndexToUTF8ByteOffset.clear();
 }
-ArrayView<Index> DocumentVersion::getUTF16Boundaries(Index line)
+
+void DocumentVersion::ensureUTFBoundsAvailable() // Appends one entry per line to both lookup tables; the accessors in this file call it only while their table is empty.
 {
-    if (!utf16CharStarts.getCount())
+    for (auto slice : lines) // tables are appended in line order, so index (line - 1) addresses a line
     {
-        for (auto slice : lines)
+        List<Index> bounds;        // UTF-16 code unit index -> code point index
+        List<Index> utf8Bounds;    // code point index -> UTF-8 byte offset within the line
+        Index index = 0;           // read cursor into `slice`, in bytes
+        Index codePointIndex = 0;  // number of code points decoded so far
+        while (index < slice.getLength())
         {
-            List<Index> bounds;
-            Index index = 0;
-            while (index < slice.getLength())
-            {
-                auto startIndex = index;
-                const Char32 codePoint = getUnicodePointFromUTF8(
-                    [&]() -> Byte
-                    {
-                        if (index < slice.getLength())
-                            return slice[index++];
-                        else
-                            return '\0';
-                    });
-                if (!codePoint)
-                    break;
-                Char16 buffer[2];
-                int count = encodeUnicodePointToUTF16Reversed(codePoint, buffer);
-                for (int i = 0; i < count; i++)
-                    bounds.add(startIndex);
-            }
-            bounds.add(slice.getLength());
-            utf16CharStarts.add(_Move(bounds));
+            auto startIndex = index;
+            const Char32 codePoint = getUnicodePointFromUTF8(
+                [&]() -> Byte
+                {
+                    if (index < slice.getLength())
+                        return slice[index++];
+                    else
+                        return '\0';
+                });
+            if (!codePoint) // a decoded 0 ends the scan of this line
+                break;
+            // One `bounds` entry per UTF-16 code unit of this code point; one `utf8Bounds` entry per code point.
+            Char16 buffer[2];
+            int count = encodeUnicodePointToUTF16Reversed(codePoint, buffer);
+            for (int i = 0; i < count; i++)
+                bounds.add(codePointIndex);
+            utf8Bounds.add(startIndex);
+            codePointIndex++;
         }
+        bounds.add(codePointIndex);        // end-of-line sentinel, in code point units like every other entry (not the byte length)
+        utf8Bounds.add(slice.getLength()); // end-of-line sentinel, in bytes
+        mapUTF16CharIndexToCodePointIndex.add(_Move(bounds));
+        mapCodePointIndexToUTF8ByteOffset.add(_Move(utf8Bounds));
     }
-    return line >= 1 && line <= utf16CharStarts.getCount() ? utf16CharStarts[line - 1].getArrayView()
+}
+
+ArrayView<Index> DocumentVersion::getUTF16Boundaries(Index line) // `line` is 1-based; an out-of-range line yields an empty view
+{
+    if (!mapUTF16CharIndexToCodePointIndex.getCount()) // table is empty (setText clears it): build the per-line tables first
+    {
+        ensureUTFBoundsAvailable();
+    }
+    return line >= 1 && line <= mapUTF16CharIndexToCodePointIndex.getCount() ? mapUTF16CharIndexToCodePointIndex[line - 1].getArrayView()
                                                            : ArrayView<Index>();
 }
 
+ArrayView<Index> DocumentVersion::getUTF8Boundaries(Index line)
+{
+    // Build the per-line tables on first use; `line` is 1-based.
+    if (mapCodePointIndexToUTF8ByteOffset.getCount() == 0)
+        ensureUTFBoundsAvailable();
+    if (line < 1 || line > mapCodePointIndexToUTF8ByteOffset.getCount())
+        return ArrayView<Index>();
+    return mapCodePointIndexToUTF8ByteOffset[line - 1].getArrayView();
+}
+
 void DocumentVersion::oneBasedUTF8LocToZeroBasedUTF16Loc(
     Index inLine, Index inCol, Index& outLine, Index& outCol)
 {
@@ -447,6 +471,15 @@ void DocumentVersion::oneBasedUTF8LocToZeroBasedUTF16Loc(
     outCol = std::lower_bound(bounds.begin(), bounds.end(), inCol - 1) - bounds.begin();
 }
 
+void DocumentVersion::oneBasedUTF8LocToZeroBasedUTF16Loc(
+    Index inLine, Index inCol, int& outLine, int& outCol)
+{
+    Index wideLine, wideCol; // receive the outputs of the Index-based overload
+    oneBasedUTF8LocToZeroBasedUTF16Loc(inLine, inCol, wideLine, wideCol);
+    outLine = static_cast<int>(wideLine); // narrow Index -> int for the int-typed outputs
+    outCol = static_cast<int>(wideCol);
+}
+
 void DocumentVersion::zeroBasedUTF16LocToOneBasedUTF8Loc(
     Index inLine, Index inCol, Index& outLine, Index& outCol)
 {
diff --git a/source/slang/slang-workspace-version.h b/source/slang/slang-workspace-version.h
index 44ab6b43c..d6cbfe6c5 100644
--- a/source/slang/slang-workspace-version.h
+++ b/source/slang/slang-workspace-version.h
@@ -20,7 +20,8 @@ namespace Slang
         String path;
         String text;
         List<UnownedStringSlice> lines;
-        List<List<Index>> utf16CharStarts;
+        List<List<Index>> mapUTF16CharIndexToCodePointIndex;
+        List<List<Index>> mapCodePointIndexToUTF8ByteOffset;
     public:
         void setPath(String filePath)
         {
@@ -32,10 +33,14 @@ namespace Slang
         const String& getText() { return text; }
         void setText(const String& newText);
 
+        void ensureUTFBoundsAvailable();
         ArrayView<Index> getUTF16Boundaries(Index line);
+        ArrayView<Index> getUTF8Boundaries(Index line);
 
         void oneBasedUTF8LocToZeroBasedUTF16Loc(
             Index inLine, Index inCol, Index& outLine, Index& outCol);
+        void oneBasedUTF8LocToZeroBasedUTF16Loc(
+            Index inLine, Index inCol, int& outLine, int& outCol);
         void zeroBasedUTF16LocToOneBasedUTF8Loc(
             Index inLine, Index inCol, Index& outLine, Index& outCol);
 
@@ -60,7 +65,11 @@ namespace Slang
                 return -1;
 
             Index lineStart = lineIndex >= 1 ? getLineStart(lines[lineIndex - 1]) : 0;
-            return lineStart + colIndex - 1;
+            auto boundaries = getUTF8Boundaries(lineIndex);
+            Index byteOffset = 0;
+            if (colIndex > 0 && colIndex <= boundaries.getCount())
+                byteOffset = boundaries[colIndex - 1];
+            return lineStart + byteOffset;
         }
 
         // Get 1-based, utf-8 encoding location from offset.
@@ -81,6 +90,8 @@ namespace Slang
             {
                 col = Index(offset - getLineStart(lines[line-1])) + 1;
             }
+            if (line > 0 && line <= lines.getCount())
+                col = UTF8Util::calcCodePointCount(lines[line-1].head(col));
         }
 
         // Get line from 1-based index.
author	Yong He <yonghe@outlook.com>	2024-08-12 20:53:03 -0700
committer	GitHub <noreply@github.com>	2024-08-12 20:53:03 -0700
commit	b390566b55700582321b09b72c726b8dff9bd819 (patch)
tree	a2fd8e50fcbde29dd2651e08a78021f2ae9d72de /source
parent	20bd48659d0009de5477380c335e2419f4c66f8b (diff)