Support unicode identifier names. (#4772)

* Support unicode identifier names. * Fix. * Fix language server. * Fix build errors. * Fix. * Fix offset translation in language server.
author: Yong He <yonghe@outlook.com> 2024-08-12 20:53:03 -0700
committer: GitHub <noreply@github.com> 2024-08-12 20:53:03 -0700
commit: b390566b55700582321b09b72c726b8dff9bd819 (patch)
tree: a2fd8e50fcbde29dd2651e08a78021f2ae9d72de /source/compiler-core
parent: 20bd48659d0009de5477380c335e2419f4c66f8b (diff)
1 files changed, 28 insertions, 12 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp
index 7d84ed938..366af9114 100644
--- a/source/compiler-core/slang-lexer.cpp
+++ b/source/compiler-core/slang-lexer.cpp
@@ -5,9 +5,9 @@
 // input bytes and turning it into semantically useful tokens.
 //
 
+#include "core/slang-char-encode.h"
 #include "slang-name.h"
 #include "slang-source-loc.h"
-
 #include "slang-core-diagnostics.h"
 
 namespace Slang
@@ -205,7 +205,6 @@ namespace Slang
                         c = e;
                     continue;
                 }
-
                 default:
                     break;
                 }
@@ -214,8 +213,12 @@ namespace Slang
                 // some newlines
                 break;
             }
-            // TODO: handle UTF-8 encoding for non-ASCII code points here
-
+            if (isUtf8LeadingByte((Byte)c))
+            {
+                // Rewind to the leading byte and decode the whole UTF-8 sequence as one code point.
+                pos--;
+                c = getUnicodePointFromUTF8([&]() {return lexer->m_cursor[pos++]; });
+            }
             // Default case is to just hand along the byte we read as an ASCII code point.
         } while (offset--);
 
@@ -262,7 +265,12 @@ namespace Slang
                 }
             }
 
-            // TODO: Need to handle non-ASCII code points.
+            // Decode a multi-byte UTF-8 sequence into a single code point.
+            if (isUtf8LeadingByte((Byte)c))
+            {
+                lexer->m_cursor--;
+                c = getUnicodePointFromUTF8([&]() {return *lexer->m_cursor++; });
+            }
 
             // Default case is to return the raw byte we saw.
             return c;
@@ -340,6 +348,11 @@ namespace Slang
         }
     }
 
+    static bool isNonAsciiCodePoint(unsigned int codePoint)
+    {
+        return codePoint != 0xFFFFFFFF && codePoint >= 0x80;
+    }
+
     static void _lexIdentifier(Lexer* lexer)
     {
         for(;;)
@@ -348,12 +361,12 @@ namespace Slang
             if(('a' <= c ) && (c <= 'z')
                 || ('A' <= c) && (c <= 'Z')
                 || ('0' <= c) && (c <= '9')
-                || (c == '_'))
+                || (c == '_')
+                || isNonAsciiCodePoint((unsigned int)c))
             {
                 _advance(lexer);
                 continue;
             }
-
             return;
         }
     }
@@ -1052,7 +1065,8 @@ namespace Slang
 
     static TokenType _lexTokenImpl(Lexer* lexer)
     {
-        switch(_peek(lexer))
+        int nextCodePoint = _peek(lexer);
+        switch(nextCodePoint)
         {
         default:
             break;
@@ -1358,10 +1372,12 @@ namespace Slang
 
         }
 
-        // TODO(tfoley): If we ever wanted to support proper Unicode
-        // in identifiers, etc., then this would be the right place
-        // to perform a more expensive dispatch based on the actual
-        // code point (and not just the first byte).
+        // We treat every non-ASCII Unicode code point as part of an identifier.
+        if (isNonAsciiCodePoint(nextCodePoint))
+        {
+            _lexIdentifier(lexer);
+            return TokenType::Identifier;
+        }
 
         {
             // If none of the above cases matched, then we have an
author	Yong He <yonghe@outlook.com>	2024-08-12 20:53:03 -0700
committer	GitHub <noreply@github.com>	2024-08-12 20:53:03 -0700
commit	b390566b55700582321b09b72c726b8dff9bd819 (patch)
tree	a2fd8e50fcbde29dd2651e08a78021f2ae9d72de /source/compiler-core
parent	20bd48659d0009de5477380c335e2419f4c66f8b (diff)