summaryrefslogtreecommitdiff
path: root/source/compiler-core
diff options
context:
space:
mode:
authorYong He <yonghe@outlook.com>2024-08-12 20:53:03 -0700
committerGitHub <noreply@github.com>2024-08-12 20:53:03 -0700
commitb390566b55700582321b09b72c726b8dff9bd819 (patch)
treea2fd8e50fcbde29dd2651e08a78021f2ae9d72de /source/compiler-core
parent20bd48659d0009de5477380c335e2419f4c66f8b (diff)
Support unicode identifier names. (#4772)
* Support unicode identifier names. * Fix. * Fix language server. * Fix build errors. * Fix. * Fix offset translation in language server.
Diffstat (limited to 'source/compiler-core')
-rw-r--r--source/compiler-core/slang-lexer.cpp40
1 files changed, 28 insertions, 12 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp
index 7d84ed938..366af9114 100644
--- a/source/compiler-core/slang-lexer.cpp
+++ b/source/compiler-core/slang-lexer.cpp
@@ -5,9 +5,9 @@
// input bytes and turning it into semantically useful tokens.
//
+#include "core/slang-char-encode.h"
#include "slang-name.h"
#include "slang-source-loc.h"
-
#include "slang-core-diagnostics.h"
namespace Slang
@@ -205,7 +205,6 @@ namespace Slang
c = e;
continue;
}
-
default:
break;
}
@@ -214,8 +213,12 @@ namespace Slang
// some newlines
break;
}
- // TODO: handle UTF-8 encoding for non-ASCII code points here
-
+ if (isUtf8LeadingByte((Byte)c))
+ {
+ // Consume all unicode characters.
+ pos--;
+ c = getUnicodePointFromUTF8([&]() {return lexer->m_cursor[pos++]; });
+ }
// Default case is to just hand along the byte we read as an ASCII code point.
} while (offset--);
@@ -262,7 +265,12 @@ namespace Slang
}
}
- // TODO: Need to handle non-ASCII code points.
+ // Consume all unicode characters.
+ if (isUtf8LeadingByte((Byte)c))
+ {
+ lexer->m_cursor--;
+ c = getUnicodePointFromUTF8([&]() {return *lexer->m_cursor++; });
+ }
// Default case is to return the raw byte we saw.
return c;
@@ -340,6 +348,11 @@ namespace Slang
}
}
+ static bool isNonAsciiCodePoint(unsigned int codePoint)
+ {
+ return codePoint != 0xFFFFFFFF && codePoint >= 0x80;
+ }
+
static void _lexIdentifier(Lexer* lexer)
{
for(;;)
@@ -348,12 +361,12 @@ namespace Slang
if(('a' <= c ) && (c <= 'z')
|| ('A' <= c) && (c <= 'Z')
|| ('0' <= c) && (c <= '9')
- || (c == '_'))
+ || (c == '_')
+ || isNonAsciiCodePoint((unsigned int)c))
{
_advance(lexer);
continue;
}
-
return;
}
}
@@ -1052,7 +1065,8 @@ namespace Slang
static TokenType _lexTokenImpl(Lexer* lexer)
{
- switch(_peek(lexer))
+ int nextCodePoint = _peek(lexer);
+ switch(nextCodePoint)
{
default:
break;
@@ -1358,10 +1372,12 @@ namespace Slang
}
- // TODO(tfoley): If we ever wanted to support proper Unicode
- // in identifiers, etc., then this would be the right place
- // to perform a more expensive dispatch based on the actual
- // code point (and not just the first byte).
+ // We treat all unicode characters as a part of an identifier.
+ if (isNonAsciiCodePoint(nextCodePoint))
+ {
+ _lexIdentifier(lexer);
+ return TokenType::Identifier;
+ }
{
// If none of the above cases matched, then we have an