summary refs log tree commit diff stats
path: root/source
diff options
context:
space:
mode:
author Yong He <yonghe@outlook.com> 2024-08-12 20:53:03 -0700
committer GitHub <noreply@github.com> 2024-08-12 20:53:03 -0700
commit b390566b55700582321b09b72c726b8dff9bd819 (patch)
tree a2fd8e50fcbde29dd2651e08a78021f2ae9d72de /source
parent 20bd48659d0009de5477380c335e2419f4c66f8b (diff)
Support unicode identifier names. (#4772)
* Support unicode identifier names.
* Fix.
* Fix language server.
* Fix build errors.
* Fix.
* Fix offset translation in language server.
Diffstat (limited to 'source')
-rw-r--r-- source/compiler-core/slang-lexer.cpp 40
-rw-r--r-- source/core/slang-char-encode.cpp 23
-rw-r--r-- source/core/slang-char-encode.h 4
-rw-r--r-- source/core/slang-char-util.h 2
-rw-r--r-- source/core/slang-std-writers.cpp 9
-rw-r--r-- source/slang/slang-language-server-document-symbols.cpp 2
-rw-r--r-- source/slang/slang-language-server.cpp 34
-rw-r--r-- source/slang/slang-workspace-version.cpp 87
-rw-r--r-- source/slang/slang-workspace-version.h 15
9 files changed, 159 insertions, 57 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp
index 7d84ed938..366af9114 100644
--- a/source/compiler-core/slang-lexer.cpp
+++ b/source/compiler-core/slang-lexer.cpp
@@ -5,9 +5,9 @@
// input bytes and turning it into semantically useful tokens.
//
+#include "core/slang-char-encode.h"
#include "slang-name.h"
#include "slang-source-loc.h"
-
#include "slang-core-diagnostics.h"
namespace Slang
@@ -205,7 +205,6 @@ namespace Slang
c = e;
continue;
}
-
default:
break;
}
@@ -214,8 +213,12 @@ namespace Slang
// some newlines
break;
}
- // TODO: handle UTF-8 encoding for non-ASCII code points here
-
+ if (isUtf8LeadingByte((Byte)c))
+ {
+ // Consume all unicode characters.
+ pos--;
+ c = getUnicodePointFromUTF8([&]() {return lexer->m_cursor[pos++]; });
+ }
// Default case is to just hand along the byte we read as an ASCII code point.
} while (offset--);
@@ -262,7 +265,12 @@ namespace Slang
}
}
- // TODO: Need to handle non-ASCII code points.
+ // Consume all unicode characters.
+ if (isUtf8LeadingByte((Byte)c))
+ {
+ lexer->m_cursor--;
+ c = getUnicodePointFromUTF8([&]() {return *lexer->m_cursor++; });
+ }
// Default case is to return the raw byte we saw.
return c;
@@ -340,6 +348,11 @@ namespace Slang
}
}
+ static bool isNonAsciiCodePoint(unsigned int codePoint)
+ {
+ return codePoint != 0xFFFFFFFF && codePoint >= 0x80;
+ }
+
static void _lexIdentifier(Lexer* lexer)
{
for(;;)
@@ -348,12 +361,12 @@ namespace Slang
if(('a' <= c ) && (c <= 'z')
|| ('A' <= c) && (c <= 'Z')
|| ('0' <= c) && (c <= '9')
- || (c == '_'))
+ || (c == '_')
+ || isNonAsciiCodePoint((unsigned int)c))
{
_advance(lexer);
continue;
}
-
return;
}
}
@@ -1052,7 +1065,8 @@ namespace Slang
static TokenType _lexTokenImpl(Lexer* lexer)
{
- switch(_peek(lexer))
+ int nextCodePoint = _peek(lexer);
+ switch(nextCodePoint)
{
default:
break;
@@ -1358,10 +1372,12 @@ namespace Slang
}
- // TODO(tfoley): If we ever wanted to support proper Unicode
- // in identifiers, etc., then this would be the right place
- // to perform a more expensive dispatch based on the actual
- // code point (and not just the first byte).
+ // We treat all unicode characters as a part of an identifier.
+ if (isNonAsciiCodePoint(nextCodePoint))
+ {
+ _lexIdentifier(lexer);
+ return TokenType::Identifier;
+ }
{
// If none of the above cases matched, then we have an
diff --git a/source/core/slang-char-encode.cpp b/source/core/slang-char-encode.cpp
index 105cfac7f..526c6c923 100644
--- a/source/core/slang-char-encode.cpp
+++ b/source/core/slang-char-encode.cpp
@@ -211,4 +211,27 @@ CharEncoding* CharEncoding::UTF32 = &_utf32Encoding;
return count;
}
+Index UTF8Util::calcUTF16CharCount(const UnownedStringSlice& in)
+{
+ Index count = 0;
+ Index readPtr = 0;
+ for (;;)
+ {
+ int c = getUnicodePointFromUTF8([&]() -> Byte
+ {
+ if (readPtr < in.getLength())
+ return in[readPtr++];
+ else
+ return 0;
+ });
+ if (c == 0)
+ break;
+ Char16 buffer[2];
+ count += encodeUnicodePointToUTF16(c, buffer);
+ if (readPtr >= in.getLength())
+ break;
+ }
+ return count;
+}
+
} // namespace Slang
diff --git a/source/core/slang-char-encode.h b/source/core/slang-char-encode.h
index 2bb4cba29..a7cd501ab 100644
--- a/source/core/slang-char-encode.h
+++ b/source/core/slang-char-encode.h
@@ -203,6 +203,10 @@ struct UTF8Util
/// Non valid utf8 input or ending starting in partial characters, will produce
/// undefined results without error.
static Index calcCodePointCount(const UnownedStringSlice& in);
+
+
+ /// Given a slice in UTF8, calculate the number of UTF16 characters needed to represent the string.
+ static Index calcUTF16CharCount(const UnownedStringSlice& in);
};
}
diff --git a/source/core/slang-char-util.h b/source/core/slang-char-util.h
index c65f676c4..88af24426 100644
--- a/source/core/slang-char-util.h
+++ b/source/core/slang-char-util.h
@@ -61,7 +61,7 @@ struct CharUtil
/// Returns the value if c interpretted as a octal digit
/// If c is not a valid octal returns -1
inline static int getOctalDigitValue(char c) { return isOctalDigit(c) ? (c - '0') : -1; }
-
+
struct CharFlagMap
{
Flags flags[0x100];
diff --git a/source/core/slang-std-writers.cpp b/source/core/slang-std-writers.cpp
index a23d878fb..264f37c98 100644
--- a/source/core/slang-std-writers.cpp
+++ b/source/core/slang-std-writers.cpp
@@ -1,6 +1,10 @@
#include "slang-std-writers.h"
+#if SLANG_WINDOWS_FAMILY
+#include <Windows.h>
+#endif
+
namespace Slang
{
@@ -8,8 +12,11 @@ namespace Slang
/* static */RefPtr<StdWriters> StdWriters::createDefault()
{
+#if SLANG_WINDOWS_FAMILY
+ SetConsoleCP(CP_UTF8);
+ SetConsoleOutputCP(CP_UTF8);
+#endif
RefPtr<StdWriters> stdWriters(new StdWriters);
-
RefPtr<FileWriter> stdError(new FileWriter(stderr, WriterFlag::AutoFlush | WriterFlag::IsUnowned));
RefPtr<FileWriter> stdOut(new FileWriter(stdout, WriterFlag::AutoFlush | WriterFlag::IsUnowned));
diff --git a/source/slang/slang-language-server-document-symbols.cpp b/source/slang/slang-language-server-document-symbols.cpp
index ec9b434eb..45a76e97b 100644
--- a/source/slang/slang-language-server-document-symbols.cpp
+++ b/source/slang/slang-language-server-document-symbols.cpp
@@ -167,7 +167,7 @@ namespace Slang
sym.selectionRange.start.line = (int)line;
sym.selectionRange.start.character = (int)col;
sym.selectionRange.end.line = (int)line;
- sym.selectionRange.end.character = (int)(col + nameLoc.name->text.getLength());
+ sym.selectionRange.end.character = (int)(col + (int)UTF8Util::calcUTF16CharCount(nameLoc.name->text.getUnownedSlice()));
sym.range.start.line = (int)line;
sym.range.start.character = 0;
sym.range.end.line = (int)line;
diff --git a/source/slang/slang-language-server.cpp b/source/slang/slang-language-server.cpp
index 09b14932c..ed03a5dc4 100644
--- a/source/slang/slang-language-server.cpp
+++ b/source/slang/slang-language-server.cpp
@@ -654,9 +654,12 @@ SlangResult LanguageServer::hover(
maybeAppendAdditionalOverloadsHint();
auto nodeHumaneLoc =
version->linkage->getSourceManager()->getHumaneLoc(leafNode->loc);
- hover.range.start.line = int(nodeHumaneLoc.line - 1);
- hover.range.end.line = int(nodeHumaneLoc.line - 1);
- hover.range.start.character = int(nodeHumaneLoc.column - 1);
+ doc->oneBasedUTF8LocToZeroBasedUTF16Loc(
+ nodeHumaneLoc.line,
+ nodeHumaneLoc.column,
+ hover.range.start.line,
+ hover.range.start.character);
+ hover.range.end = hover.range.start;
auto name = declRef.getName();
if (auto ctorDecl = declRef.as<ConstructorDecl>())
{
@@ -668,17 +671,19 @@ SlangResult LanguageServer::hover(
}
if (name)
{
- hover.range.end.character = int(nodeHumaneLoc.column + name->text.getLength() - 1);
+ hover.range.end.character = hover.range.start.character + (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice());
}
}
};
auto fillLoc = [&](SourceLoc loc)
{
auto humaneLoc = version->linkage->getSourceManager()->getHumaneLoc(loc, SourceLocType::Actual);
- hover.range.start.line = int(humaneLoc.line - 1);
- hover.range.end.line = int(humaneLoc.line - 1);
- hover.range.start.character = int(humaneLoc.column - 1);
- hover.range.end.character = hover.range.start.character + int(doc->getTokenLength(humaneLoc.line, humaneLoc.column));
+ doc->oneBasedUTF8LocToZeroBasedUTF16Loc(humaneLoc.line, humaneLoc.column, hover.range.start.line, hover.range.start.character);
+ doc->oneBasedUTF8LocToZeroBasedUTF16Loc(
+ humaneLoc.line,
+ humaneLoc.column + doc->getTokenLength(humaneLoc.line, humaneLoc.column),
+ hover.range.end.line,
+ hover.range.end.character);
};
auto fillExprHoverInfo = [&](Expr* expr)
{
@@ -851,7 +856,7 @@ SlangResult LanguageServer::gotoDefinition(
: declRefExpr->declRef.getLoc(),
SourceLocType::Actual);
auto name = declRefExpr->declRef.getName();
- locations.add(LocationResult{location, name ? (int)name->text.getLength() : 0});
+ locations.add(LocationResult{location, name ? (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()) : 0});
}
}
else if (auto overloadedExpr = as<OverloadedExpr>(leafNode))
@@ -863,7 +868,7 @@ SlangResult LanguageServer::gotoDefinition(
auto location = version->linkage->getSourceManager()->getHumaneLoc(
item.declRef.getNameLoc(), SourceLocType::Actual);
auto name = item.declRef.getName();
- locations.add(LocationResult{location, name ? (int)name->text.getLength() : 0});
+ locations.add(LocationResult{location, name ? (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()) : 0});
}
}
else
@@ -874,7 +879,7 @@ SlangResult LanguageServer::gotoDefinition(
auto location = version->linkage->getSourceManager()->getHumaneLoc(
item.declRef.getNameLoc(), SourceLocType::Actual);
auto name = item.declRef.getName();
- locations.add(LocationResult{location, name ? (int)name->text.getLength() : 0});
+ locations.add(LocationResult{location, name ? (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()) : 0});
}
}
}
@@ -909,8 +914,11 @@ SlangResult LanguageServer::gotoDefinition(
{
result.uri =
URI::fromLocalFilePath(loc.loc.pathInfo.foundPath.getUnownedSlice()).uri;
- result.range.start.line = int(loc.loc.line - 1);
- result.range.start.character = int(loc.loc.column - 1);
+ doc->oneBasedUTF8LocToZeroBasedUTF16Loc(
+ loc.loc.line,
+ loc.loc.column,
+ result.range.start.line,
+ result.range.start.character);
result.range.end = result.range.start;
result.range.end.character += loc.length;
results.add(result);
diff --git a/source/slang/slang-workspace-version.cpp b/source/slang/slang-workspace-version.cpp
index a07ef75cc..d85724328 100644
--- a/source/slang/slang-workspace-version.cpp
+++ b/source/slang/slang-workspace-version.cpp
@@ -396,42 +396,66 @@ void DocumentVersion::setText(const String& newText)
{
text = newText;
StringUtil::calcLines(text.getUnownedSlice(), lines);
- utf16CharStarts.clear();
+ mapUTF16CharIndexToCodePointIndex.clear();
+ mapCodePointIndexToUTF8ByteOffset.clear();
}
-ArrayView<Index> DocumentVersion::getUTF16Boundaries(Index line)
+
+void DocumentVersion::ensureUTFBoundsAvailable()
{
- if (!utf16CharStarts.getCount())
+ for (auto slice : lines)
{
- for (auto slice : lines)
+ List<Index> bounds;
+ List<Index> utf8Bounds;
+ Index index = 0;
+ Index codePointIndex = 0;
+ while (index < slice.getLength())
{
- List<Index> bounds;
- Index index = 0;
- while (index < slice.getLength())
- {
- auto startIndex = index;
- const Char32 codePoint = getUnicodePointFromUTF8(
- [&]() -> Byte
- {
- if (index < slice.getLength())
- return slice[index++];
- else
- return '\0';
- });
- if (!codePoint)
- break;
- Char16 buffer[2];
- int count = encodeUnicodePointToUTF16Reversed(codePoint, buffer);
- for (int i = 0; i < count; i++)
- bounds.add(startIndex);
- }
- bounds.add(slice.getLength());
- utf16CharStarts.add(_Move(bounds));
+ auto startIndex = index;
+ const Char32 codePoint = getUnicodePointFromUTF8(
+ [&]() -> Byte
+ {
+ if (index < slice.getLength())
+ return slice[index++];
+ else
+ return '\0';
+ });
+ if (!codePoint)
+ break;
+
+ Char16 buffer[2];
+ int count = encodeUnicodePointToUTF16Reversed(codePoint, buffer);
+ for (int i = 0; i < count; i++)
+ bounds.add(codePointIndex);
+ utf8Bounds.add(startIndex);
+ codePointIndex++;
}
+ bounds.add(slice.getLength());
+ utf8Bounds.add(slice.getLength());
+ mapUTF16CharIndexToCodePointIndex.add(_Move(bounds));
+ mapCodePointIndexToUTF8ByteOffset.add(_Move(utf8Bounds));
}
- return line >= 1 && line <= utf16CharStarts.getCount() ? utf16CharStarts[line - 1].getArrayView()
+}
+
+ArrayView<Index> DocumentVersion::getUTF16Boundaries(Index line)
+{
+ if (!mapUTF16CharIndexToCodePointIndex.getCount())
+ {
+ ensureUTFBoundsAvailable();
+ }
+ return line >= 1 && line <= mapUTF16CharIndexToCodePointIndex.getCount() ? mapUTF16CharIndexToCodePointIndex[line - 1].getArrayView()
: ArrayView<Index>();
}
+ArrayView<Index> DocumentVersion::getUTF8Boundaries(Index line)
+{
+ if (!mapCodePointIndexToUTF8ByteOffset.getCount())
+ {
+ ensureUTFBoundsAvailable();
+ }
+ return line >= 1 && line <= mapCodePointIndexToUTF8ByteOffset.getCount() ? mapCodePointIndexToUTF8ByteOffset[line - 1].getArrayView()
+ : ArrayView<Index>();
+}
+
void DocumentVersion::oneBasedUTF8LocToZeroBasedUTF16Loc(
Index inLine, Index inCol, Index& outLine, Index& outCol)
{
@@ -447,6 +471,15 @@ void DocumentVersion::oneBasedUTF8LocToZeroBasedUTF16Loc(
outCol = std::lower_bound(bounds.begin(), bounds.end(), inCol - 1) - bounds.begin();
}
+void DocumentVersion::oneBasedUTF8LocToZeroBasedUTF16Loc(
+ Index inLine, Index inCol, int& outLine, int& outCol)
+{
+ Index ioutLine, ioutCol;
+ oneBasedUTF8LocToZeroBasedUTF16Loc(inLine, inCol, ioutLine, ioutCol);
+ outLine = (int)ioutLine;
+ outCol = (int)ioutCol;
+}
+
void DocumentVersion::zeroBasedUTF16LocToOneBasedUTF8Loc(
Index inLine, Index inCol, Index& outLine, Index& outCol)
{
diff --git a/source/slang/slang-workspace-version.h b/source/slang/slang-workspace-version.h
index 44ab6b43c..d6cbfe6c5 100644
--- a/source/slang/slang-workspace-version.h
+++ b/source/slang/slang-workspace-version.h
@@ -20,7 +20,8 @@ namespace Slang
String path;
String text;
List<UnownedStringSlice> lines;
- List<List<Index>> utf16CharStarts;
+ List<List<Index>> mapUTF16CharIndexToCodePointIndex;
+ List<List<Index>> mapCodePointIndexToUTF8ByteOffset;
public:
void setPath(String filePath)
{
@@ -32,10 +33,14 @@ namespace Slang
const String& getText() { return text; }
void setText(const String& newText);
+ void ensureUTFBoundsAvailable();
ArrayView<Index> getUTF16Boundaries(Index line);
+ ArrayView<Index> getUTF8Boundaries(Index line);
void oneBasedUTF8LocToZeroBasedUTF16Loc(
Index inLine, Index inCol, Index& outLine, Index& outCol);
+ void oneBasedUTF8LocToZeroBasedUTF16Loc(
+ Index inLine, Index inCol, int& outLine, int& outCol);
void zeroBasedUTF16LocToOneBasedUTF8Loc(
Index inLine, Index inCol, Index& outLine, Index& outCol);
@@ -60,7 +65,11 @@ namespace Slang
return -1;
Index lineStart = lineIndex >= 1 ? getLineStart(lines[lineIndex - 1]) : 0;
- return lineStart + colIndex - 1;
+ auto boundaries = getUTF8Boundaries(lineIndex);
+ Index byteOffset = 0;
+ if (colIndex > 0 && colIndex <= boundaries.getCount())
+ byteOffset = boundaries[colIndex - 1];
+ return lineStart + byteOffset;
}
// Get 1-based, utf-8 encoding location from offset.
@@ -81,6 +90,8 @@ namespace Slang
{
col = Index(offset - getLineStart(lines[line-1])) + 1;
}
+ if (line > 0 && line <= lines.getCount())
+ col = UTF8Util::calcCodePointCount(lines[line-1].head(col));
}
// Get line from 1-based index.