diff options
Diffstat (limited to 'source')
| -rw-r--r-- | source/compiler-core/slang-lexer.cpp | 40 | ||||
| -rw-r--r-- | source/core/slang-char-encode.cpp | 23 | ||||
| -rw-r--r-- | source/core/slang-char-encode.h | 4 | ||||
| -rw-r--r-- | source/core/slang-char-util.h | 2 | ||||
| -rw-r--r-- | source/core/slang-std-writers.cpp | 9 | ||||
| -rw-r--r-- | source/slang/slang-language-server-document-symbols.cpp | 2 | ||||
| -rw-r--r-- | source/slang/slang-language-server.cpp | 34 | ||||
| -rw-r--r-- | source/slang/slang-workspace-version.cpp | 87 | ||||
| -rw-r--r-- | source/slang/slang-workspace-version.h | 15 |
9 files changed, 159 insertions, 57 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp index 7d84ed938..366af9114 100644 --- a/source/compiler-core/slang-lexer.cpp +++ b/source/compiler-core/slang-lexer.cpp @@ -5,9 +5,9 @@ // input bytes and turning it into semantically useful tokens. // +#include "core/slang-char-encode.h" #include "slang-name.h" #include "slang-source-loc.h" - #include "slang-core-diagnostics.h" namespace Slang @@ -205,7 +205,6 @@ namespace Slang c = e; continue; } - default: break; } @@ -214,8 +213,12 @@ namespace Slang // some newlines break; } - // TODO: handle UTF-8 encoding for non-ASCII code points here - + if (isUtf8LeadingByte((Byte)c)) + { + // Consume all unicode characters. + pos--; + c = getUnicodePointFromUTF8([&]() {return lexer->m_cursor[pos++]; }); + } // Default case is to just hand along the byte we read as an ASCII code point. } while (offset--); @@ -262,7 +265,12 @@ namespace Slang } } - // TODO: Need to handle non-ASCII code points. + // Consume all unicode characters. + if (isUtf8LeadingByte((Byte)c)) + { + lexer->m_cursor--; + c = getUnicodePointFromUTF8([&]() {return *lexer->m_cursor++; }); + } // Default case is to return the raw byte we saw. return c; @@ -340,6 +348,11 @@ namespace Slang } } + static bool isNonAsciiCodePoint(unsigned int codePoint) + { + return codePoint != 0xFFFFFFFF && codePoint >= 0x80; + } + static void _lexIdentifier(Lexer* lexer) { for(;;) @@ -348,12 +361,12 @@ namespace Slang if(('a' <= c ) && (c <= 'z') || ('A' <= c) && (c <= 'Z') || ('0' <= c) && (c <= '9') - || (c == '_')) + || (c == '_') + || isNonAsciiCodePoint((unsigned int)c)) { _advance(lexer); continue; } - return; } } @@ -1052,7 +1065,8 @@ namespace Slang static TokenType _lexTokenImpl(Lexer* lexer) { - switch(_peek(lexer)) + int nextCodePoint = _peek(lexer); + switch(nextCodePoint) { default: break; @@ -1358,10 +1372,12 @@ namespace Slang } - // TODO(tfoley): If we ever wanted to support proper Unicode - // in identifiers, etc., then this would be the right place - // to perform a more expensive dispatch based on the actual - // code point (and not just the first byte). + // We treat all unicode characters as a part of an identifier. + if (isNonAsciiCodePoint(nextCodePoint)) + { + _lexIdentifier(lexer); + return TokenType::Identifier; + } { // If none of the above cases matched, then we have an diff --git a/source/core/slang-char-encode.cpp b/source/core/slang-char-encode.cpp index 105cfac7f..526c6c923 100644 --- a/source/core/slang-char-encode.cpp +++ b/source/core/slang-char-encode.cpp @@ -211,4 +211,27 @@ CharEncoding* CharEncoding::UTF32 = &_utf32Encoding; return count; } +Index UTF8Util::calcUTF16CharCount(const UnownedStringSlice& in) +{ + Index count = 0; + Index readPtr = 0; + for (;;) + { + int c = getUnicodePointFromUTF8([&]() -> Byte + { + if (readPtr < in.getLength()) + return in[readPtr++]; + else + return 0; + }); + if (c == 0) + break; + Char16 buffer[2]; + count += encodeUnicodePointToUTF16(c, buffer); + if (readPtr >= in.getLength()) + break; + } + return count; +} + } // namespace Slang diff --git a/source/core/slang-char-encode.h b/source/core/slang-char-encode.h index 2bb4cba29..a7cd501ab 100644 --- a/source/core/slang-char-encode.h +++ b/source/core/slang-char-encode.h @@ -203,6 +203,10 @@ struct UTF8Util /// Non valid utf8 input or ending starting in partial characters, will produce /// undefined results without error. static Index calcCodePointCount(const UnownedStringSlice& in); + + + /// Given a slice in UTF8, calculate the number of UTF16 characters needed to represent the string. + static Index calcUTF16CharCount(const UnownedStringSlice& in); }; } diff --git a/source/core/slang-char-util.h b/source/core/slang-char-util.h index c65f676c4..88af24426 100644 --- a/source/core/slang-char-util.h +++ b/source/core/slang-char-util.h @@ -61,7 +61,7 @@ struct CharUtil /// Returns the value if c interpretted as a octal digit /// If c is not a valid octal returns -1 inline static int getOctalDigitValue(char c) { return isOctalDigit(c) ? (c - '0') : -1; } - + struct CharFlagMap { Flags flags[0x100]; diff --git a/source/core/slang-std-writers.cpp b/source/core/slang-std-writers.cpp index a23d878fb..264f37c98 100644 --- a/source/core/slang-std-writers.cpp +++ b/source/core/slang-std-writers.cpp @@ -1,6 +1,10 @@ #include "slang-std-writers.h" +#if SLANG_WINDOWS_FAMILY +#include <Windows.h> +#endif + namespace Slang { @@ -8,8 +12,11 @@ namespace Slang /* static */RefPtr<StdWriters> StdWriters::createDefault() { +#if SLANG_WINDOWS_FAMILY + SetConsoleCP(CP_UTF8); + SetConsoleOutputCP(CP_UTF8); +#endif RefPtr<StdWriters> stdWriters(new StdWriters); - RefPtr<FileWriter> stdError(new FileWriter(stderr, WriterFlag::AutoFlush | WriterFlag::IsUnowned)); RefPtr<FileWriter> stdOut(new FileWriter(stdout, WriterFlag::AutoFlush | WriterFlag::IsUnowned)); diff --git a/source/slang/slang-language-server-document-symbols.cpp b/source/slang/slang-language-server-document-symbols.cpp index ec9b434eb..45a76e97b 100644 --- a/source/slang/slang-language-server-document-symbols.cpp +++ b/source/slang/slang-language-server-document-symbols.cpp @@ -167,7 +167,7 @@ namespace Slang sym.selectionRange.start.line = (int)line; sym.selectionRange.start.character = (int)col; sym.selectionRange.end.line = (int)line; - sym.selectionRange.end.character = (int)(col + nameLoc.name->text.getLength()); + sym.selectionRange.end.character = (int)(col + (int)UTF8Util::calcUTF16CharCount(nameLoc.name->text.getUnownedSlice())); sym.range.start.line = (int)line; sym.range.start.character = 0; sym.range.end.line = (int)line; diff --git a/source/slang/slang-language-server.cpp b/source/slang/slang-language-server.cpp index 09b14932c..ed03a5dc4 100644 --- a/source/slang/slang-language-server.cpp +++ b/source/slang/slang-language-server.cpp @@ -654,9 +654,12 @@ SlangResult LanguageServer::hover( maybeAppendAdditionalOverloadsHint(); auto nodeHumaneLoc = version->linkage->getSourceManager()->getHumaneLoc(leafNode->loc); - hover.range.start.line = int(nodeHumaneLoc.line - 1); - hover.range.end.line = int(nodeHumaneLoc.line - 1); - hover.range.start.character = int(nodeHumaneLoc.column - 1); + doc->oneBasedUTF8LocToZeroBasedUTF16Loc( + nodeHumaneLoc.line, + nodeHumaneLoc.column, + hover.range.start.line, + hover.range.start.character); + hover.range.end = hover.range.start; auto name = declRef.getName(); if (auto ctorDecl = declRef.as<ConstructorDecl>()) { @@ -668,17 +671,19 @@ SlangResult LanguageServer::hover( } if (name) { - hover.range.end.character = int(nodeHumaneLoc.column + name->text.getLength() - 1); + hover.range.end.character = hover.range.start.character + (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()); } } }; auto fillLoc = [&](SourceLoc loc) { auto humaneLoc = version->linkage->getSourceManager()->getHumaneLoc(loc, SourceLocType::Actual); - hover.range.start.line = int(humaneLoc.line - 1); - hover.range.end.line = int(humaneLoc.line - 1); - hover.range.start.character = int(humaneLoc.column - 1); - hover.range.end.character = hover.range.start.character + int(doc->getTokenLength(humaneLoc.line, humaneLoc.column)); + doc->oneBasedUTF8LocToZeroBasedUTF16Loc(humaneLoc.line, humaneLoc.column, hover.range.start.line, hover.range.start.character); + doc->oneBasedUTF8LocToZeroBasedUTF16Loc( + humaneLoc.line, + humaneLoc.column + doc->getTokenLength(humaneLoc.line, humaneLoc.column), + hover.range.end.line, + hover.range.end.character); }; auto fillExprHoverInfo = [&](Expr* expr) { @@ -851,7 +856,7 @@ SlangResult LanguageServer::gotoDefinition( : declRefExpr->declRef.getLoc(), SourceLocType::Actual); auto name = declRefExpr->declRef.getName(); - locations.add(LocationResult{location, name ? (int)name->text.getLength() : 0}); + locations.add(LocationResult{location, name ? (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()) : 0}); } } else if (auto overloadedExpr = as<OverloadedExpr>(leafNode)) @@ -863,7 +868,7 @@ SlangResult LanguageServer::gotoDefinition( auto location = version->linkage->getSourceManager()->getHumaneLoc( item.declRef.getNameLoc(), SourceLocType::Actual); auto name = item.declRef.getName(); - locations.add(LocationResult{location, name ? (int)name->text.getLength() : 0}); + locations.add(LocationResult{location, name ? (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()) : 0}); } } else @@ -874,7 +879,7 @@ SlangResult LanguageServer::gotoDefinition( auto location = version->linkage->getSourceManager()->getHumaneLoc( item.declRef.getNameLoc(), SourceLocType::Actual); auto name = item.declRef.getName(); - locations.add(LocationResult{location, name ? (int)name->text.getLength() : 0}); + locations.add(LocationResult{location, name ? (int)UTF8Util::calcUTF16CharCount(name->text.getUnownedSlice()) : 0}); } } } @@ -909,8 +914,11 @@ SlangResult LanguageServer::gotoDefinition( { result.uri = URI::fromLocalFilePath(loc.loc.pathInfo.foundPath.getUnownedSlice()).uri; - result.range.start.line = int(loc.loc.line - 1); - result.range.start.character = int(loc.loc.column - 1); + doc->oneBasedUTF8LocToZeroBasedUTF16Loc( + loc.loc.line, + loc.loc.column, + result.range.start.line, + result.range.start.character); result.range.end = result.range.start; result.range.end.character += loc.length; results.add(result); diff --git a/source/slang/slang-workspace-version.cpp b/source/slang/slang-workspace-version.cpp index a07ef75cc..d85724328 100644 --- a/source/slang/slang-workspace-version.cpp +++ b/source/slang/slang-workspace-version.cpp @@ -396,42 +396,66 @@ void DocumentVersion::setText(const String& newText) { text = newText; StringUtil::calcLines(text.getUnownedSlice(), lines); - utf16CharStarts.clear(); + mapUTF16CharIndexToCodePointIndex.clear(); + mapCodePointIndexToUTF8ByteOffset.clear(); } -ArrayView<Index> DocumentVersion::getUTF16Boundaries(Index line) + +void DocumentVersion::ensureUTFBoundsAvailable() { - if (!utf16CharStarts.getCount()) + for (auto slice : lines) { - for (auto slice : lines) + List<Index> bounds; + List<Index> utf8Bounds; + Index index = 0; + Index codePointIndex = 0; + while (index < slice.getLength()) { - List<Index> bounds; - Index index = 0; - while (index < slice.getLength()) - { - auto startIndex = index; - const Char32 codePoint = getUnicodePointFromUTF8( - [&]() -> Byte - { - if (index < slice.getLength()) - return slice[index++]; - else - return '\0'; - }); - if (!codePoint) - break; - Char16 buffer[2]; - int count = encodeUnicodePointToUTF16Reversed(codePoint, buffer); - for (int i = 0; i < count; i++) - bounds.add(startIndex); - } - bounds.add(slice.getLength()); - utf16CharStarts.add(_Move(bounds)); + auto startIndex = index; + const Char32 codePoint = getUnicodePointFromUTF8( + [&]() -> Byte + { + if (index < slice.getLength()) + return slice[index++]; + else + return '\0'; + }); + if (!codePoint) + break; + + Char16 buffer[2]; + int count = encodeUnicodePointToUTF16Reversed(codePoint, buffer); + for (int i = 0; i < count; i++) + bounds.add(codePointIndex); + utf8Bounds.add(startIndex); + codePointIndex++; } + bounds.add(slice.getLength()); + utf8Bounds.add(slice.getLength()); + mapUTF16CharIndexToCodePointIndex.add(_Move(bounds)); + mapCodePointIndexToUTF8ByteOffset.add(_Move(utf8Bounds)); } - return line >= 1 && line <= utf16CharStarts.getCount() ? utf16CharStarts[line - 1].getArrayView() +} + +ArrayView<Index> DocumentVersion::getUTF16Boundaries(Index line) +{ + if (!mapUTF16CharIndexToCodePointIndex.getCount()) + { + ensureUTFBoundsAvailable(); + } + return line >= 1 && line <= mapUTF16CharIndexToCodePointIndex.getCount() ? mapUTF16CharIndexToCodePointIndex[line - 1].getArrayView() : ArrayView<Index>(); } +ArrayView<Index> DocumentVersion::getUTF8Boundaries(Index line) +{ + if (!mapCodePointIndexToUTF8ByteOffset.getCount()) + { + ensureUTFBoundsAvailable(); + } + return line >= 1 && line <= mapCodePointIndexToUTF8ByteOffset.getCount() ? mapCodePointIndexToUTF8ByteOffset[line - 1].getArrayView() + : ArrayView<Index>(); +} + void DocumentVersion::oneBasedUTF8LocToZeroBasedUTF16Loc( Index inLine, Index inCol, Index& outLine, Index& outCol) { @@ -447,6 +471,15 @@ void DocumentVersion::oneBasedUTF8LocToZeroBasedUTF16Loc( outCol = std::lower_bound(bounds.begin(), bounds.end(), inCol - 1) - bounds.begin(); } +void DocumentVersion::oneBasedUTF8LocToZeroBasedUTF16Loc( + Index inLine, Index inCol, int& outLine, int& outCol) +{ + Index ioutLine, ioutCol; + oneBasedUTF8LocToZeroBasedUTF16Loc(inLine, inCol, ioutLine, ioutCol); + outLine = (int)ioutLine; + outCol = (int)ioutCol; +} + void DocumentVersion::zeroBasedUTF16LocToOneBasedUTF8Loc( Index inLine, Index inCol, Index& outLine, Index& outCol) { diff --git a/source/slang/slang-workspace-version.h b/source/slang/slang-workspace-version.h index 44ab6b43c..d6cbfe6c5 100644 --- a/source/slang/slang-workspace-version.h +++ b/source/slang/slang-workspace-version.h @@ -20,7 +20,8 @@ namespace Slang String path; String text; List<UnownedStringSlice> lines; - List<List<Index>> utf16CharStarts; + List<List<Index>> mapUTF16CharIndexToCodePointIndex; + List<List<Index>> mapCodePointIndexToUTF8ByteOffset; public: void setPath(String filePath) { @@ -32,10 +33,14 @@ namespace Slang const String& getText() { return text; } void setText(const String& newText); + void ensureUTFBoundsAvailable(); ArrayView<Index> getUTF16Boundaries(Index line); + ArrayView<Index> getUTF8Boundaries(Index line); void oneBasedUTF8LocToZeroBasedUTF16Loc( Index inLine, Index inCol, Index& outLine, Index& outCol); + void oneBasedUTF8LocToZeroBasedUTF16Loc( + Index inLine, Index inCol, int& outLine, int& outCol); void zeroBasedUTF16LocToOneBasedUTF8Loc( Index inLine, Index inCol, Index& outLine, Index& outCol); @@ -60,7 +65,11 @@ namespace Slang return -1; Index lineStart = lineIndex >= 1 ? getLineStart(lines[lineIndex - 1]) : 0; - return lineStart + colIndex - 1; + auto boundaries = getUTF8Boundaries(lineIndex); + Index byteOffset = 0; + if (colIndex > 0 && colIndex <= boundaries.getCount()) + byteOffset = boundaries[colIndex - 1]; + return lineStart + byteOffset; } // Get 1-based, utf-8 encoding location from offset. @@ -81,6 +90,8 @@ namespace Slang { col = Index(offset - getLineStart(lines[line-1])) + 1; } + if (line > 0 && line <= lines.getCount()) + col = UTF8Util::calcCodePointCount(lines[line-1].head(col)); } // Get line from 1-based index. |
