summaryrefslogtreecommitdiff
path: root/source/compiler-core/slang-lexer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'source/compiler-core/slang-lexer.cpp')
-rw-r--r--source/compiler-core/slang-lexer.cpp40
1 files changed, 28 insertions, 12 deletions
diff --git a/source/compiler-core/slang-lexer.cpp b/source/compiler-core/slang-lexer.cpp
index 7d84ed938..366af9114 100644
--- a/source/compiler-core/slang-lexer.cpp
+++ b/source/compiler-core/slang-lexer.cpp
@@ -5,9 +5,9 @@
// input bytes and turning it into semantically useful tokens.
//
+#include "core/slang-char-encode.h"
#include "slang-name.h"
#include "slang-source-loc.h"
-
#include "slang-core-diagnostics.h"
namespace Slang
@@ -205,7 +205,6 @@ namespace Slang
c = e;
continue;
}
-
default:
break;
}
@@ -214,8 +213,12 @@ namespace Slang
// some newlines
break;
}
- // TODO: handle UTF-8 encoding for non-ASCII code points here
-
+ if (isUtf8LeadingByte((Byte)c))
+ {
+ // Rewind to the leading byte and decode the whole multi-byte UTF-8 sequence into one code point.
+ pos--;
+ c = getUnicodePointFromUTF8([&]() {return lexer->m_cursor[pos++]; });
+ }
// Default case is to just hand along the byte we read as an ASCII code point.
} while (offset--);
@@ -262,7 +265,12 @@ namespace Slang
}
}
- // TODO: Need to handle non-ASCII code points.
+ // Decode a multi-byte UTF-8 sequence into one code point, advancing the cursor past all of its bytes.
+ if (isUtf8LeadingByte((Byte)c))
+ {
+ lexer->m_cursor--;
+ c = getUnicodePointFromUTF8([&]() {return *lexer->m_cursor++; });
+ }
// Default case is to return the raw byte we saw.
return c;
@@ -340,6 +348,11 @@ namespace Slang
}
}
+ static bool isNonAsciiCodePoint(unsigned int codePoint) // True for values >= 0x80, except 0xFFFFFFFF.
+ {
+ return codePoint != 0xFFFFFFFF && codePoint >= 0x80; // NOTE(review): 0xFFFFFFFF is presumably an end-of-input sentinel (-1) after the caller's cast to unsigned int -- confirm against _peek().
+ }
+
static void _lexIdentifier(Lexer* lexer)
{
for(;;)
@@ -348,12 +361,12 @@ namespace Slang
if(('a' <= c ) && (c <= 'z')
|| ('A' <= c) && (c <= 'Z')
|| ('0' <= c) && (c <= '9')
- || (c == '_'))
+ || (c == '_')
+ || isNonAsciiCodePoint((unsigned int)c))
{
_advance(lexer);
continue;
}
-
return;
}
}
@@ -1052,7 +1065,8 @@ namespace Slang
static TokenType _lexTokenImpl(Lexer* lexer)
{
- switch(_peek(lexer))
+ int nextCodePoint = _peek(lexer);
+ switch(nextCodePoint)
{
default:
break;
@@ -1358,10 +1372,12 @@ namespace Slang
}
- // TODO(tfoley): If we ever wanted to support proper Unicode
- // in identifiers, etc., then this would be the right place
- // to perform a more expensive dispatch based on the actual
- // code point (and not just the first byte).
+ // We treat every non-ASCII code point (>= 0x80) as part of an identifier.
+ if (isNonAsciiCodePoint(nextCodePoint))
+ {
+ _lexIdentifier(lexer);
+ return TokenType::Identifier;
+ }
{
// If none of the above cases matched, then we have an