summaryrefslogtreecommitdiff
path: root/source/compiler-core/slang-json-lexer.cpp
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2021-05-25 20:58:43 -0400
committerGitHub <noreply@github.com>2021-05-25 20:58:43 -0400
commit7d1b8ac13faf80ed56b37243480d097059da5aab (patch)
tree6613b13983083d16b8945c6d92b1f4f1d1fb2501 /source/compiler-core/slang-json-lexer.cpp
parent89f67d9c626fa193dba4adafcb54e46b13aa5e98 (diff)
JSON Lexing and string encoding/decoding (#1858)
* #include an absolute path didn't work - because paths were taken to always be relative. * WIP Json lexer. * Check JSON Lex with unit test * Add JSON escaping/unescaping of strings. * Big fix encoding/decoding. * Fix typo in JSON diagnostics. * Fix typo. * Better float testing.
Diffstat (limited to 'source/compiler-core/slang-json-lexer.cpp')
-rw-r--r--source/compiler-core/slang-json-lexer.cpp385
1 files changed, 385 insertions, 0 deletions
diff --git a/source/compiler-core/slang-json-lexer.cpp b/source/compiler-core/slang-json-lexer.cpp
new file mode 100644
index 000000000..19a5b29a7
--- /dev/null
+++ b/source/compiler-core/slang-json-lexer.cpp
@@ -0,0 +1,385 @@
+// slang-json-lexer.cpp
+#include "slang-json-lexer.h"
+
+#include "slang-json-diagnostics.h"
+#include "../core/slang-char-util.h"
+
+/*
+https://www.json.org/json-en.html
+*/
+
+namespace Slang {
+
+/// Binds the lexer to `sourceView`, routes diagnostics to `sink`, and lexes the
+/// first token so that a current token is available as soon as init returns.
+///
+/// @param sourceView View whose source file supplies the text to lex.
+/// @param sink       Receiver for diagnostics raised while lexing.
+/// @return Always SLANG_OK.
+SlangResult JSONLexer::init(SourceView* sourceView, DiagnosticSink* sink)
+{
+    SourceFile* const file = sourceView->getSourceFile();
+
+    // The lexer scans until it sees a 0 byte, so the content must be present
+    // and null terminated.
+    SLANG_ASSERT(file && file->hasContent());
+
+    m_sink = sink;
+    m_sourceView = sourceView;
+    m_startLoc = sourceView->getRange().begin;
+
+    // Both the current lexeme and the read cursor begin at the first character.
+    m_contentStart = file->getContent().begin();
+    m_lexemeStart = m_contentStart;
+    m_cursor = m_contentStart;
+
+    // Prime the first token.
+    advance();
+
+    return SLANG_OK;
+}
+
+/// Given an end-of-line character `c` that has just been consumed and `cursor`
+/// pointing at the character after it, returns the position of the start of the
+/// next line. A two character line ending ("\r\n" or "\n\r") is consumed as a
+/// single unit; otherwise `cursor` is returned unchanged.
+SLANG_FORCE_INLINE static const char* _handleEndOfLine(char c, const char* cursor)
+{
+    SLANG_ASSERT(c == '\n' || c == '\r');
+
+    // XOR-ing the two characters equals ('\n' ^ '\r') exactly when they are the
+    // two *different* end-of-line characters.
+    const char following = *cursor;
+    const bool isTwoCharLineEnd = (c ^ following) == ('\n' ^ '\r');
+    return cursor + Index(isTwoCharLineEnd);
+}
+
+/// Records an Invalid token positioned at the start of the current lexeme and
+/// returns whatever `_setToken` returns for it.
+JSONTokenType JSONLexer::_setInvalidToken()
+{
+    const char* const lexemeStart = m_lexemeStart;
+    return _setToken(JSONTokenType::Invalid, lexemeStart);
+}
+
+/// Lexes the next token starting at the current cursor.
+///
+/// Whitespace and `//` / `/* */` comments are skipped. On malformed input a
+/// diagnostic is sent to the sink and an Invalid token is set.
+///
+/// @return The token type passed through `_setToken` / `_setInvalidToken`.
+JSONTokenType JSONLexer::advance()
+{
+    // Reports `badChar` as an unexpected character and sets an Invalid token.
+    // Printable ASCII is reported verbatim, anything else as a two digit hex
+    // value ("0xNN").
+    const auto reportUnexpectedCharacter = [this](char badChar) -> JSONTokenType
+    {
+        StringBuilder buf;
+
+        // Go via unsigned char so the test and the nibble extraction do not
+        // depend on whether plain `char` is signed.
+        const unsigned char byte = static_cast<unsigned char>(badChar);
+        if (byte <= ' ' || byte >= 0x7e)
+        {
+            static const char s_hex[] = "0123456789abcdef";
+
+            char hexBuf[5] = "0x";
+            hexBuf[2] = s_hex[(byte >> 4) & 0xf];
+            hexBuf[3] = s_hex[byte & 0xf];
+            hexBuf[4] = 0;
+
+            buf << hexBuf;
+        }
+        else
+        {
+            buf << badChar;
+        }
+
+        // Forward the formatted character so the diagnostic can display it.
+        m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::unexpectedCharacter, buf);
+        return _setInvalidToken();
+    };
+
+    const char* cursor = m_cursor;
+
+    while (true)
+    {
+        // Each iteration starts a fresh lexeme; whitespace and comments `break`
+        // out of the switch and loop around to try again.
+        m_lexemeStart = cursor;
+
+        const char c = *cursor++;
+
+        switch (c)
+        {
+        // Do not consume the terminator, so repeated calls keep returning EndOfFile.
+        case 0: return _setToken(JSONTokenType::EndOfFile, cursor - 1);
+        case '"':
+            {
+                cursor = _lexString(cursor);
+                if (cursor == nullptr)
+                {
+                    return _setInvalidToken();
+                }
+                return _setToken(JSONTokenType::StringLiteral, cursor);
+            }
+        case '/':
+            {
+                // We allow comments. Peek at the character following the '/'
+                // using the local cursor (m_cursor is stale inside this loop).
+                const char nextChar = *cursor;
+
+                if (nextChar == '/')
+                {
+                    // Line comment - skip the second '/' and lex to end of line
+                    cursor = _lexLineComment(cursor + 1);
+                    break;
+                }
+                if (nextChar == '*')
+                {
+                    // Skip the '*' so that "/*/" is not mistaken for a whole comment
+                    cursor = _lexBlockComment(cursor + 1);
+                    // Can fail...
+                    if (cursor == nullptr)
+                    {
+                        return _setInvalidToken();
+                    }
+                    break;
+                }
+
+                // A lone '/' is not part of any JSON token
+                return reportUnexpectedCharacter(c);
+            }
+        case ' ':
+        case '\t':
+        case '\n':
+        case '\r':
+            {
+                cursor = _lexWhitespace(cursor);
+                break;
+            }
+        case ':': return _setToken(JSONTokenType::Colon, cursor);
+        case ',': return _setToken(JSONTokenType::Comma, cursor);
+        case '[': return _setToken(JSONTokenType::LBracket, cursor);
+        case ']': return _setToken(JSONTokenType::RBracket, cursor);
+        case '{': return _setToken(JSONTokenType::LBrace, cursor);
+        case '}': return _setToken(JSONTokenType::RBrace, cursor);
+
+        case '-':
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+            {
+                // Hand the number lexer the first character of the lexeme
+                LexResult res = _lexNumber(cursor - 1);
+                if (res.cursor == nullptr)
+                {
+                    return _setInvalidToken();
+                }
+                return _setToken(res.type, res.cursor);
+            }
+        case 't':
+            {
+                // The short-circuiting && stops at the first mismatch, so this
+                // never reads beyond the null terminator.
+                if (cursor[0] == 'r' && cursor[1] == 'u' && cursor[2] == 'e')
+                {
+                    return _setToken(JSONTokenType::True, cursor + 3);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+        case 'f':
+            {
+                if (cursor[0] == 'a' && cursor[1] == 'l' && cursor[2] == 's' && cursor[3] == 'e')
+                {
+                    return _setToken(JSONTokenType::False, cursor + 4);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+        case 'n':
+            {
+                if (cursor[0] == 'u' && cursor[1] == 'l' && cursor[2] == 'l')
+                {
+                    return _setToken(JSONTokenType::Null, cursor + 3);
+                }
+                m_sink->diagnose(_getLoc(m_lexemeStart), JSONDiagnostics::expectingValueName);
+                return _setInvalidToken();
+            }
+        default:
+            {
+                return reportUnexpectedCharacter(c);
+            }
+        }
+    }
+}
+
+/// Lexes a JSON number: an optional '-', an integer part (a single '0', or a
+/// non-zero digit followed by any digits), an optional fraction and an optional
+/// exponent.
+///
+/// @param cursor Points at the first character of the number ('-' or a digit).
+/// @return The token type (IntegerLiteral, or FloatLiteral if a fraction or
+///         exponent is present) and the position one past the number. On error
+///         a diagnostic is emitted and the result is { Invalid, nullptr }.
+JSONLexer::LexResult JSONLexer::_lexNumber(const char* cursor)
+{
+    JSONTokenType tokenType = JSONTokenType::IntegerLiteral;
+
+    // Optional leading minus
+    if (*cursor == '-')
+    {
+        cursor++;
+    }
+
+    if (*cursor == '0')
+    {
+        // Can only be followed by '.', an exponent, or nothing
+        cursor++;
+    }
+    else if (*cursor >= '1' && *cursor <= '9')
+    {
+        cursor++;
+        while (CharUtil::isDigit(*cursor))
+        {
+            cursor++;
+        }
+    }
+    else
+    {
+        // The integer part is mandatory: a '-' on its own (or followed by a
+        // non-digit) is not a number.
+        m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+        return LexResult{ JSONTokenType::Invalid, nullptr };
+    }
+
+    // There's a fraction
+    if (*cursor == '.')
+    {
+        tokenType = JSONTokenType::FloatLiteral;
+        // Skip the dot
+        cursor++;
+        // Must have at least one digit
+        if (!CharUtil::isDigit(*cursor))
+        {
+            m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+            return LexResult{ JSONTokenType::Invalid, nullptr };
+        }
+        // Skip the digit
+        cursor++;
+        // Skip any more digits
+        while (CharUtil::isDigit(*cursor)) cursor++;
+    }
+
+    // There's an exponent
+    if (*cursor == 'e' || *cursor == 'E')
+    {
+        tokenType = JSONTokenType::FloatLiteral;
+
+        // Skip the 'e'/'E'
+        cursor++;
+
+        // Skip +/- if has one
+        if (*cursor == '+' || *cursor == '-')
+        {
+            cursor++;
+        }
+
+        // Must have one digit
+        if (!CharUtil::isDigit(*cursor))
+        {
+            m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingADigit);
+            return LexResult{ JSONTokenType::Invalid, nullptr };
+        }
+
+        // Skip the digit
+        cursor++;
+        // Skip any more digits
+        while (CharUtil::isDigit(*cursor)) cursor++;
+    }
+
+    return LexResult{tokenType, cursor};
+}
+
+/// Lexes the remainder of a string literal.
+///
+/// @param cursor Points just past the opening '"'.
+/// @return The position one past the closing '"', or nullptr (after emitting a
+///         diagnostic) if the end of the content is reached first or a `\u`
+///         escape is not followed by four hex digits.
+const char* JSONLexer::_lexString(const char* cursor)
+{
+    for (;;)
+    {
+        const char ch = *cursor++;
+
+        if (ch == 0)
+        {
+            // Ran off the end of the content before the closing quote
+            m_sink->diagnose(_getLoc(cursor - 1), JSONDiagnostics::endOfFileInLiteral);
+            return nullptr;
+        }
+        if (ch == '"')
+        {
+            return cursor;
+        }
+        if (ch != '\\')
+        {
+            // Any other character (including \r and \n) is accepted as-is.
+            continue;
+        }
+
+        // Escape sequence: inspect the character after the backslash.
+        const char escaped = *cursor;
+        if (escaped == 'u')
+        {
+            // Skip the 'u', then require exactly four hex digits
+            ++cursor;
+            for (Index digitIndex = 0; digitIndex < 4; ++digitIndex)
+            {
+                if (!CharUtil::isHexDigit(cursor[digitIndex]))
+                {
+                    m_sink->diagnose(_getLoc(cursor), JSONDiagnostics::expectingAHexDigit);
+                    return nullptr;
+                }
+            }
+            cursor += 4;
+            continue;
+        }
+
+        switch (escaped)
+        {
+        case '"':
+        case '\\':
+        case '/':
+        case 'b':
+        case 'f':
+        case 'n':
+        case 'r':
+        case 't':
+            // Single character escape - consume it so that an escaped quote
+            // does not terminate the literal
+            ++cursor;
+            break;
+        default:
+            // Anything else is left unconsumed and handled by the next iteration
+            break;
+        }
+    }
+}
+
+/// Skips the body of a line comment.
+///
+/// @param cursor Points inside the comment.
+/// @return The start of the following line, or the position of the null
+///         terminator if the content ends first.
+const char* JSONLexer::_lexLineComment(const char* cursor)
+{
+    while (true)
+    {
+        const char ch = *cursor++;
+
+        if (ch == 0)
+        {
+            // Step back so the caller sees the terminator itself
+            return cursor - 1;
+        }
+        if (ch == '\n' || ch == '\r')
+        {
+            // Consume the whole line ending (may be two characters)
+            return _handleEndOfLine(ch, cursor);
+        }
+    }
+}
+
+/// Skips the body of a block comment up to and including the closing "*/".
+///
+/// @param cursor Points inside the comment.
+/// @return The position one past the closing "*/", or nullptr (after emitting
+///         an endOfFileInComment diagnostic) if the content ends first.
+const char* JSONLexer::_lexBlockComment(const char* cursor)
+{
+    for (;;)
+    {
+        const char c = *cursor++;
+        switch (c)
+        {
+        case 0:
+            {
+                // cursor has already stepped past the terminator; report the
+                // location of the terminator itself so it stays within the content.
+                m_sink->diagnose(_getLoc(cursor - 1), JSONDiagnostics::endOfFileInComment);
+                return nullptr;
+            }
+        case '*':
+            {
+                if (*cursor == '/')
+                {
+                    // Skip the closing '/'
+                    return cursor + 1;
+                }
+                break;
+            }
+        default: break;
+        }
+    }
+}
+
+/// Skips a run of whitespace (space, tab, carriage return, line feed).
+///
+/// @param cursor Position to start scanning from.
+/// @return The position of the first non-whitespace character at or after `cursor`.
+const char* JSONLexer::_lexWhitespace(const char* cursor)
+{
+    // Might want to use CharUtil::isWhitespace...
+    for (;; ++cursor)
+    {
+        const char ch = *cursor;
+        const bool isWhite = (ch == ' ') || (ch == '\t') || (ch == '\n') || (ch == '\r');
+        if (!isWhite)
+        {
+            // Hit non white space
+            return cursor;
+        }
+    }
+}
+
+} // namespace Slang