summary refs log tree commit diff stats
path: root/source/core/slang-token-reader.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'source/core/slang-token-reader.cpp')
-rw-r--r-- source/core/slang-token-reader.cpp 768
1 files changed, 768 insertions, 0 deletions
diff --git a/source/core/slang-token-reader.cpp b/source/core/slang-token-reader.cpp
new file mode 100644
index 000000000..a15dcda9c
--- /dev/null
+++ b/source/core/slang-token-reader.cpp
@@ -0,0 +1,768 @@
+#include "slang-token-reader.h"
+
+namespace Slang
+{
+ enum class TokenizeErrorType
+ {
+ InvalidCharacter, InvalidEscapeSequence
+ };
+
+ /// States of the hand-written lexer loop in TokenizeText.
+ enum class State
+ {
+     Start,          // between tokens
+     Identifier,
+     Operator,       // collecting a run of punctuation
+     Int,
+     Hex,
+     Fixed,          // digits after a decimal point
+     Double,         // digits of an exponent
+     Char,
+     String,
+     MultiComment,   // inside /* ... */
+     SingleComment   // inside // ... up to end of line
+ };
+
+ /// Pending directive that changes how TokenizeText treats the next literal:
+ /// `Line` makes the next integer literal reset the line counter, `File` makes
+ /// the next string literal replace the current file name.
+ enum class LexDerivative
+ {
+     None,
+     Line,
+     File
+ };
+
+ /// Returns true for ASCII letters and for the underscore.
+ inline bool IsLetter(char ch)
+ {
+     if (ch == '_')
+         return true;
+     const bool isLower = ('a' <= ch && ch <= 'z');
+     const bool isUpper = ('A' <= ch && ch <= 'Z');
+     return isLower || isUpper;
+ }
+
+ /// Returns true for the ASCII decimal digits '0' through '9'.
+ inline bool IsDigit(char ch)
+ {
+     const bool outsideRange = (ch < '0' || ch > '9');
+     return !outsideRange;
+ }
+
+ /// Returns true when `ch` is one of the characters that can appear in an
+ /// operator or delimiter token.
+ inline bool IsPunctuation(char ch)
+ {
+     switch (ch)
+     {
+     case '+': case '-': case '*': case '/': case '%':
+     case '!': case '^': case '&': case '(': case ')':
+     case '=': case '{': case '}': case '[': case ']':
+     case '|': case ';': case ',': case '.': case '<':
+     case '>': case '~': case '@': case ':': case '?':
+     case '#':
+         return true;
+     default:
+         return false;
+     }
+ }
+
+ /// Returns true for space, horizontal tab, line feed, carriage return and
+ /// vertical tab.
+ inline bool IsWhiteSpace(char ch)
+ {
+     switch (ch)
+     {
+     case ' ':
+     case '\t':
+     case '\n':
+     case '\r':
+     case '\v':
+         return true;
+     default:
+         return false;
+     }
+ }
+
+ /// Splits a run of punctuation characters into individual operator tokens.
+ ///
+ /// At each position the longest operator that matches (up to three characters,
+ /// e.g. `>>=` before `>>` before `>`) is appended to `tokens`.
+ ///
+ /// @param str        The run of punctuation characters to split.
+ /// @param tokens     Receives the resulting tokens, in order.
+ /// @param tokenFlags Flags given to the first emitted token; reset to 0 after every
+ ///                   emitted token so later tokens of the run carry no flags.
+ /// @param line       Line value forwarded unchanged to every Token.
+ /// @param col        Column of the start of the run; the offset within the run is added.
+ /// @param startPos   Position of the start of the run; the offset within the run is added.
+ /// @param fileName   File name forwarded unchanged to every Token.
+ void ParseOperators(const String & str, List<Token> & tokens, TokenFlags& tokenFlags, int line, int col, int startPos, String fileName)
+ {
+     const Index length = str.getLength();
+     Index pos = 0;
+
+     // Emits one token at the current offset and then steps past its text.
+     // Advancing here (rather than in every case) keeps the consumed length
+     // and the token text from ever disagreeing.
+     auto InsertToken = [&](TokenType type, const String & ct)
+     {
+         tokens.add(Token(type, ct, line, int(col + pos), int(pos + startPos), fileName, tokenFlags));
+         tokenFlags = 0;
+         pos += ct.getLength();
+     };
+
+     while (pos < length)
+     {
+         const char curChar = str[pos];
+         // One and two characters of look-ahead; '\0' when past the end of the run.
+         const char nextChar = (pos + 1 < length) ? str[pos + 1] : '\0';
+         const char nextNextChar = (pos + 2 < length) ? str[pos + 2] : '\0';
+
+         switch (curChar)
+         {
+         case '+':
+             if (nextChar == '+')
+                 InsertToken(TokenType::OpInc, "++");
+             else if (nextChar == '=')
+                 InsertToken(TokenType::OpAddAssign, "+=");
+             else
+                 InsertToken(TokenType::OpAdd, "+");
+             break;
+         case '-':
+             if (nextChar == '-')
+                 InsertToken(TokenType::OpDec, "--");
+             else if (nextChar == '=')
+                 InsertToken(TokenType::OpSubAssign, "-=");
+             else if (nextChar == '>')
+                 InsertToken(TokenType::RightArrow, "->");
+             else
+                 InsertToken(TokenType::OpSub, "-");
+             break;
+         case '*':
+             if (nextChar == '=')
+                 InsertToken(TokenType::OpMulAssign, "*=");
+             else
+                 InsertToken(TokenType::OpMul, "*");
+             break;
+         case '/':
+             if (nextChar == '=')
+                 InsertToken(TokenType::OpDivAssign, "/=");
+             else
+                 InsertToken(TokenType::OpDiv, "/");
+             break;
+         case '%':
+             if (nextChar == '=')
+                 InsertToken(TokenType::OpModAssign, "%=");
+             else
+                 InsertToken(TokenType::OpMod, "%");
+             break;
+         case '|':
+             if (nextChar == '|')
+                 InsertToken(TokenType::OpOr, "||");
+             else if (nextChar == '=')
+                 InsertToken(TokenType::OpOrAssign, "|=");
+             else
+                 InsertToken(TokenType::OpBitOr, "|");
+             break;
+         case '&':
+             if (nextChar == '&')
+                 InsertToken(TokenType::OpAnd, "&&");
+             else if (nextChar == '=')
+                 InsertToken(TokenType::OpAndAssign, "&=");
+             else
+                 InsertToken(TokenType::OpBitAnd, "&");
+             break;
+         case '^':
+             if (nextChar == '=')
+                 InsertToken(TokenType::OpXorAssign, "^=");
+             else
+                 InsertToken(TokenType::OpBitXor, "^");
+             break;
+         case '>':
+             if (nextChar == '>')
+             {
+                 if (nextNextChar == '=')
+                     InsertToken(TokenType::OpShrAssign, ">>=");
+                 else
+                     InsertToken(TokenType::OpRsh, ">>");
+             }
+             else if (nextChar == '=')
+                 InsertToken(TokenType::OpGeq, ">=");
+             else
+                 InsertToken(TokenType::OpGreater, ">");
+             break;
+         case '<':
+             if (nextChar == '<')
+             {
+                 if (nextNextChar == '=')
+                     InsertToken(TokenType::OpShlAssign, "<<=");
+                 else
+                     InsertToken(TokenType::OpLsh, "<<");
+             }
+             else if (nextChar == '=')
+                 InsertToken(TokenType::OpLeq, "<=");
+             else
+                 InsertToken(TokenType::OpLess, "<");
+             break;
+         case '=':
+             if (nextChar == '=')
+                 InsertToken(TokenType::OpEql, "==");
+             else
+                 InsertToken(TokenType::OpAssign, "=");
+             break;
+         case '!':
+             if (nextChar == '=')
+                 InsertToken(TokenType::OpNeq, "!=");
+             else
+                 InsertToken(TokenType::OpNot, "!");
+             break;
+         case '?':
+             InsertToken(TokenType::QuestionMark, "?");
+             break;
+         case '@':
+             InsertToken(TokenType::At, "@");
+             break;
+         case '#':
+             // Consumes exactly the characters of the token. The previous code
+             // advanced one extra position here and so dropped the character
+             // that followed every '#' or '##'.
+             if (nextChar == '#')
+                 InsertToken(TokenType::PoundPound, "##");
+             else
+                 InsertToken(TokenType::Pound, "#");
+             break;
+         case ':':
+             InsertToken(TokenType::Colon, ":");
+             break;
+         case '~':
+             InsertToken(TokenType::OpBitNot, "~");
+             break;
+         case ';':
+             InsertToken(TokenType::Semicolon, ";");
+             break;
+         case ',':
+             InsertToken(TokenType::Comma, ",");
+             break;
+         case '.':
+             InsertToken(TokenType::Dot, ".");
+             break;
+         case '{':
+             InsertToken(TokenType::LBrace, "{");
+             break;
+         case '}':
+             InsertToken(TokenType::RBrace, "}");
+             break;
+         case '[':
+             InsertToken(TokenType::LBracket, "[");
+             break;
+         case ']':
+             InsertToken(TokenType::RBracket, "]");
+             break;
+         case '(':
+             InsertToken(TokenType::LParent, "(");
+             break;
+         case ')':
+             InsertToken(TokenType::RParent, ")");
+             break;
+         default:
+             // Not an operator character. Skip it so the loop always makes
+             // progress instead of spinning forever on unexpected input.
+             pos++;
+             break;
+         }
+     }
+ }
+
+ /// Splits `text` into tokens using a character-driven state machine.
+ ///
+ /// Whitespace and comments produce no tokens; instead they set
+ /// TokenFlag::AfterWhitespace / TokenFlag::AtStartOfLine on the next token.
+ /// Runs of punctuation are handed to ParseOperators. Escape sequences inside
+ /// string and character literals are decoded into the token text.
+ /// A string or character literal that is still open at the end of `text`
+ /// produces no token.
+ ///
+ /// @param fileName File name stored on every token.
+ /// @param text     The text to tokenize.
+ /// @return The tokens in the order they appear in `text`.
+ List<Token> TokenizeText(const String & fileName, const String & text)
+ {
+     Index lastPos = 0, pos = 0;
+     int line = 1, col = 0;
+     String file = fileName;
+     State state = State::Start;
+     StringBuilder tokenBuilder;
+     // Start position of the token being built. Initialized so no path can
+     // hand an indeterminate value to the Token constructor.
+     int tokenLine = 0, tokenCol = 0;
+     List<Token> tokenList;
+     LexDerivative derivative = LexDerivative::None;
+     TokenFlags tokenFlags = TokenFlag::AtStartOfLine;
+     // Emits the text collected in tokenBuilder as one token and resets the
+     // per-token state (flags, builder, pending directive).
+     auto InsertToken = [&](TokenType type)
+     {
+         derivative = LexDerivative::None;
+         tokenList.add(Token(type, tokenBuilder.ToString(), tokenLine, tokenCol, int(pos), file, tokenFlags));
+         tokenFlags = 0;
+         tokenBuilder.Clear();
+     };
+     // Appends the character denoted by the escape `\<nextChar>`.
+     // Unrecognized escapes append nothing.
+     auto ProcessTransferChar = [&](char nextChar)
+     {
+         switch (nextChar)
+         {
+         case '\\':
+         case '\"':
+         case '\'':
+             tokenBuilder.Append(nextChar);
+             break;
+         case 't':
+             tokenBuilder.Append('\t');
+             break;
+         case 's':
+             tokenBuilder.Append(' ');
+             break;
+         case 'n':
+             tokenBuilder.Append('\n');
+             break;
+         case 'r':
+             tokenBuilder.Append('\r');
+             break;
+         case 'b':
+             tokenBuilder.Append('\b');
+             break;
+         case 'v':
+             // EscapeStringLiteral writes "\v", so it has to be readable here too.
+             tokenBuilder.Append('\v');
+             break;
+         }
+     };
+     // The loop runs one step past the end of the text with a synthetic space
+     // so that a token still being collected at end of input gets flushed.
+     while (pos <= text.getLength())
+     {
+         char curChar = (pos < text.getLength() ? text[pos] : ' ');
+         char nextChar = (pos < text.getLength() - 1) ? text[pos + 1] : '\0';
+         // Update line/column only when the position actually moved; several
+         // states re-examine the same character after a state change.
+         if (lastPos != pos)
+         {
+             if (curChar == '\n')
+             {
+                 line++;
+                 col = 0;
+             }
+             else
+                 col++;
+             lastPos = pos;
+         }
+
+         switch (state)
+         {
+         case State::Start:
+             if (IsLetter(curChar))
+             {
+                 state = State::Identifier;
+                 tokenLine = line;
+                 tokenCol = col;
+             }
+             else if (IsDigit(curChar))
+             {
+                 state = State::Int;
+                 tokenLine = line;
+                 tokenCol = col;
+             }
+             else if (curChar == '\'')
+             {
+                 state = State::Char;
+                 pos++;
+                 tokenLine = line;
+                 tokenCol = col;
+             }
+             else if (curChar == '"')
+             {
+                 state = State::String;
+                 pos++;
+                 tokenLine = line;
+                 tokenCol = col;
+             }
+             else if (curChar == '\r' || curChar == '\n')
+             {
+                 tokenFlags |= TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
+                 pos++;
+             }
+             // 0xC2 0xA0 are the two bytes of a UTF-8 non-breaking space. Compare as
+             // unsigned bytes so the test also works where plain `char` is unsigned.
+             else if (curChar == ' ' || curChar == '\t' ||
+                      static_cast<unsigned char>(curChar) == 0xC2 ||
+                      static_cast<unsigned char>(curChar) == 0xA0)
+             {
+                 tokenFlags |= TokenFlag::AfterWhitespace;
+                 pos++;
+             }
+             else if (curChar == '/' && nextChar == '/')
+             {
+                 state = State::SingleComment;
+                 pos += 2;
+             }
+             else if (curChar == '/' && nextChar == '*')
+             {
+                 pos += 2;
+                 state = State::MultiComment;
+             }
+             else if (curChar == '.' && IsDigit(nextChar))
+             {
+                 // A literal such as ".5" is normalized to "0.5".
+                 tokenBuilder.Append("0.");
+                 state = State::Fixed;
+                 // Record where the literal starts, as every other token kind does.
+                 tokenLine = line;
+                 tokenCol = col;
+                 pos++;
+             }
+             else if (IsPunctuation(curChar))
+             {
+                 state = State::Operator;
+                 tokenLine = line;
+                 tokenCol = col;
+             }
+             else
+             {
+                 // Any other character is skipped without producing a token.
+                 pos++;
+             }
+             break;
+         case State::Identifier:
+             if (IsLetter(curChar) || IsDigit(curChar))
+             {
+                 tokenBuilder.Append(curChar);
+                 pos++;
+             }
+             else
+             {
+                 auto tokenStr = tokenBuilder.ToString();
+#if 0
+                 if (tokenStr == "#line_reset#")
+                 {
+                     line = 0;
+                     col = 0;
+                     tokenBuilder.Clear();
+                 }
+                 else if (tokenStr == "#line")
+                 {
+                     derivative = LexDerivative::Line;
+                     tokenBuilder.Clear();
+                 }
+                 else if (tokenStr == "#file")
+                 {
+                     derivative = LexDerivative::File;
+                     tokenBuilder.Clear();
+                     line = 0;
+                     col = 0;
+                 }
+                 else
+#endif
+                 InsertToken(TokenType::Identifier);
+                 state = State::Start;
+             }
+             break;
+         case State::Operator:
+             // Keep collecting punctuation, but stop in front of a comment opener
+             // so that "//" and "/*" are handled by the Start state.
+             if (IsPunctuation(curChar) && !((curChar == '/' && nextChar == '/') || (curChar == '/' && nextChar == '*')))
+             {
+                 tokenBuilder.Append(curChar);
+                 pos++;
+             }
+             else
+             {
+                 //do token analyze
+                 ParseOperators(tokenBuilder.ToString(), tokenList, tokenFlags, tokenLine, tokenCol, (int)(pos - tokenBuilder.getLength()), file);
+                 tokenBuilder.Clear();
+                 state = State::Start;
+             }
+             break;
+         case State::Int:
+             if (IsDigit(curChar))
+             {
+                 tokenBuilder.Append(curChar);
+                 pos++;
+             }
+             else if (curChar == '.')
+             {
+                 state = State::Fixed;
+                 tokenBuilder.Append(curChar);
+                 pos++;
+             }
+             else if (curChar == 'e' || curChar == 'E')
+             {
+                 state = State::Double;
+                 tokenBuilder.Append(curChar);
+                 // An optional sign directly after the exponent marker belongs to the literal.
+                 if (nextChar == '-' || nextChar == '+')
+                 {
+                     tokenBuilder.Append(nextChar);
+                     pos++;
+                 }
+                 pos++;
+             }
+             else if (curChar == 'x')
+             {
+                 state = State::Hex;
+                 tokenBuilder.Append(curChar);
+                 pos++;
+             }
+             else if (curChar == 'u')
+             {
+                 // The 'u' suffix is kept as part of the literal text.
+                 pos++;
+                 tokenBuilder.Append(curChar);
+                 InsertToken(TokenType::IntLiteral);
+                 state = State::Start;
+             }
+             else
+             {
+                 if (derivative == LexDerivative::Line)
+                 {
+                     derivative = LexDerivative::None;
+                     line = StringToInt(tokenBuilder.ToString()) - 1;
+                     col = 0;
+                     tokenBuilder.Clear();
+                 }
+                 else
+                 {
+                     InsertToken(TokenType::IntLiteral);
+                 }
+                 state = State::Start;
+             }
+             break;
+         case State::Hex:
+             if (IsDigit(curChar) || (curChar >= 'a' && curChar <= 'f') || (curChar >= 'A' && curChar <= 'F'))
+             {
+                 tokenBuilder.Append(curChar);
+                 pos++;
+             }
+             else
+             {
+                 InsertToken(TokenType::IntLiteral);
+                 state = State::Start;
+             }
+             break;
+         case State::Fixed:
+             if (IsDigit(curChar))
+             {
+                 tokenBuilder.Append(curChar);
+                 pos++;
+             }
+             else if (curChar == 'e' || curChar == 'E')
+             {
+                 state = State::Double;
+                 tokenBuilder.Append(curChar);
+                 if (nextChar == '-' || nextChar == '+')
+                 {
+                     tokenBuilder.Append(nextChar);
+                     pos++;
+                 }
+                 pos++;
+             }
+             else
+             {
+                 // A trailing 'f' is consumed but not stored in the token text.
+                 if (curChar == 'f')
+                     pos++;
+                 InsertToken(TokenType::DoubleLiteral);
+                 state = State::Start;
+             }
+             break;
+         case State::Double:
+             if (IsDigit(curChar))
+             {
+                 tokenBuilder.Append(curChar);
+                 pos++;
+             }
+             else
+             {
+                 if (curChar == 'f')
+                     pos++;
+                 InsertToken(TokenType::DoubleLiteral);
+                 state = State::Start;
+             }
+             break;
+         case State::String:
+             if (curChar != '"')
+             {
+                 if (curChar == '\\')
+                 {
+                     ProcessTransferChar(nextChar);
+                     pos++;
+                 }
+                 else
+                     tokenBuilder.Append(curChar);
+             }
+             else
+             {
+                 if (derivative == LexDerivative::File)
+                 {
+                     derivative = LexDerivative::None;
+                     file = tokenBuilder.ToString();
+                     tokenBuilder.Clear();
+                 }
+                 else
+                 {
+                     InsertToken(TokenType::StringLiteral);
+                 }
+                 state = State::Start;
+             }
+             pos++;
+             break;
+         case State::Char:
+             if (curChar != '\'')
+             {
+                 if (curChar == '\\')
+                 {
+                     ProcessTransferChar(nextChar);
+                     pos++;
+                 }
+                 else
+                     tokenBuilder.Append(curChar);
+             }
+             else
+             {
+                 InsertToken(TokenType::CharLiteral);
+                 state = State::Start;
+             }
+             pos++;
+             break;
+         case State::SingleComment:
+             if (curChar == '\n')
+             {
+                 state = State::Start;
+                 tokenFlags |= TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
+             }
+             pos++;
+             break;
+         case State::MultiComment:
+             if (curChar == '*' && nextChar == '/')
+             {
+                 state = State::Start;
+                 tokenFlags |= TokenFlag::AfterWhitespace;
+                 pos += 2;
+             }
+             else
+                 pos++;
+             break;
+         }
+     }
+     return tokenList;
+ }
+ /// Convenience overload: tokenizes `text` with an empty file name.
+ List<Token> TokenizeText(const String & text)
+ {
+     const String emptyFileName("");
+     return TokenizeText(emptyFileName, text);
+ }
+
+ /// Returns `str` wrapped in double quotes, with space, newline, carriage
+ /// return, tab, vertical tab, both quote characters and the backslash each
+ /// replaced by a two-character backslash escape. A space becomes `\s`.
+ /// All other characters are copied unchanged.
+ String EscapeStringLiteral(String str)
+ {
+     StringBuilder sb;
+     sb << "\"";
+
+     const char* cursor = str.getBuffer();
+     const char* const end = cursor + str.getLength();
+     for (; cursor != end; ++cursor)
+     {
+         const char ch = *cursor;
+
+         // Escape text for `ch`, or null when the character is copied as-is.
+         const char* escaped = nullptr;
+         switch (ch)
+         {
+         case ' ':  escaped = "\\s";  break;
+         case '\n': escaped = "\\n";  break;
+         case '\r': escaped = "\\r";  break;
+         case '\t': escaped = "\\t";  break;
+         case '\v': escaped = "\\v";  break;
+         case '\'': escaped = "\\\'"; break;
+         case '\"': escaped = "\\\""; break;
+         case '\\': escaped = "\\\\"; break;
+         default:   break;
+         }
+
+         if (escaped)
+             sb << escaped;
+         else
+             sb << ch;
+     }
+
+     sb << "\"";
+     return sb.ProduceString();
+ }
+
+ /// Decodes the backslash escapes `\s \t \n \r \v \' \" \\` in `str`.
+ ///
+ /// A backslash followed by any other character, and a backslash that is the
+ /// last character of `str`, are copied to the result unchanged. Surrounding
+ /// quote characters are not treated specially.
+ ///
+ /// @param str Text that may contain escape sequences.
+ /// @return The text with recognized escape sequences replaced.
+ String UnescapeStringLiteral(String str)
+ {
+     StringBuilder sb;
+     const Index length = str.getLength();
+     const char*const data = str.getBuffer();
+     for (Index i = 0; i < length; i++)
+     {
+         if (data[i] == '\\' && i < length - 1)
+         {
+             switch (data[i + 1])
+             {
+             case 's':
+                 sb << " ";
+                 break;
+             case 't':
+                 sb << '\t';
+                 break;
+             case 'n':
+                 sb << '\n';
+                 break;
+             case 'r':
+                 sb << '\r';
+                 break;
+             case 'v':
+                 sb << '\v';
+                 break;
+             case '\'':
+                 sb << '\'';
+                 break;
+             case '\"':
+                 sb << "\"";
+                 break;
+             case '\\':
+                 sb << "\\";
+                 break;
+             default:
+                 // Unknown escape: keep the backslash and let the next iteration
+                 // copy the following character. The previous code emitted the
+                 // character *before* the backslash instead, which duplicated it
+                 // and read data[-1] when the backslash was the first character.
+                 sb << data[i];
+                 continue;
+             }
+             // Step over the character that was consumed as part of the escape.
+             i++;
+         }
+         else
+             sb << data[i];
+     }
+     return sb.ProduceString();
+ }
+
+ /// Tokenizes `text` (with an empty file name) and positions the reader at
+ /// the first token.
+ TokenReader::TokenReader(String text)
+ {
+     tokenPtr = 0;
+     tokens = TokenizeText(String(""), text);
+ }
+}