summaryrefslogtreecommitdiffstats
path: root/source/core
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2022-05-27 17:28:05 -0400
committerGitHub <noreply@github.com>2022-05-27 17:28:05 -0400
commit2d3392f22c894957d17dd13486e0565c4ecea89c (patch)
treece4dadbd85a59e52725fa6f92613553cd5b29859 /source/core
parentabb89b3e460e11e8f9a134199c2d559190bfc47e (diff)
Added NativeStringType (#2252)
* #include of an absolute path didn't work, because paths were always taken to be relative. * Use UnownedTerminatedStringSlice for literals in output C++. * Remove the Escape/Unescape functions in slang-token-reader.cpp. * Add target names 'host-cpp' etc. to map to the corresponding target types. * Fix some corner cases around string encoding. * Added unit test for string escaping. Fixed some assorted escaping bugs. * Updated test output. * Added decode test. * Stop using hex output, to get around the 'greedy' way C++ consumes hex digits. Use octal instead.
Diffstat (limited to 'source/core')
-rw-r--r--source/core/slang-char-util.h31
-rw-r--r--source/core/slang-hex-dump-util.cpp23
-rw-r--r--source/core/slang-string-escape-util.cpp195
-rw-r--r--source/core/slang-string-escape-util.h2
-rw-r--r--source/core/slang-string.h66
-rw-r--r--source/core/slang-token-reader.cpp87
-rw-r--r--source/core/slang-token-reader.h2
-rw-r--r--source/core/slang-type-text-util.cpp2
8 files changed, 226 insertions, 182 deletions
diff --git a/source/core/slang-char-util.h b/source/core/slang-char-util.h
index 8f7f69c90..f831f6d55 100644
--- a/source/core/slang-char-util.h
+++ b/source/core/slang-char-util.h
@@ -31,8 +31,12 @@ struct CharUtil
/// True if it's alpha
SLANG_FORCE_INLINE static bool isAlpha(char c) { return (getFlags(c) & (Flag::Upper | Flag::Lower)) != 0; }
+ /// True if the character is a valid hex character
SLANG_FORCE_INLINE static bool isHexDigit(char c) { return (getFlags(c) & Flag::HexDigit) != 0; }
+ /// True if the character is an octal digit
+ SLANG_FORCE_INLINE static bool isOctalDigit(char c) { return c >= '0' && c <= '7'; }
+
/// For a given character get the associated flags
SLANG_FORCE_INLINE static Flags getFlags(char c) { return g_charFlagMap.flags[size_t(c)]; }
@@ -41,7 +45,14 @@ struct CharUtil
/// Given a character return the upper case equivalent
SLANG_FORCE_INLINE static char toUpper(char c) { return (c >= 'a' && c <= 'z') ? (c -'a' + 'A') : c; }
-
+ /// Returns the value of c interpreted as a hex digit.
+ /// If c is not a valid hex digit, returns -1.
+ inline static int getHexDigitValue(char c);
+
+ /// Returns the value of c interpreted as an octal digit.
+ /// If c is not a valid octal digit, returns -1.
+ inline static int getOctalDigitValue(char c) { return isOctalDigit(c) ? (c - '0') : -1; }
+
struct CharFlagMap
{
Flags flags[0x100];
@@ -57,6 +68,24 @@ struct CharUtil
static const CharFlagMap g_charFlagMap;
};
+// ------------------------------------------------------------------------------------
+inline /* static */int CharUtil::getHexDigitValue(char c) // Maps one hex digit character to its numeric value (0..15); returns -1 for any other character.
+{
+ if (c >= '0' && c <= '9') // decimal digits -> 0..9
+ {
+ return c - '0';
+ }
+ else if (c >= 'a' && c <= 'f') // lower case letters -> 10..15
+ {
+ return c - 'a' + 10;
+ }
+ else if (c >= 'A' && c <= 'F') // upper case letters -> 10..15
+ {
+ return c - 'A' + 10;
+ }
+ return -1; // not a hex digit; callers test for a negative result
+}
+
} // namespace Slang
#endif // SLANG_CHAR_UTIL_H
diff --git a/source/core/slang-hex-dump-util.cpp b/source/core/slang-hex-dump-util.cpp
index b493141a1..1279dc237 100644
--- a/source/core/slang-hex-dump-util.cpp
+++ b/source/core/slang-hex-dump-util.cpp
@@ -5,6 +5,8 @@
#include "slang-string-util.h"
#include "slang-writer.h"
+#include "slang-char-util.h"
+
#include "../../slang-com-helper.h"
#include "slang-hash.h"
@@ -152,23 +154,6 @@ SlangResult HexDumpUtil::dumpSourceBytes(const uint8_t* data, size_t dataCount,
return SLANG_OK;
}
-static int _parseHexDigit(char c)
-{
- if (c >= '0' && c <= '9')
- {
- return c -'0';
- }
- else if (c >= 'a' && c <= 'f')
- {
- return c - 'a' + 10;
- }
- else if (c >= 'A' && c <= 'F')
- {
- return c - 'A' + 10;
- }
- return -1;
-}
-
/* static */SlangResult HexDumpUtil::parse(const UnownedStringSlice& lines, List<uint8_t>& outBytes)
{
outBytes.clear();
@@ -188,8 +173,8 @@ static int _parseHexDigit(char c)
break;
}
- const int hi = _parseHexDigit(c);
- const int lo = _parseHexDigit(cur[1]);
+ const int hi = CharUtil::getHexDigitValue(c);
+ const int lo = CharUtil::getHexDigitValue(cur[1]);
cur += 2;
if (hi < 0 || lo < 0)
diff --git a/source/core/slang-string-escape-util.cpp b/source/core/slang-string-escape-util.cpp
index 513908c4c..334c1aae5 100644
--- a/source/core/slang-string-escape-util.cpp
+++ b/source/core/slang-string-escape-util.cpp
@@ -115,32 +115,6 @@ public:
CppStringEscapeHandler() : Super('"') {}
};
-static char _getHexChar(int v)
-{
- return (v <= 9) ? char(v + '0') : char(v - 10 + 'A');
-}
-
-static int _getHexDigit(char c)
-{
- if (c >= '0' && c <= '9')
- {
- return c - '0';
- }
- else if (c >= 'a' && c <= 'f')
- {
- return c - 'a' + 10;
- }
- else if (c >= 'A' && c <= 'F')
- {
- return c - 'A' + 10;
- }
- else
- {
- SLANG_ASSERT(!"Not a hex digit");
- return 0;
- }
-}
-
static char _getCppEscapedChar(char c)
{
switch (c)
@@ -177,7 +151,6 @@ static char _getCppUnescapedChar(char c)
}
}
-
bool CppStringEscapeHandler::isUnescapingNeeeded(const UnownedStringSlice& slice)
{
return slice.indexOf('\\') >= 0;
@@ -220,6 +193,9 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
const char* cur = start;
const char*const end = slice.end();
+ // TODO(JS): A cleverer implementation might support U and u prefixing for unicode characters.
+ // For now we just stick with octal if it's not 'regular' ascii.
+
for (; cur < end; ++cur)
{
const char c = *cur;
@@ -232,6 +208,7 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
{
out.append(start, cur);
}
+
out.appendChar('\\');
out.appendChar(escapedChar);
@@ -245,17 +222,56 @@ SlangResult CppStringEscapeHandler::appendEscaped(const UnownedStringSlice& slic
out.append(start, cur);
}
- char buf[5] = "\\0x0";
+ // NOTE! There is a possible flaw around checking the 'next' character (used when outputting octal and hex).
+ // If a string is constructed by appending escaped parts, the next character is not available, so the problem below can still
+ // occur.
+
+ // Another solution to this problem would be to output "", but that makes some other assumptions
+ // For example Slang doesn't support that style.
+
+ // C++ greedily consumes hex/octal digits. This is a problem if we have bytes
+ // 0, '1' as by default this will output as
+ // "\x001" which is the single character byte 1.
+
+ // Note this claims \x is followed with up to 3 hex digits
+ // https://msdn.microsoft.com/en-us/library/69ze775t.aspx
+ // But the following claims otherwise
+ // https://en.cppreference.com/w/cpp/language/string_literal
+
+ // On testing in Visual Studio hex can indeed be more than 3 digits
+
+ // There is a problem outputting values in hex, because C++ allows *any* amount of hex digits.
+ // We could work around with \u \U but they are later extensions (C++11) and have other issue
+
+ // The solution taken here is to always output as octal, because octal can be at most 3 digits.
+
+ // Special case handling of 0
+ if (c == 0 && !(cur + 1 < end && CharUtil::isOctalDigit(cur[1])))
+ {
+ // We can just output as (octal) "\0"
+ out.append("\\0");
+ }
+ else
+ {
+ // A slightly more sophisticated implementation could output less digits if needed, if not followed by an octal
+ // digit, but for now we go simple and output all 3 digits
+
+ const uint32_t v = uint32_t(c);
- buf[3] = _getHexChar((int(c) >> 4) & 0xf);
- buf[4] = _getHexChar(c & 0xf);
+ char buf[4];
+ buf[0] = '\\';
+ buf[1] = ((v >> 6) & 3) + '0';
+ buf[2] = ((v >> 3) & 7) + '0';
+ buf[3] = ((v >> 0) & 7) + '0';
- out.append(buf, buf + 4);
+ out.append(buf, buf + 4);
+ }
start = cur + 1;
}
}
+ // Flush anything remaining
if (start < end)
{
out.append(start, end);
@@ -269,16 +285,16 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
const char* cur = start;
const char*const end = slice.end();
- for (; cur < end; ++cur)
+ while (cur < end)
{
const char c = *cur;
if (c == '\\')
{
// Flush
- if (start < end)
+ if (start < cur)
{
- out.append(start, end);
+ out.append(start, cur);
}
/// Next
@@ -286,11 +302,14 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
if (cur >= end)
{
+ // Missing character following '\'
return SLANG_FAIL;
}
+ const char nextC = *cur++;
+
// Need to handle various escape sequence cases
- switch (*cur)
+ switch (nextC)
{
case '\'':
case '\"':
@@ -304,7 +323,7 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
case 't':
case 'v':
{
- const char unescapedChar = _getCppUnescapedChar(*cur);
+ const char unescapedChar = _getCppUnescapedChar(nextC);
if (unescapedChar == 0)
{
// Don't know how to unescape that char
@@ -312,14 +331,18 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
}
out.appendChar(unescapedChar);
- start = cur + 1;
+ start = cur;
break;
}
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7':
{
+ // Rewind back a character, as first digit is the 'nextC'
+ --cur;
+
+ // Don't need to check for enough characters, because there must be 1 - the nextC
+
// octal escape: up to 3 characters
- ++cur;
int value = 0;
const char* octEnd = cur + 3;
@@ -327,33 +350,99 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
for (; cur < octEnd; ++cur)
{
- const char d = *cur;
- if (d >= '0' && d <= '7')
+ const int digitValue = CharUtil::getOctalDigitValue(*cur);
+ if (digitValue < 0)
{
- value = (value << 3) | (d - '0');
+ break;
}
+ value = (value << 3) | digitValue;
}
out.appendChar(char(value));
+ // Reset start
start = cur;
break;
}
case 'x':
{
+ /// In the C++ standard we consume hex digits until we hit a non hex digit
+ uint32_t value = 0;
+ for (; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+ {
+ const int digitValue = CharUtil::getHexDigitValue(*cur);
+ if (digitValue < 0)
+ {
+ return SLANG_FAIL;
+ }
+
+ value = (value << 4) | digitValue;
+ }
+
+ // If it's ascii, just output it
+ if (value < 0x80)
+ {
+ out.appendChar(char(value));
+ }
+ else
+ {
+ // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+ // but 6 are possible, so lets go large.
+ const Index maxUtf8EncodeCount = 6;
+
+ char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+ int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+ out.appendInPlace(chars, numChars);
+ }
+
+ // Reset start
+ start = cur;
+ break;
+ }
+ case 'u':
+ case 'U':
+ {
+ // u implies 4 hex digits
+ // U implies 6.
+
+ // Work out how many digits we need
+ const Count digitCount = (nextC == 'u') ? 4 : 6;
+
+ // Do we have enough?
+ if (end - cur < digitCount)
+ {
+ return SLANG_FAIL;
+ }
+
uint32_t value = 0;
- for (++cur; cur < end && CharUtil::isHexDigit(*cur); ++cur)
+ for (Index i = 0; i < digitCount; ++i)
{
- value = value << 4 | _getHexDigit(*cur);
+ const int digitValue = CharUtil::getHexDigitValue(cur[i]);
+ if (digitValue < 0)
+ {
+ return SLANG_FAIL;
+ }
+ value = (value << 4) | digitValue;
}
+ cur += digitCount;
- // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
- // but 6 are possible, so lets go large.
- const Index maxUtf8EncodeCount = 6;
+ // Encode to Utf8
+ // If it's ascii, just output it
+ if (value < 0x80)
+ {
+ out.appendChar(char(value));
+ }
+ else
+ {
+ // It's arguable what is appropriate. We only decode/encode 4, which the current spec has,
+ // but 6 are possible, so lets go large.
+ const Index maxUtf8EncodeCount = 6;
- char* chars = out.prepareForAppend(maxUtf8EncodeCount);
- int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
- out.appendInPlace(chars, numChars);
+ char* chars = out.prepareForAppend(maxUtf8EncodeCount);
+ int numChars = encodeUnicodePointToUTF8(Char32(value), chars);
+ out.appendInPlace(chars, numChars);
+ }
+ // Reset start
start = cur;
break;
}
@@ -363,6 +452,11 @@ SlangResult CppStringEscapeHandler::appendUnescaped(const UnownedStringSlice& sl
}
}
}
+ else
+ {
+ // Next char
+ ++cur;
+ }
}
if (start < end)
@@ -850,6 +944,9 @@ StringEscapeUtil::Handler* StringEscapeUtil::getHandler(Style style)
case Style::Cpp: return &g_cppHandler;
case Style::Space: return &g_spaceHandler;
case Style::JSON: return &g_jsonHandler;
+ // TODO(JS): For now we make Slang language string encoding/decoding the same as C++
+ // That may not be desirable because C++ has a variety of surprising edge cases (for example around \x)
+ case Style::Slang: return &g_cppHandler;
default: return nullptr;
}
}
diff --git a/source/core/slang-string-escape-util.h b/source/core/slang-string-escape-util.h
index 5f749c423..c3a43b623 100644
--- a/source/core/slang-string-escape-util.h
+++ b/source/core/slang-string-escape-util.h
@@ -55,6 +55,8 @@ struct StringEscapeUtil
Cpp, ///< Cpp style quoting and escape handling
Space, ///< Applies quotes if there are spaces. Does not escape.
JSON, ///< Json encoding
+ Slang, ///< Slang style string encoding (For now same as Cpp but that may change in the future)
+ CountOf,
};
/// Given a style returns a handler
diff --git a/source/core/slang-string.h b/source/core/slang-string.h
index 6dc3275eb..5119dac8f 100644
--- a/source/core/slang-string.h
+++ b/source/core/slang-string.h
@@ -62,6 +62,10 @@ namespace Slang
return (((unsigned char)ch) & 0xC0) == 0x80;
}
+ /* A string slice that doesn't own the contained characters.
+ It is the responsibility of code using the type to keep the memory backing
+ the slice in scope.
+ A slice is generally *not* zero terminated. */
struct SLANG_RT_API UnownedStringSlice
{
public:
@@ -85,15 +89,9 @@ namespace Slang
, m_end(b + len)
{}
- char const* begin() const
- {
- return m_begin;
- }
+ SLANG_FORCE_INLINE char const* begin() const { return m_begin; }
- char const* end() const
- {
- return m_end;
- }
+ SLANG_FORCE_INLINE char const* end() const { return m_end; }
/// True if slice is strictly contained in memory.
bool isMemoryContained(const UnownedStringSlice& slice) const
@@ -105,10 +103,8 @@ namespace Slang
return pos >= m_begin && pos <= m_end;
}
- Index getLength() const
- {
- return Index(m_end - m_begin);
- }
+ /// Get the length in *bytes*
+ Count getLength() const { return Index(m_end - m_begin); }
/// Finds first index of char 'c'. If not found returns -1.
Index indexOf(char c) const;
@@ -179,7 +175,7 @@ namespace Slang
template <size_t SIZE>
SLANG_FORCE_INLINE static UnownedStringSlice fromLiteral(const char (&in)[SIZE]) { return UnownedStringSlice(in, SIZE - 1); }
- private:
+ protected:
char const* m_begin;
char const* m_end;
};
@@ -188,6 +184,40 @@ namespace Slang
template <size_t SIZE>
SLANG_FORCE_INLINE UnownedStringSlice toSlice(const char (&in)[SIZE]) { return UnownedStringSlice(in, SIZE - 1); }
+ /// Same as UnownedStringSlice, but the characters must be followed by a zero terminator.
+ /// The zero terminator is *not* included in the length, so m_end points at the terminator.
+ struct SLANG_RT_API UnownedTerminatedStringSlice : public UnownedStringSlice
+ {
+ public:
+ typedef UnownedStringSlice Super;
+ typedef UnownedTerminatedStringSlice ThisType;
+
+ /// Implicit conversion to a regular zero terminated C string (returns m_begin).
+ SLANG_FORCE_INLINE operator const char*() const { return m_begin; }
+
+ /// Exists to match the equivalent function in String.
+ SLANG_FORCE_INLINE char const* getBuffer() const { return m_begin; }
+
+ /// Construct from a literal directly. SIZE counts the literal's terminator, hence the length is SIZE - 1.
+ template <size_t SIZE>
+ SLANG_FORCE_INLINE static ThisType fromLiteral(const char(&in)[SIZE]) { return ThisType(in, SIZE - 1); }
+
+ /// Construct from a C string. b cannot be null (strlen is called on it), because if it were the string would not be null terminated.
+ UnownedTerminatedStringSlice(char const* b)
+ : Super(b, b + strlen(b))
+ {}
+ UnownedTerminatedStringSlice(char const* b, size_t len) // len is the character count, excluding the terminator
+ : Super(b, len)
+ {
+ // b must be valid and the character just past the slice, b[len], must be the zero terminator
+ SLANG_ASSERT(b && b[len] == 0);
+ }
+ };
+
+ // A more convenient way to make terminated slices from *string literals*. SIZE counts the literal's zero terminator, so the slice length is SIZE - 1.
+ template <size_t SIZE>
+ SLANG_FORCE_INLINE UnownedTerminatedStringSlice toTerminatedSlice(const char(&in)[SIZE]) { return UnownedTerminatedStringSlice(in, SIZE - 1); }
+
// A `StringRepresentation` provides the backing storage for
// all reference-counted string-related types.
class SLANG_RT_API StringRepresentation : public RefObject
@@ -284,16 +314,6 @@ namespace Slang
class String;
-
-
- struct SLANG_RT_API UnownedTerminatedStringSlice : public UnownedStringSlice
- {
- public:
- UnownedTerminatedStringSlice(char const* b)
- : UnownedStringSlice(b, b + (b?strlen(b):0))
- {}
- };
-
struct SLANG_RT_API StringSlice
{
public:
diff --git a/source/core/slang-token-reader.cpp b/source/core/slang-token-reader.cpp
index 7ffbc12fa..5acc1736c 100644
--- a/source/core/slang-token-reader.cpp
+++ b/source/core/slang-token-reader.cpp
@@ -671,93 +671,6 @@ namespace Misc {
return TokenizeText("", text);
}
- String EscapeStringLiteral(String str)
- {
- StringBuilder sb;
- sb << "\"";
- const Index length = str.getLength();
- const char*const data = str.getBuffer();
- for (Index i = 0; i < length; i++)
- {
- switch (data[i])
- {
- case '\n':
- sb << "\\n";
- break;
- case '\r':
- sb << "\\r";
- break;
- case '\t':
- sb << "\\t";
- break;
- case '\v':
- sb << "\\v";
- break;
- case '\'':
- sb << "\\\'";
- break;
- case '\"':
- sb << "\\\"";
- break;
- case '\\':
- sb << "\\\\";
- break;
- default:
- sb << data[i];
- break;
- }
- }
- sb << "\"";
- return sb.ProduceString();
- }
-
- String UnescapeStringLiteral(String str)
- {
- StringBuilder sb;
- const Index length = str.getLength();
- const char*const data = str.getBuffer();
- for (Index i = 0; i < length; i++)
- {
- if (data[i] == '\\' && i < length - 1)
- {
- switch (data[i + 1])
- {
- case 's':
- sb << " ";
- break;
- case 't':
- sb << '\t';
- break;
- case 'n':
- sb << '\n';
- break;
- case 'r':
- sb << '\r';
- break;
- case 'v':
- sb << '\v';
- break;
- case '\'':
- sb << '\'';
- break;
- case '\"':
- sb << "\"";
- break;
- case '\\':
- sb << "\\";
- break;
- default:
- i = i - 1;
- sb << data[i];
- }
- i++;
- }
- else
- sb << data[i];
- }
- return sb.ProduceString();
- }
-
TokenReader::TokenReader(String text)
{
this->tokens = TokenizeText("", text);
diff --git a/source/core/slang-token-reader.h b/source/core/slang-token-reader.h
index bf5ca4cdc..26539732c 100644
--- a/source/core/slang-token-reader.h
+++ b/source/core/slang-token-reader.h
@@ -295,8 +295,6 @@ namespace Misc {
return result;
}
- String EscapeStringLiteral(String str);
- String UnescapeStringLiteral(String str);
} // namespace Misc
} // namespace Slang
diff --git a/source/core/slang-type-text-util.cpp b/source/core/slang-type-text-util.cpp
index 13bf439ce..454ca4258 100644
--- a/source/core/slang-type-text-util.cpp
+++ b/source/core/slang-type-text-util.cpp
@@ -68,7 +68,7 @@ static const CompileTargetInfo s_compileTargetInfos[] =
{ SLANG_SPIRV_ASM, "spv.asm", "spirv-asm,spirv-assembly" },
{ SLANG_C_SOURCE, "c", "c" },
{ SLANG_CPP_SOURCE, "cpp,c++,cxx", "cpp,c++,cxx" },
- { SLANG_HOST_CPP_SOURCE, "cpp,c++,cxx", "cpp,c++,cxx"},
+ { SLANG_HOST_CPP_SOURCE, "cpp,c++,cxx", "host-cpp,host-c++,host-cxx"},
{ SLANG_HOST_EXECUTABLE,"exe", "exe,executable" },
{ SLANG_SHADER_SHARED_LIBRARY, "dll,so", "sharedlib,sharedlibrary,dll" },
{ SLANG_CUDA_SOURCE, "cu", "cuda,cu" },