From 4db6bd3cd6da1871fdac520c280bd9f933e48489 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Wed, 8 Jun 2022 19:51:49 -0400 Subject: Improved bounds checking for C++/CUDA (#2263) * #include an absolute path didn't work - because paths were taken to always be relative. * Use TerminatedUnownedStringSlice for literals in output C++. * Remove Escape/Unescape functions used in slang-token-reader.cpp Add target type of 'host-cpp' etc to map to the target types. * Fix some corner cases around string encoding. * Added unit test for string escaping. Fixed some assorted escaping bugs. * Updated test output. * Added decode test. * Stop using hex output, to get around 'greedy' aspect. Use octal instead. * Added HostHostCallable Small changes to use ArtifactDesc/Info instead of large switches. * Fix C++ emit to handle arbitrary function export. * Add options handling for callable without an output being specified. * Can compile with COM interface. Added example using com interface. * Use the IR Ptr type instead of hack in C++ emit for interfaces. * Fix issue with outputting the COM call when ptr is used. * Fix crash issue on compilation failure. * Add support for __global. * Added `ActualGlobalRate` Added special handling around globals and COM interfaces. Tested out in cpu-com-example. * Fix typo in NodeBase. * Support for accessing globals by name working. * Bounds checking for C++ Improved bounds checks for CUDA. * Check that actual global initialization is working. * Fix typo. * Refactor the com replacement such that it doesn't need a cache or do anything special with GlobalVar. * Fix typo in CUDA prelude. * Remove context. Only create replacement if needed. * Split out COM host-callable into a unit-test. * host-callable com testing on C++and llvm. * Comment around the COM ptr replacement. * WIP Zero bound test. * Disable com test on vs 32 bit. Fix C++ prelude * Disable 32 bit targets testing com host-callable. * For now disable zero index test. 
* Enable bounds checking for CPU/CUDA. * Small fixes. Disable CUDA zero index bound fix. * Add test result for bound check. * Work around for index wrapping issue. * Added Fixed array test. * Only enable prelude asserts via SLANG_PRELUDE_ENABLE_ASSERT (unless defined by the user) --- prelude/slang-cpp-prelude.h | 9 +- prelude/slang-cpp-types.h | 107 ++++++++++++------- prelude/slang-cuda-prelude.h | 113 ++++++++++++--------- prelude/slang-llvm.h | 8 +- source/slang/hlsl.meta.slang | 22 ++-- source/slang/slang-compiler.cpp | 25 +++-- source/slang/slang-lower-to-ir.cpp | 3 + tests/compute/bound-check-zero-index.slang | 56 ++++++++++ .../bound-check-zero-index.slang.expected.txt | 8 ++ 9 files changed, 242 insertions(+), 109 deletions(-) create mode 100644 tests/compute/bound-check-zero-index.slang create mode 100644 tests/compute/bound-check-zero-index.slang.expected.txt diff --git a/prelude/slang-cpp-prelude.h b/prelude/slang-cpp-prelude.h index ff6bb8f6f..0381a7bb6 100644 --- a/prelude/slang-cpp-prelude.h +++ b/prelude/slang-cpp-prelude.h @@ -194,7 +194,6 @@ Any platforms not detected by the above logic are now now explicitly zeroed out. # endif #endif /* SLANG_PLATFORM */ - /* Shorthands for "families" of compilers/platforms */ #define SLANG_GCC_FAMILY (SLANG_CLANG || SLANG_SNC || SLANG_GHS || SLANG_GCC) #define SLANG_WINDOWS_FAMILY (SLANG_WINRT || SLANG_WIN32 || SLANG_WIN64) @@ -249,8 +248,12 @@ convention for interface methods. # define SLANG_MCALL SLANG_STDCALL #endif +#ifndef SLANG_FORCE_INLINE +# define SLANG_FORCE_INLINE inline +#endif - +// TODO(JS): Should these be in slang-cpp-types.h? 
+// They are more likely to clash with slang.h struct SlangUUID { @@ -271,6 +274,8 @@ struct ISlangUnknown #endif // SLANG_H +// Includes + #include "slang-cpp-types.h" #include "slang-cpp-scalar-intrinsics.h" diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h index 64db2efb3..7aef25650 100644 --- a/prelude/slang-cpp-types.h +++ b/prelude/slang-cpp-types.h @@ -2,15 +2,48 @@ #define SLANG_PRELUDE_CPP_TYPES_H #ifndef SLANG_PRELUDE_ASSERT -# ifdef _DEBUG +# ifdef SLANG_PRELUDE_ENABLE_ASSERT # define SLANG_PRELUDE_ASSERT(VALUE) assert(VALUE) # else # define SLANG_PRELUDE_ASSERT(VALUE) # endif #endif -#ifndef SLANG_FORCE_INLINE -# define SLANG_FORCE_INLINE inline +// Since we are using unsigned arithmetic, care is needed in this comparison. +// It is *assumed* that sizeInBytes >= elemSize. Which means (sizeInBytes - elemSize) >= 0 +// Which means only a single test is needed + +// Asserts for bounds checking. +// It is assumed index/count are unsigned types. +#define SLANG_BOUND_ASSERT(index, count) SLANG_PRELUDE_ASSERT(index < count); +#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0); + +// Macros to zero index if an access is out of range +#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; +#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) index = (index <= (sizeInBytes - elemSize)) ? index : 0; + +// The 'FIX' macros define how the index is fixed. The default is to do nothing.
If SLANG_ENABLE_BOUND_ZERO_INDEX +// the fix macro will zero the index, if out of range +#ifdef SLANG_ENABLE_BOUND_ZERO_INDEX +# define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count) +# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count) +#else +# define SLANG_BOUND_FIX(index, count) +# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) +#endif + +#ifndef SLANG_BOUND_CHECK +# define SLANG_BOUND_CHECK(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count) +#endif + +#ifndef SLANG_BOUND_CHECK_BYTE_ADDRESS +# define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +#endif + +#ifndef SLANG_BOUND_CHECK_FIXED_ARRAY +# define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count) #endif #ifdef SLANG_PRELUDE_NAMESPACE @@ -25,8 +58,8 @@ struct TypeInfo template struct FixedArray { - const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; } - T& operator[](size_t index) { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; } + const T& operator[](size_t index) const { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; } + T& operator[](size_t index) { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; } T m_data[SIZE]; }; @@ -36,8 +69,8 @@ struct FixedArray template struct Array { - const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } - T& operator[](size_t index) { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } + const T& operator[](size_t index) const { 
SLANG_BOUND_CHECK(index, count); return data[index]; } + T& operator[](size_t index) { SLANG_BOUND_CHECK(index, count); return data[index]; } T* data; size_t count; @@ -126,8 +159,8 @@ typedef size_t NonUniformResourceIndex; template struct RWStructuredBuffer { - SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } - const T& Load(size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } + SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } + const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); } T* data; @@ -137,8 +170,8 @@ struct RWStructuredBuffer template struct StructuredBuffer { - SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } - const T& Load(size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } + SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } + const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); } T* data; @@ -149,8 +182,8 @@ struct StructuredBuffer template struct RWBuffer { - SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } - const T& Load(size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } + SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } + const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } void 
GetDimensions(uint32_t* outCount) { *outCount = uint32_t(count); } T* data; @@ -160,8 +193,8 @@ struct RWBuffer template struct Buffer { - SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } - const T& Load(size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; } + SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } + const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } void GetDimensions(uint32_t* outCount) { *outCount = uint32_t(count); } T* data; @@ -174,32 +207,32 @@ struct ByteAddressBuffer void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); } uint32_t Load(size_t index) const { - SLANG_PRELUDE_ASSERT(index + 4 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); return data[index >> 2]; } uint2 Load2(size_t index) const { - SLANG_PRELUDE_ASSERT(index + 8 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); const size_t dataIdx = index >> 2; return uint2{data[dataIdx], data[dataIdx + 1]}; } uint3 Load3(size_t index) const { - SLANG_PRELUDE_ASSERT(index + 12 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); const size_t dataIdx = index >> 2; return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; } uint4 Load4(size_t index) const { - SLANG_PRELUDE_ASSERT(index + 16 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); const size_t dataIdx = index >> 2; return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; } template - T Load(size_t offset) const + T Load(size_t index) const { - SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); - return *(T const*)((char*)data + offset); + 
SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); + return *(const T*)(((const char*)data) + index); } const uint32_t* data; @@ -215,49 +248,49 @@ struct RWByteAddressBuffer uint32_t Load(size_t index) const { - SLANG_PRELUDE_ASSERT(index + 4 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); return data[index >> 2]; } uint2 Load2(size_t index) const { - SLANG_PRELUDE_ASSERT(index + 8 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); const size_t dataIdx = index >> 2; return uint2{data[dataIdx], data[dataIdx + 1]}; } uint3 Load3(size_t index) const { - SLANG_PRELUDE_ASSERT(index + 12 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); const size_t dataIdx = index >> 2; return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; } uint4 Load4(size_t index) const { - SLANG_PRELUDE_ASSERT(index + 16 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); const size_t dataIdx = index >> 2; return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; } template - T Load(size_t offset) const + T Load(size_t index) const { - SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); - return *(T const*)((char*)data + offset); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); + return *(const T*)(((const char*)data) + index); } void Store(size_t index, uint32_t v) const { - SLANG_PRELUDE_ASSERT(index + 4 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); data[index >> 2] = v; } void Store2(size_t index, uint2 v) const { - SLANG_PRELUDE_ASSERT(index + 8 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; } void Store3(size_t index, uint3 v) const - { 
- SLANG_PRELUDE_ASSERT(index + 12 <= sizeInBytes && (index & 3) == 0); + { + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; @@ -265,7 +298,7 @@ struct RWByteAddressBuffer } void Store4(size_t index, uint4 v) const { - SLANG_PRELUDE_ASSERT(index + 16 <= sizeInBytes && (index & 3) == 0); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; @@ -273,10 +306,10 @@ struct RWByteAddressBuffer data[dataIdx + 3] = v.w; } template - void Store(size_t offset, T const& value) const + void Store(size_t index, T const& value) const { - SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); - *(T*)((char*)data + offset) = value; + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); + *(T*)(((char*)data) + index) = value; } uint32_t* data; diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 01c658e0b..448b69c63 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -46,24 +46,44 @@ #define SLANG_FORCE_INLINE inline #define SLANG_INLINE inline -// Bound checks. Can be replaced by defining before including header. -// NOTE! -// The default behaviour, if out of bounds is to index 0. This is of course quite wrong - and different -// behavior to hlsl typically. The problem here though is more around a write reference. That unless -// some kind of proxy is used it is hard and/or slow to emulate the typical GPU behavior. - -#ifndef SLANG_CUDA_BOUND_CHECK -# define SLANG_CUDA_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); index = (index < count) ? index : 0; + +// Since we are using unsigned arithmatic care is need in this comparison. +// It is *assumed* that sizeInBytes >= elemSize. 
Which means (sizeInBytes - elemSize) >= 0 +// Which means only a single test is needed + +// Asserts for bounds checking. +// It is assumed index/count are unsigned types. +#define SLANG_BOUND_ASSERT(index, count) SLANG_PRELUDE_ASSERT(index < count); +#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0); + +// Macros to zero index if an access is out of range +#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; +#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) index = (index <= (sizeInBytes - elemSize)) ? index : 0; + +// The 'FIX' macros define how the index is fixed. The default is to do nothing. If SLANG_ENABLE_BOUND_ZERO_INDEX is defined, +// the fix macro will zero the index if it is out of range +#ifdef SLANG_ENABLE_BOUND_ZERO_INDEX +# define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count) +# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count) SLANG_BOUND_ZERO_INDEX(index, count) +#else +# define SLANG_BOUND_FIX(index, count) +# define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +# define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) +#endif + +#ifndef SLANG_BOUND_CHECK +# define SLANG_BOUND_CHECK(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count) #endif -#ifndef SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK -# define SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, size, count) SLANG_PRELUDE_ASSERT(index + 4 <= sizeInBytes && (index & 3) == 0); index = (index + 4 <= sizeInBytes) ?
index : 0; -#endif +#ifndef SLANG_BOUND_CHECK_BYTE_ADDRESS +# define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) +#endif -// Here we don't have the index zeroing behavior, as such bounds checks are generally not on GPU targets either. -#ifndef SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK -# define SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); +#ifndef SLANG_BOUND_CHECK_FIXED_ARRAY +# define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count) #endif + // This macro handles how out-of-range surface coordinates are handled; // I can equal // cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range @@ -91,8 +111,8 @@ struct TypeInfo template struct FixedArray { - SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, SIZE); return m_data[index]; } - SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, SIZE); return m_data[index]; } + SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; } + SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; } T m_data[SIZE]; }; @@ -102,8 +122,8 @@ struct FixedArray template struct Array { - SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; } - SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; } + SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; } + SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK(index, count); return data[index]; } T* data; size_t count; @@ -714,7 
+734,7 @@ struct StructuredBuffer SLANG_CUDA_CALL const T& operator[](size_t index) const { #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT - SLANG_CUDA_BOUND_CHECK(index, count); + SLANG_BOUND_CHECK(index, count); #endif return data[index]; } @@ -722,7 +742,7 @@ struct StructuredBuffer SLANG_CUDA_CALL const T& Load(size_t index) const { #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT - SLANG_CUDA_BOUND_CHECK(index, count); + SLANG_BOUND_CHECK(index, count); #endif return data[index]; } @@ -743,46 +763,44 @@ struct RWStructuredBuffer : StructuredBuffer SLANG_CUDA_CALL T& operator[](size_t index) const { #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT - SLANG_CUDA_BOUND_CHECK(index, this->count); + SLANG_BOUND_CHECK(index, this->count); #endif return this->data[index]; } }; - - // Missing Load(_In_ int Location, _Out_ uint Status); struct ByteAddressBuffer { SLANG_CUDA_CALL void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); } SLANG_CUDA_CALL uint32_t Load(size_t index) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 4, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); return data[index >> 2]; } SLANG_CUDA_CALL uint2 Load2(size_t index) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 8, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); const size_t dataIdx = index >> 2; return uint2{data[dataIdx], data[dataIdx + 1]}; } SLANG_CUDA_CALL uint3 Load3(size_t index) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 12, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); const size_t dataIdx = index >> 2; return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; } SLANG_CUDA_CALL uint4 Load4(size_t index) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 16, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); const size_t dataIdx = index >> 2; return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; } template - 
SLANG_CUDA_CALL T Load(size_t offset) const + SLANG_CUDA_CALL T Load(size_t index) const { - SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); - return *(T const*)((char*)data + offset); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); + return *(const T*)(((const char*)data) + index); } const uint32_t* data; @@ -798,49 +816,49 @@ struct RWByteAddressBuffer SLANG_CUDA_CALL uint32_t Load(size_t index) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 4, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); return data[index >> 2]; } SLANG_CUDA_CALL uint2 Load2(size_t index) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 8, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); const size_t dataIdx = index >> 2; return uint2{data[dataIdx], data[dataIdx + 1]}; } SLANG_CUDA_CALL uint3 Load3(size_t index) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 12, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); const size_t dataIdx = index >> 2; return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; } SLANG_CUDA_CALL uint4 Load4(size_t index) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 16, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); const size_t dataIdx = index >> 2; return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; } template - SLANG_CUDA_CALL T Load(size_t offset) const + SLANG_CUDA_CALL T Load(size_t index) const { - SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); - return *(T const*)((char*)data + offset); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); + return *(const T*)((const char*)data + index); } SLANG_CUDA_CALL void Store(size_t index, uint32_t v) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 4, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes); data[index >> 2] = v; } 
SLANG_CUDA_CALL void Store2(size_t index, uint2 v) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 8, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; } SLANG_CUDA_CALL void Store3(size_t index, uint3 v) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 12, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes); const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; @@ -848,7 +866,7 @@ struct RWByteAddressBuffer } SLANG_CUDA_CALL void Store4(size_t index, uint4 v) const { - SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 16, sizeInBytes); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes); const size_t dataIdx = index >> 2; data[dataIdx + 0] = v.x; data[dataIdx + 1] = v.y; @@ -856,17 +874,18 @@ struct RWByteAddressBuffer data[dataIdx + 3] = v.w; } template - SLANG_CUDA_CALL void Store(size_t offset, T const& value) const + SLANG_CUDA_CALL void Store(size_t index, T const& value) const { - SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); - *(T*)((char*)data + offset) = value; + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); + *(T*)(((char*)data) + index) = value; } /// Can be used in stdlib to gain access - SLANG_CUDA_CALL uint* _getPtrAt(size_t offset) + template + SLANG_CUDA_CALL T* _getPtrAt(size_t index) { - SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); - return (uint*)(((char*)data) + offset); + SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes); + return (T*)(((char*)data) + index); } uint32_t* data; diff --git a/prelude/slang-llvm.h b/prelude/slang-llvm.h index 08d6a74dd..b41380581 100644 --- a/prelude/slang-llvm.h +++ b/prelude/slang-llvm.h @@ -7,14 +7,14 @@ #define SLANG_DISABLE_EXCEPTIONS 1 #ifndef SLANG_PRELUDE_ASSERT -# ifdef DEBUG +# ifdef SLANG_PRELUDE_ENABLE_ASSERT extern "C" void 
assertFailure(const char* msg); # define SLANG_PRELUDE_EXPECT(VALUE, MSG) if(VALUE) {} else assertFailure("assertion failed: '" MSG "'") # define SLANG_PRELUDE_ASSERT(VALUE) SLANG_PRELUDE_EXPECT(VALUE, #VALUE) -# else // DEBUG - +# else // SLANG_PRELUDE_ENABLE_ASSERT +# define SLANG_PRELUDE_EXPECT(VALUE, MSG) # define SLANG_PRELUDE_ASSERT(x) -# endif // DEBUG +# endif // SLANG_PRELUDE_ENABLE_ASSERT #endif /* diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 7d107888a..b2f6fa06b 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -331,7 +331,7 @@ ${{{{ __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))") __cuda_sm_version(2.0) - __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))") + __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt($1), $2))") [__requiresNVAPI] void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue); @@ -347,7 +347,7 @@ ${{{{ __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))") [__requiresNVAPI] __cuda_sm_version(2.0) - __target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)") + __target_intrinsic(cuda, "atomicAdd($0._getPtrAt($1), $2)") void InterlockedAddF32(uint byteAddress, float valueToAdd); __specialized_for_target(glsl) @@ -359,7 +359,7 @@ ${{{{ // Int64 Add __cuda_sm_version(6.0) - __target_intrinsic(cuda, "(*$3 = atomicAdd((uint64_t*)$0._getPtrAt($1), $2))") + __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt($1), $2))") void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue); __specialized_for_target(hlsl) @@ -377,7 +377,7 @@ ${{{{ // Without returning original value __cuda_sm_version(6.0) - __target_intrinsic(cuda, "atomicAdd((uint64_t*)$0._getPtrAt($1), $2)") + __target_intrinsic(cuda, "atomicAdd($0._getPtrAt($1), $2)") void InterlockedAddI64(uint byteAddress, int64_t valueToAdd); __specialized_for_target(hlsl) @@ -395,7 +395,7 @@ ${{{{ // Cas uint64_t - 
__target_intrinsic(cuda, "(*$4 = atomicCAS((uint64_t*)$0._getPtrAt($1), $2, $3))") + __target_intrinsic(cuda, "(*$4 = atomicCAS($0._getPtrAt($1), $2, $3))") void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue); __specialized_for_target(hlsl) @@ -414,7 +414,7 @@ ${{{{ // Max __cuda_sm_version(3.5) - __target_intrinsic(cuda, "atomicMax((uint64_t*)$0._getPtrAt($1), $2)") + __target_intrinsic(cuda, "atomicMax($0._getPtrAt($1), $2)") uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value); __specialized_for_target(hlsl) @@ -430,7 +430,7 @@ ${{{{ // Min __cuda_sm_version(3.5) - __target_intrinsic(cuda, "atomicMin((uint64_t*)$0._getPtrAt($1), $2)") + __target_intrinsic(cuda, "atomicMin($0._getPtrAt($1), $2)") uint64_t InterlockedMinU64(uint byteAddress, uint64_t value); __specialized_for_target(hlsl) @@ -445,7 +445,7 @@ ${{{{ // And - __target_intrinsic(cuda, "atomicAnd((uint64_t*)$0._getPtrAt($1), $2)") + __target_intrinsic(cuda, "atomicAnd($0._getPtrAt($1), $2)") uint64_t InterlockedAndU64(uint byteAddress, uint64_t value); __specialized_for_target(hlsl) @@ -460,7 +460,7 @@ ${{{{ // Or - __target_intrinsic(cuda, "atomicOr((uint64_t*)$0._getPtrAt($1), $2)") + __target_intrinsic(cuda, "atomicOr($0._getPtrAt($1), $2)") uint64_t InterlockedOrU64(uint byteAddress, uint64_t value); __specialized_for_target(hlsl) @@ -475,7 +475,7 @@ ${{{{ // Xor - __target_intrinsic(cuda, "atomicXor((uint64_t*)$0._getPtrAt($1), $2)") + __target_intrinsic(cuda, "atomicXor($0._getPtrAt($1), $2)") uint64_t InterlockedXorU64(uint byteAddress, uint64_t value); __specialized_for_target(hlsl) @@ -490,7 +490,7 @@ ${{{{ // Exchange - __target_intrinsic(cuda, "atomicExch((uint64_t*)$0._getPtrAt($1), $2)") + __target_intrinsic(cuda, "atomicExch($0._getPtrAt($1), $2)") uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value); __specialized_for_target(hlsl) diff --git a/source/slang/slang-compiler.cpp 
b/source/slang/slang-compiler.cpp index bcf857fc8..d4c5812b7 100644 --- a/source/slang/slang-compiler.cpp +++ b/source/slang/slang-compiler.cpp @@ -1103,14 +1103,7 @@ void printDiagnosticArg(StringBuilder& sb, CodeGenTarget val) { preprocessorDefinitions.Add(define.Key, define.Value); } - { - auto linkage = getLinkage(); - for (auto& define : linkage->preprocessorDefinitions) - { - preprocessorDefinitions.Add(define.Key, define.Value); - } - } - + { /* TODO(JS): Not totally clear what options should be set here. If we are using the pass through - then using say the defines/includes all makes total sense. If we are generating C++ code from slang, then should we really be using these values -> aren't they what is @@ -1168,6 +1161,22 @@ void printDiagnosticArg(StringBuilder& sb, CodeGenTarget val) sourceLanguage = (SourceLanguage)TypeConvertUtil::getSourceLanguageFromTarget((SlangCompileTarget)sourceTarget); } + // Add any preprocessor definitions associated with the linkage + { + // TODO(JS): This is somewhat arguable - should defines passed to Slang really be + // passed to downstream compilers? It does appear consistent with the behavior if + // there is an endToEndReq. + // + // That said it's very convenient and provides way to control aspects + // of downstream compilation. + + auto linkage = getLinkage(); + for (auto& define : linkage->preprocessorDefinitions) + { + preprocessorDefinitions.Add(define.Key, define.Value); + } + } + // If we have an extension tracker, we may need to set options such as SPIR-V version // and CUDA Shader Model. if (extensionTracker) diff --git a/source/slang/slang-lower-to-ir.cpp b/source/slang/slang-lower-to-ir.cpp index d175b69dd..86edf9282 100644 --- a/source/slang/slang-lower-to-ir.cpp +++ b/source/slang/slang-lower-to-ir.cpp @@ -6128,6 +6128,9 @@ struct DeclLoweringVisitor : DeclVisitor auto builder = getBuilder(); + // TODO(JS): Do we create something derived from IRGlobalVar? 
Or do we use + // a decoration to identify an *actual* global? + IRGlobalValueWithCode* irGlobal = builder->createGlobalVar(varType); LoweredValInfo globalVal = LoweredValInfo::ptr(irGlobal); diff --git a/tests/compute/bound-check-zero-index.slang b/tests/compute/bound-check-zero-index.slang new file mode 100644 index 000000000..e8244886e --- /dev/null +++ b/tests/compute/bound-check-zero-index.slang @@ -0,0 +1,56 @@ +// bound-check-zero-index.slang + +// Check 'zero indexing' bound check feature, supported by CPU and CUDA + +// Currently zero index bound checking doesn't appear to be working properly for CUDA. +//TEST(compute):COMPARE_COMPUTE:-cuda -shaderobj -Xslang... -DSLANG_ENABLE_BOUND_ZERO_INDEX -X. +//TEST(compute):COMPARE_COMPUTE:-cpu -shaderobj -Xslang... -DSLANG_ENABLE_BOUND_ZERO_INDEX -X. + +//TEST_INPUT:ubuffer(data=[1 2 3 4]):name=byteAddressBuffer +ByteAddressBuffer byteAddressBuffer; + +//TEST_INPUT:ubuffer(data=[0x10 0x20 0x30 0x40]):name=rwByteAddressBuffer +RWByteAddressBuffer rwByteAddressBuffer; + +//TEST_INPUT:ubuffer(data=[0x100 0x200 0x300 0x400], stride=4):name=structuredBuffer +StructuredBuffer structuredBuffer; + +//TEST_INPUT:ubuffer(data=[0x1000 0x2000 0x3000 0x4000], stride=4):name=rwStructuredBuffer +RWStructuredBuffer rwStructuredBuffer; + +//TEST_INPUT:ubuffer(data=[-1 -1 -1 -1], stride=4):out,name=outputBuffer +RWStructuredBuffer outputBuffer; + +//TEST_INPUT:ubuffer(data=[-1 -1 -1 -1], stride=4):out,name=outputBuffer2 +RWStructuredBuffer outputBuffer2; + +[numthreads(4, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + int tid = dispatchThreadID.x; + + int fixedArray[3] = { 2, 5, 9}; + + int total = 0; + total += byteAddressBuffer.Load(tid * 4); + total += byteAddressBuffer.Load(-tid * 4); + + total += rwByteAddressBuffer.Load(tid * 4); + total += rwByteAddressBuffer.Load(-tid * 4); + + total += structuredBuffer[tid]; + total += structuredBuffer[-tid]; + + total += rwStructuredBuffer[tid]; + total += 
rwStructuredBuffer[-tid]; + + total += fixedArray[tid]; + total += fixedArray[-tid]; + + outputBuffer[tid] = total; + + // NOTE! Different threads could access this if being performed in parallel. + // So the result is nondeterministic if we write to the same index (because out of range) when running in parallel. + // By just adding one, all indices should be hit once + outputBuffer2[tid + 1] = total; +} \ No newline at end of file diff --git a/tests/compute/bound-check-zero-index.slang.expected.txt b/tests/compute/bound-check-zero-index.slang.expected.txt new file mode 100644 index 000000000..21f89147e --- /dev/null +++ b/tests/compute/bound-check-zero-index.slang.expected.txt @@ -0,0 +1,8 @@ +2226 +333A +444F +5559 +5559 +2226 +333A +444F -- cgit v1.2.3