Improved bounds checking for C++/CUDA (#2263)

* #include an absolute path didn't work - because paths were taken to always be relative.
* Use TerminatedUnownedStringSlice for literals in output C++.
* Remove Escape/Unescape functions used in slang-token-reader.cpp. Add target type of 'host-cpp' etc to map to the target types.
* Fix some corner cases around string encoding.
* Added unit test for string escaping. Fixed some assorted escaping bugs.
* Updated test output.
* Added decode test.
* Stop using hex output, to get around 'greedy' aspect. Use octal instead.
* Added HostHostCallable. Small changes to use ArtifactDesc/Info instead of large switches.
* Fix C++ emit to handle arbitrary function export.
* Add options handling for callable without an output being specified.
* Can compile with COM interface. Added example using com interface.
* Use the IR Ptr type instead of hack in C++ emit for interfaces.
* Fix issue with outputting the COM call when ptr is used.
* Fix crash issue on compilation failure.
* Add support for __global.
* Added `ActualGlobalRate`. Added special handling around globals and COM interfaces. Tested out in cpu-com-example.
* Fix typo in NodeBase.
* Support for accessing globals by name working.
* Bounds checking for C++. Improved bounds checks for CUDA.
* Check that actual global initialization is working.
* Fix typo.
* Refactor the com replacement such that it doesn't need a cache or do anything special with GlobalVar.
* Fix typo in CUDA prelude.
* Remove context. Only create replacement if needed.
* Split out COM host-callable into a unit-test.
* host-callable com testing on C++ and llvm.
* Comment around the COM ptr replacement.
* WIP Zero bound test.
* Disable com test on vs 32 bit. Fix C++ prelude.
* Disable 32 bit targets testing com host-callable.
* For now disable zero index test.
* Enable bounds checking for CPU/CUDA.
* Small fixes. Disable CUDA zero index bound fix.
* Add test result for bound check.
* Work around for index wrapping issue.
* Added Fixed array test.
* Only enable prelude asserts via SLANG_PRELUDE_ENABLE_ASSERT (unless defined by the user)
author: jsmall-nvidia <jsmall@nvidia.com> 2022-06-08 19:51:49 -0400
committer: GitHub <noreply@github.com> 2022-06-08 19:51:49 -0400
commit: 4db6bd3cd6da1871fdac520c280bd9f933e48489 (patch)
tree: e4e1bf347a1ceac708ce598af7d4ca4bab71e013
parent: 1146920bc9ed9bef2b5bb91b3cdec4700eb09881 (diff)
9 files changed, 242 insertions, 109 deletions
diff --git a/prelude/slang-cpp-prelude.h b/prelude/slang-cpp-prelude.h
index ff6bb8f6f..0381a7bb6 100644
--- a/prelude/slang-cpp-prelude.h
+++ b/prelude/slang-cpp-prelude.h
@@ -194,7 +194,6 @@ Any platforms not detected by the above logic are now now explicitly zeroed out.
 #    endif
 #endif /* SLANG_PLATFORM */
 
-
 /* Shorthands for "families" of compilers/platforms */
 #define SLANG_GCC_FAMILY (SLANG_CLANG || SLANG_SNC || SLANG_GHS || SLANG_GCC)
 #define SLANG_WINDOWS_FAMILY (SLANG_WINRT || SLANG_WIN32 || SLANG_WIN64)
@@ -249,8 +248,12 @@ convention for interface methods.
 #   define SLANG_MCALL SLANG_STDCALL
 #endif
 
+#ifndef SLANG_FORCE_INLINE
+#    define SLANG_FORCE_INLINE inline
+#endif
 
-
+// TODO(JS): Should these be in slang-cpp-types.h? 
+// They are more likely to clash with slang.h
 
 struct SlangUUID
 {
@@ -271,6 +274,8 @@ struct ISlangUnknown
 
 #endif // SLANG_H
 
+// Includes
+
 #include "slang-cpp-types.h"
 #include "slang-cpp-scalar-intrinsics.h"
 
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index 64db2efb3..7aef25650 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -2,15 +2,48 @@
 #define SLANG_PRELUDE_CPP_TYPES_H
 
 #ifndef SLANG_PRELUDE_ASSERT
-#   ifdef _DEBUG
+#   ifdef SLANG_PRELUDE_ENABLE_ASSERT
 #       define SLANG_PRELUDE_ASSERT(VALUE) assert(VALUE)
 #   else
 #       define SLANG_PRELUDE_ASSERT(VALUE) 
 #   endif
 #endif
 
-#ifndef SLANG_FORCE_INLINE
-#    define SLANG_FORCE_INLINE inline
+// Since we are using unsigned arithmetic, care is needed in this comparison.
+// It is *assumed* that sizeInBytes >= elemSize, which means (sizeInBytes - elemSize) >= 0,
+// so only a single test is needed.
+
+// Asserts for bounds checking.
+// It is assumed index/count are unsigned types.
+#define SLANG_BOUND_ASSERT(index, count)  SLANG_PRELUDE_ASSERT(index < count); 
+#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0);
+
+// Macros to zero index if an access is out of range
+#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; 
+#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) index = (index <= (sizeInBytes - elemSize)) ? index : 0; 
+
+// The 'FIX' macros define how the index is fixed. The default is to do nothing. If SLANG_ENABLE_BOUND_ZERO_INDEX
+// is defined, the fix macros will zero the index if it is out of range.
+#ifdef  SLANG_ENABLE_BOUND_ZERO_INDEX
+#   define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
+#   define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#   define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
+#else
+#   define SLANG_BOUND_FIX(index, count) 
+#   define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) 
+#   define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) 
+#endif
+
+#ifndef SLANG_BOUND_CHECK
+#   define SLANG_BOUND_CHECK(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count)
+#endif
+
+#ifndef SLANG_BOUND_CHECK_BYTE_ADDRESS
+#   define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#endif
+
+#ifndef SLANG_BOUND_CHECK_FIXED_ARRAY
+#   define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count)
 #endif
 
 #ifdef SLANG_PRELUDE_NAMESPACE
@@ -25,8 +58,8 @@ struct TypeInfo
 template <typename T, size_t SIZE>
 struct FixedArray
 {
-    const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; }
-    T& operator[](size_t index) { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; }
+    const T& operator[](size_t index) const { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }
+    T& operator[](size_t index) { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }
 
     T m_data[SIZE];
 };
@@ -36,8 +69,8 @@ struct FixedArray
 template <typename T>
 struct Array
 {
-    const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }
-    T& operator[](size_t index) { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }
+    const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
+    T& operator[](size_t index) { SLANG_BOUND_CHECK(index, count); return data[index]; }
 
     T* data;
     size_t count;
@@ -126,8 +159,8 @@ typedef size_t NonUniformResourceIndex;
 template <typename T>
 struct RWStructuredBuffer
 {
-    SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }
-    const T& Load(size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }  
+    SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
+    const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }  
     void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); }
   
     T* data;
@@ -137,8 +170,8 @@ struct RWStructuredBuffer
 template <typename T>
 struct StructuredBuffer
 {
-    SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }
-    const T& Load(size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }
+    SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
+    const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
     void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); }
     
     T* data;
@@ -149,8 +182,8 @@ struct StructuredBuffer
 template <typename T>
 struct RWBuffer
 {
-    SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }
-    const T& Load(size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }
+    SLANG_FORCE_INLINE T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
+    const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
     void GetDimensions(uint32_t* outCount) { *outCount = uint32_t(count); }
     
     T* data;
@@ -160,8 +193,8 @@ struct RWBuffer
 template <typename T>
 struct Buffer
 {
-    SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }
-    const T& Load(size_t index) const { SLANG_PRELUDE_ASSERT(index < count); return data[index]; }
+    SLANG_FORCE_INLINE const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
+    const T& Load(size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
     void GetDimensions(uint32_t* outCount) { *outCount = uint32_t(count); }
     
     T* data;
@@ -174,32 +207,32 @@ struct ByteAddressBuffer
     void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); }
     uint32_t Load(size_t index) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 4 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
         return data[index >> 2]; 
     }
     uint2 Load2(size_t index) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 8 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint2{data[dataIdx], data[dataIdx + 1]}; 
     }
     uint3 Load3(size_t index) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 12 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
     }
     uint4 Load4(size_t index) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 16 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
     }
     template<typename T>
-    T Load(size_t offset) const
+    T Load(size_t index) const
     {
-        SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); 
-        return *(T const*)((char*)data + offset);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
+        return *(const T*)(((const char*)data) + index);
     }
     
     const uint32_t* data;
@@ -215,49 +248,49 @@ struct RWByteAddressBuffer
     
     uint32_t Load(size_t index) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 4 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
         return data[index >> 2]; 
     }
     uint2 Load2(size_t index) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 8 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint2{data[dataIdx], data[dataIdx + 1]}; 
     }
     uint3 Load3(size_t index) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 12 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
     }
     uint4 Load4(size_t index) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 16 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
     }
     template<typename T>
-    T Load(size_t offset) const
+    T Load(size_t index) const
     {
-        SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); 
-        return *(T const*)((char*)data + offset);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
+        return *(const T*)(((const char*)data) + index);
     }
 
     void Store(size_t index, uint32_t v) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 4 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
         data[index >> 2] = v; 
     }
     void Store2(size_t index, uint2 v) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 8 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
     }
     void Store3(size_t index, uint3 v) const 
-    { 
-        SLANG_PRELUDE_ASSERT(index + 12 <= sizeInBytes && (index & 3) == 0); 
+    {  
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
@@ -265,7 +298,7 @@ struct RWByteAddressBuffer
     }
     void Store4(size_t index, uint4 v) const 
     { 
-        SLANG_PRELUDE_ASSERT(index + 16 <= sizeInBytes && (index & 3) == 0); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
@@ -273,10 +306,10 @@ struct RWByteAddressBuffer
         data[dataIdx + 3] = v.w;
     }
     template<typename T>
-    void Store(size_t offset, T const& value) const
+    void Store(size_t index, T const& value) const
     {
-        SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); 
-        *(T*)((char*)data + offset) = value;
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
+        *(T*)(((char*)data) + index) = value;
     }
 
     uint32_t* data;
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 01c658e0b..448b69c63 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -46,24 +46,44 @@
 #define SLANG_FORCE_INLINE inline
 #define SLANG_INLINE inline
 
-// Bound checks. Can be replaced by defining before including header. 
-// NOTE! 
-// The default behaviour, if out of bounds is to index 0. This is of course quite wrong - and different 
-// behavior to hlsl typically. The problem here though is more around a write reference. That unless 
-// some kind of proxy is used it is hard and/or slow to emulate the typical GPU behavior.
-
-#ifndef SLANG_CUDA_BOUND_CHECK
-#   define SLANG_CUDA_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); index = (index < count) ? index : 0; 
+
+// Since we are using unsigned arithmetic, care is needed in this comparison.
+// It is *assumed* that sizeInBytes >= elemSize, which means (sizeInBytes - elemSize) >= 0,
+// so only a single test is needed.
+
+// Asserts for bounds checking.
+// It is assumed index/count are unsigned types.
+#define SLANG_BOUND_ASSERT(index, count)  SLANG_PRELUDE_ASSERT(index < count); 
+#define SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_PRELUDE_ASSERT(index <= (sizeInBytes - elemSize) && (index & 3) == 0);
+
+// Macros to zero index if an access is out of range
+#define SLANG_BOUND_ZERO_INDEX(index, count) index = (index < count) ? index : 0; 
+#define SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes) index = (index <= (sizeInBytes - elemSize)) ? index : 0; 
+
+// The 'FIX' macros define how the index is fixed. The default is to do nothing. If SLANG_ENABLE_BOUND_ZERO_INDEX
+// is defined, the fix macros will zero the index if it is out of range.
+#ifdef  SLANG_ENABLE_BOUND_ZERO_INDEX
+#   define SLANG_BOUND_FIX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
+#   define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ZERO_INDEX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#   define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) SLANG_BOUND_ZERO_INDEX(index, count) SLANG_BOUND_ZERO_INDEX(index, count)
+#else
+#   define SLANG_BOUND_FIX(index, count) 
+#   define SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes) 
+#   define SLANG_BOUND_FIX_FIXED_ARRAY(index, count) 
+#endif
+
+#ifndef SLANG_BOUND_CHECK
+#   define SLANG_BOUND_CHECK(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX(index, count)
 #endif
 
-#ifndef SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK
-#   define SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, size, count) SLANG_PRELUDE_ASSERT(index + 4 <= sizeInBytes && (index & 3) == 0); index = (index + 4 <= sizeInBytes) ? index : 0; 
-#endif    
+#ifndef SLANG_BOUND_CHECK_BYTE_ADDRESS
+#   define SLANG_BOUND_CHECK_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_ASSERT_BYTE_ADDRESS(index, elemSize, sizeInBytes) SLANG_BOUND_FIX_BYTE_ADDRESS(index, elemSize, sizeInBytes)
+#endif
 
-// Here we don't have the index zeroing behavior, as such bounds checks are generally not on GPU targets either. 
-#ifndef SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK
-#   define SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); 
+#ifndef SLANG_BOUND_CHECK_FIXED_ARRAY
+#   define SLANG_BOUND_CHECK_FIXED_ARRAY(index, count) SLANG_BOUND_ASSERT(index, count) SLANG_BOUND_FIX_FIXED_ARRAY(index, count)
 #endif
+
  // This macro handles how out-of-range surface coordinates are handled; 
  // I can equal
  // cudaBoundaryModeClamp, in which case out-of-range coordinates are clamped to the valid range
@@ -91,8 +111,8 @@ struct TypeInfo
 template <typename T, size_t SIZE>
 struct FixedArray
 {
-    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, SIZE); return m_data[index]; }
-    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, SIZE); return m_data[index]; }
+    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }
+    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK_FIXED_ARRAY(index, SIZE); return m_data[index]; }
     
     T m_data[SIZE];
 };
@@ -102,8 +122,8 @@ struct FixedArray
 template <typename T>
 struct Array
 {
-    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
-    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
+    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_BOUND_CHECK(index, count); return data[index]; }
+    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_BOUND_CHECK(index, count); return data[index]; }
     
     T* data;
     size_t count;
@@ -714,7 +734,7 @@ struct StructuredBuffer
     SLANG_CUDA_CALL const T& operator[](size_t index) const
     {
 #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
-        SLANG_CUDA_BOUND_CHECK(index, count);
+        SLANG_BOUND_CHECK(index, count);
 #endif
         return data[index];
     }
@@ -722,7 +742,7 @@ struct StructuredBuffer
     SLANG_CUDA_CALL const T& Load(size_t index) const
     {
 #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
-        SLANG_CUDA_BOUND_CHECK(index, count);
+        SLANG_BOUND_CHECK(index, count);
 #endif
         return data[index];
     }
@@ -743,46 +763,44 @@ struct RWStructuredBuffer : StructuredBuffer<T>
     SLANG_CUDA_CALL T& operator[](size_t index) const
     {
 #ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
-        SLANG_CUDA_BOUND_CHECK(index, this->count);
+        SLANG_BOUND_CHECK(index, this->count);
 #endif
         return this->data[index];
     }
 };
 
-
-    
 // Missing  Load(_In_  int  Location, _Out_ uint Status);
 struct ByteAddressBuffer
 {
     SLANG_CUDA_CALL void GetDimensions(uint32_t* outDim) const { *outDim = uint32_t(sizeInBytes); }
     SLANG_CUDA_CALL uint32_t Load(size_t index) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 4, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
         return data[index >> 2]; 
     }
     SLANG_CUDA_CALL uint2 Load2(size_t index) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 8, sizeInBytes); 
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes); 
         const size_t dataIdx = index >> 2; 
         return uint2{data[dataIdx], data[dataIdx + 1]}; 
     }
     SLANG_CUDA_CALL uint3 Load3(size_t index) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 12, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
     }
     SLANG_CUDA_CALL uint4 Load4(size_t index) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 16, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
     }
     template<typename T>
-    SLANG_CUDA_CALL T Load(size_t offset) const
+    SLANG_CUDA_CALL T Load(size_t index) const
     {
-        SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); 
-        return *(T const*)((char*)data + offset);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
+        return *(const T*)(((const char*)data) + index);
     }
     
     const uint32_t* data;
@@ -798,49 +816,49 @@ struct RWByteAddressBuffer
     
     SLANG_CUDA_CALL uint32_t Load(size_t index) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 4, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
         return data[index >> 2]; 
     }
     SLANG_CUDA_CALL uint2 Load2(size_t index) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 8, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint2{data[dataIdx], data[dataIdx + 1]}; 
     }
     SLANG_CUDA_CALL uint3 Load3(size_t index) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 12, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
     }
     SLANG_CUDA_CALL uint4 Load4(size_t index) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 16, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
     }
     template<typename T>
-    SLANG_CUDA_CALL T Load(size_t offset) const
+    SLANG_CUDA_CALL T Load(size_t index) const
     {
-        SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); 
-        return *(T const*)((char*)data + offset);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
+        return *(const T*)((const char*)data + index);
     }
     
     SLANG_CUDA_CALL void Store(size_t index, uint32_t v) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 4, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 4, sizeInBytes);
         data[index >> 2] = v; 
     }
     SLANG_CUDA_CALL void Store2(size_t index, uint2 v) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 8, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 8, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
     }
     SLANG_CUDA_CALL void Store3(size_t index, uint3 v) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 12, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 12, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
@@ -848,7 +866,7 @@ struct RWByteAddressBuffer
     }
     SLANG_CUDA_CALL void Store4(size_t index, uint4 v) const 
     { 
-        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 16, sizeInBytes);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, 16, sizeInBytes);
         const size_t dataIdx = index >> 2; 
         data[dataIdx + 0] = v.x;
         data[dataIdx + 1] = v.y;
@@ -856,17 +874,18 @@ struct RWByteAddressBuffer
         data[dataIdx + 3] = v.w;
     }
     template<typename T>
-    SLANG_CUDA_CALL void Store(size_t offset, T const& value) const
+    SLANG_CUDA_CALL void Store(size_t index, T const& value) const
     {
-        SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); 
-        *(T*)((char*)data + offset) = value;
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
+        *(T*)(((char*)data) + index) = value;
     }
     
         /// Can be used in stdlib to gain access
-    SLANG_CUDA_CALL uint* _getPtrAt(size_t offset)
+    template <typename T>
+    SLANG_CUDA_CALL T* _getPtrAt(size_t index)
     {
-        SLANG_PRELUDE_ASSERT(offset + sizeof(T) <= sizeInBytes && (offset & (alignof(T)-1)) == 0); 
-        return (uint*)(((char*)data) + offset);
+        SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
+        return (T*)(((char*)data) + index);
     }
     
     uint32_t* data;
diff --git a/prelude/slang-llvm.h b/prelude/slang-llvm.h
index 08d6a74dd..b41380581 100644
--- a/prelude/slang-llvm.h
+++ b/prelude/slang-llvm.h
@@ -7,14 +7,14 @@
 #define SLANG_DISABLE_EXCEPTIONS 1
 
 #ifndef SLANG_PRELUDE_ASSERT
-#   ifdef DEBUG
+#   ifdef SLANG_PRELUDE_ENABLE_ASSERT
 extern "C" void assertFailure(const char* msg);
 #       define SLANG_PRELUDE_EXPECT(VALUE, MSG) if(VALUE) {} else assertFailure("assertion failed: '" MSG "'")
 #       define SLANG_PRELUDE_ASSERT(VALUE) SLANG_PRELUDE_EXPECT(VALUE, #VALUE)
-#   else // DEBUG
-
+#   else // SLANG_PRELUDE_ENABLE_ASSERT
+#       define SLANG_PRELUDE_EXPECT(VALUE, MSG)
 #       define SLANG_PRELUDE_ASSERT(x) 
-#   endif // DEBUG
+#   endif // SLANG_PRELUDE_ENABLE_ASSERT
 #endif
 
 /*
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 7d107888a..b2f6fa06b 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -331,7 +331,7 @@ ${{{{
 
     __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
     __cuda_sm_version(2.0)
-    __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))")
+    __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))")
     [__requiresNVAPI]
     void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue);
 
@@ -347,7 +347,7 @@ ${{{{
     __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))")
     [__requiresNVAPI]
     __cuda_sm_version(2.0)
-    __target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<float>($1), $2)")
     void InterlockedAddF32(uint byteAddress, float valueToAdd);
 
     __specialized_for_target(glsl)
@@ -359,7 +359,7 @@ ${{{{
 
     // Int64 Add
     __cuda_sm_version(6.0)
-    __target_intrinsic(cuda, "(*$3 = atomicAdd((uint64_t*)$0._getPtrAt($1), $2))")
+    __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<uint64_t>($1), $2))")
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue);
 
     __specialized_for_target(hlsl)
@@ -377,7 +377,7 @@ ${{{{
 
     // Without returning original value
     __cuda_sm_version(6.0)
-    __target_intrinsic(cuda, "atomicAdd((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<uint64_t>($1), $2)")
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd);
 
     __specialized_for_target(hlsl)
@@ -395,7 +395,7 @@ ${{{{
 
     // Cas uint64_t
 
-    __target_intrinsic(cuda, "(*$4 = atomicCAS((uint64_t*)$0._getPtrAt($1), $2, $3))")
+    __target_intrinsic(cuda, "(*$4 = atomicCAS($0._getPtrAt<uint64_t>($1), $2, $3))")
     void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue);
 
     __specialized_for_target(hlsl)
@@ -414,7 +414,7 @@ ${{{{
     // Max
 
     __cuda_sm_version(3.5)
-    __target_intrinsic(cuda, "atomicMax((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicMax($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -430,7 +430,7 @@ ${{{{
     // Min
     
     __cuda_sm_version(3.5)
-    __target_intrinsic(cuda, "atomicMin((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicMin($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedMinU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -445,7 +445,7 @@ ${{{{
 
     // And
 
-    __target_intrinsic(cuda, "atomicAnd((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicAnd($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedAndU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -460,7 +460,7 @@ ${{{{
 
     // Or
 
-    __target_intrinsic(cuda, "atomicOr((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicOr($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedOrU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -475,7 +475,7 @@ ${{{{
 
     // Xor
 
-    __target_intrinsic(cuda, "atomicXor((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicXor($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedXorU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
@@ -490,7 +490,7 @@ ${{{{
 
     // Exchange
 
-    __target_intrinsic(cuda, "atomicExch((uint64_t*)$0._getPtrAt($1), $2)")
+    __target_intrinsic(cuda, "atomicExch($0._getPtrAt<uint64_t>($1), $2)")
     uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value);
 
     __specialized_for_target(hlsl)
diff --git a/source/slang/slang-compiler.cpp b/source/slang/slang-compiler.cpp
index bcf857fc8..d4c5812b7 100644
--- a/source/slang/slang-compiler.cpp
+++ b/source/slang/slang-compiler.cpp
@@ -1103,14 +1103,7 @@ void printDiagnosticArg(StringBuilder& sb, CodeGenTarget val)
             {
                 preprocessorDefinitions.Add(define.Key, define.Value);
             }
-            {
-                auto linkage = getLinkage();
-                for (auto& define : linkage->preprocessorDefinitions)
-                {
-                    preprocessorDefinitions.Add(define.Key, define.Value);
-                }
-            }
-
+            
             {
                 /* TODO(JS): Not totally clear what options should be set here. If we are using the pass through - then using say the defines/includes
                 all makes total sense. If we are generating C++ code from slang, then should we really be using these values -> aren't they what is
@@ -1168,6 +1161,22 @@ void printDiagnosticArg(StringBuilder& sb, CodeGenTarget val)
             sourceLanguage = (SourceLanguage)TypeConvertUtil::getSourceLanguageFromTarget((SlangCompileTarget)sourceTarget);
         }
 
+        // Add any preprocessor definitions associated with the linkage
+        {
+            // TODO(JS): This is somewhat arguable - should defines passed to Slang really be
+            // passed to downstream compilers? It does appear consistent with the behavior if 
+            // there is an endToEndReq.
+            // 
+            // That said, it's very convenient and provides a way to control aspects
+            // of downstream compilation.
+            
+            auto linkage = getLinkage();
+            for (auto& define : linkage->preprocessorDefinitions)
+            {
+                preprocessorDefinitions.Add(define.Key, define.Value);
+            }
+        }
+
         // If we have an extension tracker, we may need to set options such as SPIR-V version
         // and CUDA Shader Model.
         if (extensionTracker)
diff --git a/source/slang/slang-lower-to-ir.cpp b/source/slang/slang-lower-to-ir.cpp
index d175b69dd..86edf9282 100644
--- a/source/slang/slang-lower-to-ir.cpp
+++ b/source/slang/slang-lower-to-ir.cpp
@@ -6128,6 +6128,9 @@ struct DeclLoweringVisitor : DeclVisitor<DeclLoweringVisitor, LoweredValInfo>
 
         auto builder = getBuilder();
 
+        // TODO(JS): Do we create something derived from IRGlobalVar? Or do we use 
+        // a decoration to identify an *actual* global?
+
         IRGlobalValueWithCode* irGlobal = builder->createGlobalVar(varType);
         LoweredValInfo globalVal = LoweredValInfo::ptr(irGlobal);
 
diff --git a/tests/compute/bound-check-zero-index.slang b/tests/compute/bound-check-zero-index.slang
new file mode 100644
index 000000000..e8244886e
--- /dev/null
+++ b/tests/compute/bound-check-zero-index.slang
@@ -0,0 +1,56 @@
+// bound-check-zero-index.slang
+
+// Check 'zero indexing' bound check feature, supported by CPU and CUDA
+
+// Currently zero index bound checking doesn't appear to be working properly for CUDA.
+//TEST(compute):COMPARE_COMPUTE:-cuda -shaderobj -Xslang... -DSLANG_ENABLE_BOUND_ZERO_INDEX -X.
+//TEST(compute):COMPARE_COMPUTE:-cpu -shaderobj -Xslang... -DSLANG_ENABLE_BOUND_ZERO_INDEX -X.
+
+//TEST_INPUT:ubuffer(data=[1 2 3 4]):name=byteAddressBuffer
+ByteAddressBuffer byteAddressBuffer;
+
+//TEST_INPUT:ubuffer(data=[0x10 0x20 0x30 0x40]):name=rwByteAddressBuffer
+RWByteAddressBuffer rwByteAddressBuffer;
+
+//TEST_INPUT:ubuffer(data=[0x100 0x200 0x300 0x400], stride=4):name=structuredBuffer
+StructuredBuffer<int> structuredBuffer;
+
+//TEST_INPUT:ubuffer(data=[0x1000 0x2000 0x3000 0x4000], stride=4):name=rwStructuredBuffer
+RWStructuredBuffer<int> rwStructuredBuffer;
+
+//TEST_INPUT:ubuffer(data=[-1 -1 -1 -1], stride=4):out,name=outputBuffer
+RWStructuredBuffer<int> outputBuffer;
+
+//TEST_INPUT:ubuffer(data=[-1 -1 -1 -1], stride=4):out,name=outputBuffer2
+RWStructuredBuffer<int> outputBuffer2;
+
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+	int tid = dispatchThreadID.x;
+    
+    int fixedArray[3] = { 2, 5, 9};
+    
+    int total = 0;
+    total += byteAddressBuffer.Load<int>(tid * 4);
+    total += byteAddressBuffer.Load<int>(-tid * 4);
+    
+    total += rwByteAddressBuffer.Load<int>(tid * 4);
+    total += rwByteAddressBuffer.Load<int>(-tid * 4);
+    
+    total += structuredBuffer[tid];
+    total += structuredBuffer[-tid];
+    
+    total += rwStructuredBuffer[tid];
+    total += rwStructuredBuffer[-tid];
+    
+    total += fixedArray[tid];
+    total += fixedArray[-tid];
+    
+    outputBuffer[tid] = total;
+    
+    // NOTE! Different threads could access the same element if this is performed in parallel.
+    // The result is nondeterministic if we write to the same index (because out of range) when running in parallel.
+    // By just adding one, all indices should be hit once.
+    outputBuffer2[tid + 1] = total;
+}
+\ No newline at end of file
diff --git a/tests/compute/bound-check-zero-index.slang.expected.txt b/tests/compute/bound-check-zero-index.slang.expected.txt
new file mode 100644
index 000000000..21f89147e
--- /dev/null
+++ b/tests/compute/bound-check-zero-index.slang.expected.txt
@@ -0,0 +1,8 @@
+2226
+333A
+444F
+5559
+5559
+2226
+333A
+444F
author	jsmall-nvidia <jsmall@nvidia.com>	2022-06-08 19:51:49 -0400
committer	GitHub <noreply@github.com>	2022-06-08 19:51:49 -0400
commit	4db6bd3cd6da1871fdac520c280bd9f933e48489 (patch)
tree	e4e1bf347a1ceac708ce598af7d4ca4bab71e013
parent	1146920bc9ed9bef2b5bb91b3cdec4700eb09881 (diff)