Small fixes for CUDA code emit (#1564)

* Small fixes for CUDA code emit * Add a CUDA translation to `GroupMemoryBarrierWithWaveSync()`. We map this to `__syncwarp()` for CUDA (with no mask, implying a full-warp sync). * Consistently use `SLANG_PRELUDE_ASSERT` for assertions introduced in code emit (rather than just using the bare `assert(...)` function, which is not included by our CUDA prelude by default) * Add a new `SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT` flag to the CUDA prelude that allows the `count` field to be omitted from `(RW)StructuredBuffer<T>`. This is a bit of a hacky because the computed layouts will still assume the `count` field is present, but this feature is required by at least one client application for now. A better long-term fix will take more time to design and implement. * fixup: CUDA prelude code fix for pedantic compilers Co-authored-by: Tim Foley <tim.foley.is@gmail.com> Co-authored-by: Yong He <yonghe@outlook.com>
author: Tim Foley <tfoleyNV@users.noreply.github.com> 2020-10-05 11:10:53 -0700
committer: GitHub <noreply@github.com> 2020-10-05 11:10:53 -0700
commit: 41d8610653cacfb763e3e1a1c538e17037703108 (patch)
tree: 4f9b0c9c7e46a47feac40ecfa6b6b3a9cb027921
parent: d930c65e7fef6414af363e1f8d4fff52beb448af (diff)
3 files changed, 35 insertions, 14 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 9b485dbe5..aebcffc10 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -339,27 +339,47 @@ SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v)
 // Missing  Load(_In_  int  Location, _Out_ uint Status);
 
 template <typename T>
-struct RWStructuredBuffer
+struct StructuredBuffer
 {
-    SLANG_CUDA_CALL T& operator[](size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
-    SLANG_CUDA_CALL const T& Load(size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }  
+    SLANG_CUDA_CALL const T& operator[](size_t index) const
+    {
+#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
+        SLANG_CUDA_BOUND_CHECK(index, count);
+#endif
+        return data[index];
+    }
+
+    SLANG_CUDA_CALL const T& Load(size_t index) const
+    {
+#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
+        SLANG_CUDA_BOUND_CHECK(index, count);
+#endif
+        return data[index];
+    }
+
+#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
     SLANG_CUDA_CALL void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); }
-  
+#endif
+
     T* data;
+#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
     size_t count;
+#endif
 };
 
 template <typename T>
-struct StructuredBuffer
+struct RWStructuredBuffer : StructuredBuffer<T>
 {
-    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
-    SLANG_CUDA_CALL const T& Load(size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
-    SLANG_CUDA_CALL void GetDimensions(uint32_t* outNumStructs, uint32_t* outStride) { *outNumStructs = uint32_t(count); *outStride = uint32_t(sizeof(T)); }
-    
-    T* data;
-    size_t count;
+    SLANG_CUDA_CALL T& operator[](size_t index) const
+    {
+#ifndef SLANG_CUDA_STRUCTURED_BUFFER_NO_COUNT
+        SLANG_CUDA_BOUND_CHECK(index, this->count);
+#endif
+        return this->data[index];
+    }
 };
 
+
     
 // Missing  Load(_In_  int  Location, _Out_ uint Status);
 struct ByteAddressBuffer
@@ -1205,7 +1225,7 @@ __inline__ __device__ uint4 _waveMatchMultiple(WarpMask mask, const T& inVal)
 
 __device__ uint getAt(dim3 a,  int b)
 {
-    assert(b >= 0 && b < 3);
+    SLANG_PRELUDE_ASSERT(b >= 0 && b < 3);
     return (&a.x)[b];
 }
 __device__ uint3 operator*(uint3 a, dim3 b)
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 7c5ca0027..e677b9020 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2987,6 +2987,7 @@ __glsl_extension(GL_KHR_shader_subgroup_basic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupBarrier()")
 __target_intrinsic(hlsl, "GroupMemoryBarrier()")
+__target_intrinsic(cuda, "__syncwarp()")
 void GroupMemoryBarrierWithWaveSync();
 
 // NOTE! WaveMaskBroadcastLaneAt is *NOT* standard HLSL
diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp
index 5359740b4..1e2482bb5 100644
--- a/source/slang/slang-emit-cpp.cpp
+++ b/source/slang/slang-emit-cpp.cpp
@@ -984,7 +984,7 @@ void CPPSourceEmitter::_emitGetAtDefinition(const UnownedStringSlice& funcName,
         {
             int vecSize = int(getIntVal(vectorType->getElementCount()));
 
-            writer->emit("assert(b >= 0 && b < ");
+            writer->emit("SLANG_PRELUDE_ASSERT(b >= 0 && b < ");
             writer->emit(vecSize);
             writer->emit(");\n");
             if (lValue)
@@ -997,7 +997,7 @@ void CPPSourceEmitter::_emitGetAtDefinition(const UnownedStringSlice& funcName,
             //int colCount = int(getIntVal(matrixType->getColumnCount()));
             int rowCount = int(getIntVal(matrixType->getRowCount()));
 
-            writer->emit("assert(b >= 0 && b < ");
+            writer->emit("SLANG_PRELUDE_ASSERT(b >= 0 && b < ");
             writer->emit(rowCount);
             writer->emit(");\n");
author	Tim Foley <tfoleyNV@users.noreply.github.com>	2020-10-05 11:10:53 -0700
committer	GitHub <noreply@github.com>	2020-10-05 11:10:53 -0700
commit	41d8610653cacfb763e3e1a1c538e17037703108 (patch)
tree	4f9b0c9c7e46a47feac40ecfa6b6b3a9cb027921
parent	d930c65e7fef6414af363e1f8d4fff52beb448af (diff)