CUDA implement StructuredBuffer/ByteAddressBuffer as pointer/count as is on CPU. (#1182)

Allow bounds check to zero index. Update docs.
author: jsmall-nvidia <jsmall@nvidia.com> 2020-01-27 15:04:29 -0500
committer: GitHub <noreply@github.com> 2020-01-27 15:04:29 -0500
commit: a9e1beeb003644f4034b9485ad00e273ad52c9f1 (patch)
tree: b93ef4d3e3c972798f6a76a4bdd0d6d4c369924c
parent: d98a2b75c9b4a31de0ebfb1084a68b5be5ede17d (diff)
7 files changed, 241 insertions, 51 deletions
diff --git a/docs/cuda-target.md b/docs/cuda-target.md
index 41fc98790..01803c145 100644
--- a/docs/cuda-target.md
+++ b/docs/cuda-target.md
@@ -69,7 +69,7 @@ struct UniformState
 {
     CUtexObject tex;                // This is the combination of a texture and a sampler(!)
     SamplerState sampler;           // This variable exists within the layout, but it's value is not used.
-    int32_t* outputBuffer;          // Currently Structured buffers are converted to pointers - this will likely change in the future (for bounds checking and other reasons)
+    RWStructuredBuffer<int32_t> outputBuffer;    // This is implemented as a template in the CUDA prelude. It's just a pointer, and a size
     Thing* thing3;                  // Constant buffers map to pointers
 };   
 
@@ -81,6 +81,20 @@ With CUDA - the caller specifies how threading is broken up, so `[numthreads]` i
 
 The UniformState and UniformEntryPointParams struct typically vary by shader. UniformState holds 'normal' bindings, whereas UniformEntryPointParams hold the uniform entry point parameters. Where specific bindings or parameters are located can be determined by reflection. The structures for the example above would be something like the following... 
 
+`StructuredBuffer<T>`,`RWStructuredBuffer<T>` become
+
+```
+    T* data;
+    size_t count;
+```    
+
+`ByteAddressBuffer`, `RWByteAddressBuffer` become 
+
+```
+    uint32_t* data;
+    size_t sizeInBytes;
+```  
+
 ## Unsized arrays
 
 WIP: Not implemented yet.
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 28e423b31..63388c7f3 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -10,11 +10,30 @@
 #define SLANG_FORCE_INLINE inline
 #define SLANG_INLINE inline
 
+// Bound checks. Can be replaced by defining before including header. 
+// NOTE! 
+// The default behaviour, if out of bounds is to index 0. This is of course quite wrong - and different 
+// behavior to hlsl typically. The problem here though is more around a write reference. That unless 
+// some kind of proxy is used it is hard and/or slow to emulate the typical GPU behavior.
+
+#ifndef SLANG_CUDA_BOUND_CHECK
+#   define SLANG_CUDA_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); index = (index < count) ? index : 0; 
+#endif
+
+#ifndef SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK
+#   define SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, size, count) SLANG_PRELUDE_ASSERT(index + 4 <= sizeInBytes && (index & 3) == 0); index = (index + 4 <= sizeInBytes) ? index : 0; 
+#endif    
+
+// Here we don't have the index zeroing behavior, as such bounds checks are generally not on GPU targets either. 
+#ifndef SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK
+#   define SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, count) SLANG_PRELUDE_ASSERT(index < count); 
+#endif
+
 template <typename T, size_t SIZE>
 struct FixedArray
 {
-    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; }
-    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_PRELUDE_ASSERT(index < SIZE); return m_data[index]; }
+    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, SIZE); return m_data[index]; }
+    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_CUDA_FIXED_ARRAY_BOUND_CHECK(index, SIZE); return m_data[index]; }
     
     T m_data[SIZE];
 };
@@ -178,6 +197,131 @@ SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v)
     return __popc(v);
 }
 
+// ----------------------------- ResourceType -----------------------------------------
+
+
+// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-structuredbuffer-getdimensions
+// Missing  Load(_In_  int  Location, _Out_ uint Status);
+
+template <typename T>
+struct RWStructuredBuffer
+{
+    SLANG_CUDA_CALL T& operator[](size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
+    SLANG_CUDA_CALL const T& Load(size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }  
+    SLANG_CUDA_CALL void GetDimensions(uint32_t& outNumStructs, uint32_t& outStride) { outNumStructs = uint32_t(count); outStride = uint32_t(sizeof(T)); }
+  
+    T* data;
+    size_t count;
+};
+
+template <typename T>
+struct StructuredBuffer
+{
+    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
+    SLANG_CUDA_CALL const T& Load(size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
+    SLANG_CUDA_CALL void GetDimensions(uint32_t& outNumStructs, uint32_t& outStride) { outNumStructs = uint32_t(count); outStride = uint32_t(sizeof(T)); }
+    
+    T* data;
+    size_t count;
+};
+
+    
+// Missing  Load(_In_  int  Location, _Out_ uint Status);
+struct ByteAddressBuffer
+{
+    SLANG_CUDA_CALL void GetDimensions(uint32_t& outDim) const { outDim = uint32_t(sizeInBytes); }
+    SLANG_CUDA_CALL uint32_t Load(size_t index) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 4, sizeInBytes);
+        return data[index >> 2]; 
+    }
+    SLANG_CUDA_CALL uint2 Load2(size_t index) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 8, sizeInBytes); 
+        const size_t dataIdx = index >> 2; 
+        return uint2{data[dataIdx], data[dataIdx + 1]}; 
+    }
+    SLANG_CUDA_CALL uint3 Load3(size_t index) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 12, sizeInBytes);
+        const size_t dataIdx = index >> 2; 
+        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
+    }
+    SLANG_CUDA_CALL uint4 Load4(size_t index) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 16, sizeInBytes);
+        const size_t dataIdx = index >> 2; 
+        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
+    }
+    
+    const uint32_t* data;
+    size_t sizeInBytes;  //< Must be multiple of 4
+};
+
+// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-rwbyteaddressbuffer
+// Missing support for Atomic operations 
+// Missing support for Load with status
+struct RWByteAddressBuffer
+{
+    SLANG_CUDA_CALL void GetDimensions(uint32_t& outDim) const { outDim = uint32_t(sizeInBytes); }
+    
+    SLANG_CUDA_CALL uint32_t Load(size_t index) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 4, sizeInBytes);
+        return data[index >> 2]; 
+    }
+    SLANG_CUDA_CALL uint2 Load2(size_t index) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 8, sizeInBytes);
+        const size_t dataIdx = index >> 2; 
+        return uint2{data[dataIdx], data[dataIdx + 1]}; 
+    }
+    SLANG_CUDA_CALL uint3 Load3(size_t index) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 12, sizeInBytes);
+        const size_t dataIdx = index >> 2; 
+        return uint3{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2]}; 
+    }
+    SLANG_CUDA_CALL uint4 Load4(size_t index) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 16, sizeInBytes);
+        const size_t dataIdx = index >> 2; 
+        return uint4{data[dataIdx], data[dataIdx + 1], data[dataIdx + 2], data[dataIdx + 3]}; 
+    }
+    
+    SLANG_CUDA_CALL void Store(size_t index, uint32_t v) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 4, sizeInBytes);
+        data[index >> 2] = v; 
+    }
+    SLANG_CUDA_CALL void Store2(size_t index, uint2 v) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 8, sizeInBytes);
+        const size_t dataIdx = index >> 2; 
+        data[dataIdx + 0] = v.x;
+        data[dataIdx + 1] = v.y;
+    }
+    SLANG_CUDA_CALL void Store3(size_t index, uint3 v) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 12, sizeInBytes);
+        const size_t dataIdx = index >> 2; 
+        data[dataIdx + 0] = v.x;
+        data[dataIdx + 1] = v.y;
+        data[dataIdx + 2] = v.z;
+    }
+    SLANG_CUDA_CALL void Store4(size_t index, uint4 v) const 
+    { 
+        SLANG_CUDA_BYTE_ADDRESS_BOUND_CHECK(index, 16, sizeInBytes);
+        const size_t dataIdx = index >> 2; 
+        data[dataIdx + 0] = v.x;
+        data[dataIdx + 1] = v.y;
+        data[dataIdx + 2] = v.z;
+        data[dataIdx + 3] = v.w;
+    }
+    
+    uint32_t* data;
+    size_t sizeInBytes; //< Must be multiple of 4 
+};
 
 /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
 
diff --git a/source/core/slang-nvrtc-compiler.cpp b/source/core/slang-nvrtc-compiler.cpp
index bc7d1f4f6..6464592a5 100644
--- a/source/core/slang-nvrtc-compiler.cpp
+++ b/source/core/slang-nvrtc-compiler.cpp
@@ -174,6 +174,16 @@ static SlangResult _parseLocation(const UnownedStringSlice& in, DownstreamDiagno
     return SLANG_OK;
 }
 
+static bool _isDriveLetter(char c)
+{
+    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+static bool _hasDriveLetter(const UnownedStringSlice& line)
+{
+    return line.size() > 2 && line[1] == ':' && _isDriveLetter(line[0]);
+}
+
 static SlangResult _parseNVRTCLine(const UnownedStringSlice& line, DownstreamDiagnostic& outDiagnostic)
 {
     typedef DownstreamDiagnostic Diagnostic;
@@ -182,7 +192,17 @@ static SlangResult _parseNVRTCLine(const UnownedStringSlice& line, DownstreamDia
     outDiagnostic.stage = Diagnostic::Stage::Compile;
 
     List<UnownedStringSlice> split;
-    StringUtil::split(line, ':', split);
+    if (_hasDriveLetter(line))
+    {
+        // The drive letter has :, which confuses things, so skip that and then fix up first entry 
+        UnownedStringSlice lineWithoutDrive(line.begin() + 2, line.end());
+        StringUtil::split(lineWithoutDrive, ':', split);
+        split[0] = UnownedStringSlice(line.begin(), split[0].end());
+    }
+    else
+    {
+        StringUtil::split(line, ':', split);
+    }
 
     if (split.getCount() == 3)
     {
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index 26d6eada0..83ad4a0f8 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -254,21 +254,6 @@ SlangResult CUDASourceEmitter::calcTypeName(IRType* type, CodeGenTarget target,
             out << prefix << vecCount;
             return SLANG_OK;
         }
-        case kIROp_HLSLStructuredBufferType:
-        {
-            auto bufferType = as<IRHLSLStructuredBufferType>(type);
-            out << "const ";
-            calcTypeName(bufferType->getElementType(), target, out);
-            out << "* ";
-            return SLANG_OK;
-        }
-        case kIROp_HLSLRWStructuredBufferType:
-        {
-            auto bufferType = as<IRHLSLRWStructuredBufferType>(type);
-            calcTypeName(bufferType->getElementType(), target, out);
-            out << "* ";
-            return SLANG_OK;
-        }
 
 #if 0
         case kIROp_MatrixType:
diff --git a/source/slang/slang-type-layout.cpp b/source/slang/slang-type-layout.cpp
index cf793b52d..2eec26ee6 100644
--- a/source/slang/slang-type-layout.cpp
+++ b/source/slang/slang-type-layout.cpp
@@ -694,29 +694,29 @@ struct CPUObjectLayoutRulesImpl : ObjectLayoutRulesImpl
         {
             case ShaderParameterKind::ConstantBuffer:
                 // It's a pointer to the actual uniform data
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*));
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), SLANG_ALIGN_OF(void*));
 
             case ShaderParameterKind::MutableTexture:
             case ShaderParameterKind::TextureUniformBuffer:
             case ShaderParameterKind::Texture:
                 // It's a pointer to a texture interface 
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*));
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), SLANG_ALIGN_OF(void*));
                 
             case ShaderParameterKind::StructuredBuffer:            
             case ShaderParameterKind::MutableStructuredBuffer:
                 // It's a ptr and a size of the amount of elements
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*) * 2, sizeof(void*));
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*) * 2, SLANG_ALIGN_OF(void*));
 
             case ShaderParameterKind::RawBuffer:
             case ShaderParameterKind::Buffer:
             case ShaderParameterKind::MutableRawBuffer:
             case ShaderParameterKind::MutableBuffer:
                 // It's a pointer and a size in bytes
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*) * 2, sizeof(void*));
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*) * 2, SLANG_ALIGN_OF(void*));
 
             case ShaderParameterKind::SamplerState:
                 // It's a pointer
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*));
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), SLANG_ALIGN_OF(void*));
  
             case ShaderParameterKind::TextureSampler:
             case ShaderParameterKind::MutableTextureSampler:
@@ -756,19 +756,15 @@ struct CUDAObjectLayoutRulesImpl : CPUObjectLayoutRulesImpl
 
             case ShaderParameterKind::StructuredBuffer:
             case ShaderParameterKind::MutableStructuredBuffer:
-                // TODO(JS): We are just storing as a pointer for now
-                // It's a ptr and a size of the amount of elements
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), SLANG_ALIGN_OF(void*));
+                // It's a pointer and a size
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*) * 2, SLANG_ALIGN_OF(void*));
 
             case ShaderParameterKind::RawBuffer:
             case ShaderParameterKind::Buffer:
             case ShaderParameterKind::MutableRawBuffer:
             case ShaderParameterKind::MutableBuffer:
-
-                // TODO(JS): We are storing as a pointer for now
-
                 // It's a pointer and a size in bytes
-                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), SLANG_ALIGN_OF(void*));
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*) * 2, SLANG_ALIGN_OF(void*));
 
             case ShaderParameterKind::SamplerState:
                 // In CUDA it seems that sampler states are combined into texture objects.
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index a21747554..aa82d8d70 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -44,15 +44,15 @@ public:
         }
     }
 
+    static CUDAResource* getCUDAResource(BindSet::Value* value)
+    {
+        return value ? dynamic_cast<CUDAResource*>(value->m_target.Ptr()) : nullptr;
+    }
         /// Helper function to get the cuda memory pointer when given a value
     static void* getCUDAData(BindSet::Value* value)
     {
-        if (value)
-        {
-            auto resource = dynamic_cast<CUDAResource*>(value->m_target.Ptr());
-            return resource ? resource->m_cudaMemory : nullptr;
-        }
-        return nullptr;
+        auto resource = getCUDAResource(value);
+        return resource ? resource->m_cudaMemory : nullptr;
     }
 
     void* m_cudaMemory;
@@ -63,6 +63,7 @@ class CUDATextureResource : public RefObject
 public:
     typedef RefObject Super;
 
+    CUDATextureResource() {}
     CUDATextureResource(CUtexObject cudaTexObj, CUdeviceptr cudaMemory, CUarray cudaArray):
         m_cudaTexObj(cudaTexObj),
         m_cudaMemory(cudaMemory),
@@ -85,16 +86,16 @@ public:
         }
     }
 
-    static CUtexObject getCUDATexObject(BindSet::Value* value)
+    static CUDATextureResource* getCUDATextureResource(BindSet::Value* value)
     {
-        if (value)
-        {
-            auto resource = dynamic_cast<CUDATextureResource*>(value->m_target.Ptr());
-            // It's an assumption here that 0 is okay for null. Seems to work...
-            return resource ? resource->m_cudaTexObj : CUtexObject(0);
-        }
+        return value ? dynamic_cast<CUDATextureResource*>(value->m_target.Ptr()) : nullptr;
+    }
 
-        return CUtexObject(0);
+    static CUtexObject getCUDATexObject(BindSet::Value* value)
+    {
+        auto resource = getCUDATextureResource(value);
+        // It's an assumption here that 0 is okay for null. Seems to work...
+        return resource ? resource->m_cudaTexObj : CUtexObject(0);
     }
 
 protected:
@@ -526,7 +527,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                             case SLANG_TEXTURE_CUBE:
                             case SLANG_TEXTURE_BUFFER:
                             {
-                                // Need a CPU impl for these...
+                                // Need a CUDA impl for these...
                                 // For now we can just leave as target will just be nullptr
                                 break;
                             }
@@ -535,7 +536,6 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                             case SLANG_STRUCTURED_BUFFER:
                             {
                                 // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
-
                                 void* cudaMem = nullptr;
                                 SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes));
                                 value->m_target = new CUDAResource(cudaMem);
@@ -598,12 +598,31 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
 
                         switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK)
                         {
-                            case SLANG_BYTE_ADDRESS_BUFFER:
                             case SLANG_STRUCTURED_BUFFER:
                             {
-                                // TODO(JS): These will need bounds ... 
-                                // For the moment these are just pointers
-                                *location.getUniform<void*>() = CUDAResource::getCUDAData(value);
+                                CUDAComputeUtil::StructuredBuffer buffer = { nullptr, 0 };
+                                auto resource = CUDAResource::getCUDAResource(value);
+                                if (resource)
+                                {
+                                    buffer.data = resource->m_cudaMemory;
+                                    buffer.count = value->m_elementCount;
+                                }
+
+                                location.setUniform(&buffer, sizeof(buffer));
+                                break;
+                            }
+                            case SLANG_BYTE_ADDRESS_BUFFER:
+                            {
+                                CUDAComputeUtil::ByteAddressBuffer buffer = { nullptr, 0 };
+
+                                auto resource = CUDAResource::getCUDAResource(value);
+                                if (resource)
+                                {
+                                    buffer.data = resource->m_cudaMemory;
+                                    buffer.sizeInBytes = value->m_sizeInBytes;
+                                }
+
+                                location.setUniform(&buffer, sizeof(buffer));
                                 break;
                             }
                             case SLANG_TEXTURE_1D:
diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h
index 58ca21716..ea58b6343 100644
--- a/tools/render-test/cuda/cuda-compute-util.h
+++ b/tools/render-test/cuda/cuda-compute-util.h
@@ -10,6 +10,18 @@ namespace renderer_test {
 
 struct CUDAComputeUtil
 {
+        /// NOTE! MUST match up to definitions in the CUDA prelude
+    struct ByteAddressBuffer
+    {
+        void* data;
+        size_t sizeInBytes;
+    };
+    struct StructuredBuffer
+    {
+        void* data;
+        size_t count;
+    };
+
     struct Context
     {
             /// Holds the binding information
author	jsmall-nvidia <jsmall@nvidia.com>	2020-01-27 15:04:29 -0500
committer	GitHub <noreply@github.com>	2020-01-27 15:04:29 -0500
commit	a9e1beeb003644f4034b9485ad00e273ad52c9f1 (patch)
tree	b93ef4d3e3c972798f6a76a4bdd0d6d4c369924c
parent	d98a2b75c9b4a31de0ebfb1084a68b5be5ede17d (diff)