Fixes to get shader-object example working on CUDA (#1708)

The purpose of these changes is to make the `shader-object` example work correctly on CUDA. Originally I had tried to add changes to the "flat" reflection information so that it introduced descriptor ranges to match the binding ranges it added for interface/existential-type fields. This approach helped the CUDA code that was using that information to try and compute uniform offsets for those fields, but it broke most of the other renderer back-ends. Instead, I removed the relevant asserts from `CUDAShaderObject::setObject()`. Note taht there are leftover changes from my edits to the flat reflection information, around how it handles "leaf" fields that consume multiple resource kinds. I believe that those changes are, on balance, "more correct" now than they were before, so I decided to leave them in. The other major fix here is to specialize the `CUDAShaderObject::setObject()` logic to handle the case of setting a shader object for a parameter that has interface type instead of a constant-buffer or parameter block. Mostly I just copy bytes from the child object into the parent object. There are a few caveats, though: * I am not writing the RTTI or witness-table information, so dynamic dispatch won't work. * I am assuming a hard-coded offset of 16 bytes for the any-value, which will work for now but is a bit too "magical" and might also break once we support conjunctions of interfaces with dynamic dispatch * I am assuming that the child value to be writen into the field will "fit" into the any-value area. We need some way to determine whether or not things fit dynamically (ideally using the reflection data), and adapt accordingly. * I had to add another method on the base CUDA shader object type to handle setting data using a device-memory pointr instead of a host-memory pointer * There's not a lot we can do about it, but in the case of assigning an ordinary `CUDAShaderObject` into an interface-type field of a `CUDAEntryPointShaderObject` we end up needing to perform a device->host memory copy, because the bytes of the value will have already been written to GPU memory, but need to be in GPU memory for the dispatch call. * The implementation I'm using here basically assumes that the child shader object must have been finalized before it gets plugged into the parent shader object. We haven't yet made a policy decision about that bit.
author: Tim Foley <tfoleyNV@users.noreply.github.com> 2021-02-16 14:03:39 -0800
committer: GitHub <noreply@github.com> 2021-02-16 14:03:39 -0800
commit: 39975b207e5db7de8feaaebfda2ae122c1850b26 (patch)
tree: fb4bfff1957df21a1c598e22851712702f391776 /tools
parent: e474c4e3aadc22a1b9f9b006104409f10936244f (diff)
2 files changed, 71 insertions, 8 deletions
diff --git a/tools/gfx/cuda/render-cuda.cpp b/tools/gfx/cuda/render-cuda.cpp
index 4f87bdfc9..906d553cb 100644
--- a/tools/gfx/cuda/render-cuda.cpp
+++ b/tools/gfx/cuda/render-cuda.cpp
@@ -250,6 +250,20 @@ public:
         slang::BindingType bindingType;
         Index count;
         Index baseIndex; // Flat index for sub-ojects
+
+        // TODO: The `uniformOffset` field should be removed,
+        // since it cannot be supported by the Slang reflection
+        // API once we fix some design issues.
+        //
+        // It is only being used today for pre-allocation of sub-objects
+        // for constant buffers and parameter blocks (which should be
+        // deprecated/removed anyway).
+        //
+        // Note: We would need to bring this field back, plus
+        // a lot of other complexity, if we ever want to support
+        // setting of resources/buffers directly by a binding
+        // range index and array index.
+        //
         Index uniformOffset; // Uniform offset for a resource typed field.
     };
 
@@ -310,6 +324,14 @@ public:
             SlangInt rangeIndexInDescriptorSet =
                 m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(r);
 
+            // TODO: This logic assumes that for any binding range that might consume
+            // multiple kinds of resources, the descriptor range for its uniform
+            // usage will be the first one in the range.
+            //
+            // We need to decide whether that assumption is one we intend to support
+            // applications making, or whether they should be forced to perform a
+            // linear search over the descriptor ranges for a specific binding range.
+            //
             auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset(
                 descriptorSetIndex, rangeIndexInDescriptorSet);
 
@@ -464,6 +486,17 @@ public:
         return SLANG_OK;
     }
     virtual SLANG_NO_THROW Result SLANG_MCALL
+        setDeviceData(size_t offset, void* data, size_t size)
+    {
+        size = Math::Min(size, bufferResource->getDesc()->sizeInBytes - offset);
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+            (uint8_t*)bufferResource->m_cudaMemory + offset,
+            data,
+            size,
+            cudaMemcpyHostToDevice));
+        return SLANG_OK;
+    }
+    virtual SLANG_NO_THROW Result SLANG_MCALL
         getObject(ShaderOffset const& offset, IShaderObject** object)
     {
         auto subObjectIndex =
@@ -480,17 +513,34 @@ public:
     virtual SLANG_NO_THROW Result SLANG_MCALL
         setObject(ShaderOffset const& offset, IShaderObject* object)
     {
-        auto subObjectIndex =
-            getLayout()->m_bindingRanges[offset.bindingRangeIndex].baseIndex + offset.bindingArrayIndex;
-        SLANG_ASSERT(
-            offset.uniformOffset ==
-            getLayout()->m_bindingRanges[offset.bindingRangeIndex].uniformOffset +
-                offset.bindingArrayIndex * sizeof(void*));
+        auto layout = getLayout();
+        SLANG_ASSERT(offset.bindingRangeIndex >= 0);
+        SLANG_ASSERT(offset.bindingRangeIndex < layout->m_bindingRanges.getCount());
+        auto& bindingRange = layout->m_bindingRanges[offset.bindingRangeIndex];
+
+        auto subObjectIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
         auto cudaObject = dynamic_cast<CUDAShaderObject*>(object);
         if (subObjectIndex >= objects.getCount())
             objects.setCount(subObjectIndex + 1);
         objects[subObjectIndex] = cudaObject;
-        return setData(offset, &cudaObject->bufferResource->m_cudaMemory, sizeof(void*));
+
+        switch( bindingRange.bindingType )
+        {
+        default:
+            SLANG_RETURN_ON_FAIL(setData(offset, &cudaObject->bufferResource->m_cudaMemory, sizeof(void*)));
+            break;
+
+        case slang::BindingType::ExistentialValue:
+            // TODO: handle the "does it fit" logic
+            {
+                auto valueSize = cudaObject->m_layout->getElementTypeLayout()->getSize();
+                auto valueOffset = 16;
+                SLANG_RETURN_ON_FAIL(setDeviceData(offset.uniformOffset + valueOffset, cudaObject->getBuffer(), valueSize));
+            }
+            break;
+        }
+
+        return SLANG_OK;
     }
     virtual SLANG_NO_THROW Result SLANG_MCALL
         setResource(ShaderOffset const& offset, IResourceView* resourceView)
@@ -615,6 +665,19 @@ public:
         return SLANG_OK;
     }
 
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        setDeviceData(size_t offset, void* data, size_t size)
+    {
+        size = Math::Min(size, uniformBufferSize - offset);
+        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(
+            (uint8_t*)hostBuffer + offset,
+            data,
+            size,
+            cudaMemcpyDeviceToHost));
+        return SLANG_OK;
+    }
+
+
     virtual SLANG_NO_THROW void* SLANG_MCALL getBuffer() override
     {
         return hostBuffer;
diff --git a/tools/gfx/render-graphics-common.cpp b/tools/gfx/render-graphics-common.cpp
index 5f083538d..9e95df95c 100644
--- a/tools/gfx/render-graphics-common.cpp
+++ b/tools/gfx/render-graphics-common.cpp
@@ -153,6 +153,7 @@ public:
                     switch (slangBindingType)
                     {
                     case slang::BindingType::ExistentialValue:
+                    case slang::BindingType::InlineUniformData:
                         continue;
                     default:
                         break;
@@ -172,7 +173,6 @@ public:
                         auto category = typeLayout->getDescriptorSetDescriptorRangeCategory(s, r);
                         descriptorRangeDesc.binding += varLayout->getOffset(category);
                     }
-
                     descriptorSetInfo->slotRangeDescs.add(descriptorRangeDesc);
                 }
             }
author	Tim Foley <tfoleyNV@users.noreply.github.com>	2021-02-16 14:03:39 -0800
committer	GitHub <noreply@github.com>	2021-02-16 14:03:39 -0800
commit	39975b207e5db7de8feaaebfda2ae122c1850b26 (patch)
tree	fb4bfff1957df21a1c598e22851712702f391776 /tools
parent	e474c4e3aadc22a1b9f9b006104409f10936244f (diff)