13 files changed, 520 insertions, 110 deletions
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index 12807e9e2..39a25aafa 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -509,7 +509,7 @@ void CUDASourceEmitter::emitModuleImpl(IRModule* module)
         // Output all the thread locals 
         for (auto action : actions)
         {
-            if (action.level == EmitAction::Level::Definition && _isVariable(action.inst->op))
+            if (action.level == EmitAction::Level::Definition && action.inst->op == kIROp_GlobalVar)
             {
                 emitGlobalInst(action.inst);
             }
diff --git a/source/slang/slang-ir-entry-point-uniforms.cpp b/source/slang/slang-ir-entry-point-uniforms.cpp
index ad535b747..388a7004d 100644
--- a/source/slang/slang-ir-entry-point-uniforms.cpp
+++ b/source/slang/slang-ir-entry-point-uniforms.cpp
@@ -452,6 +452,8 @@ void moveEntryPointUniformParamsToGlobalScope(
         case CodeGenTarget::Executable:
         case CodeGenTarget::SharedLibrary:
         case CodeGenTarget::HostCallable:
+        case CodeGenTarget::CUDASource:
+        case CodeGenTarget::PTX:
         {
             context.targetNeedsConstantBuffer = false;
             break;
diff --git a/source/slang/slang-type-layout.cpp b/source/slang/slang-type-layout.cpp
index 772686163..644f54a95 100644
--- a/source/slang/slang-type-layout.cpp
+++ b/source/slang/slang-type-layout.cpp
@@ -730,11 +730,55 @@ struct CPUObjectLayoutRulesImpl : ObjectLayoutRulesImpl
 };
 
 
-// TODO(JS): Most likely wrong! Use CPU layout for CUDA for now
+// TODO(JS): Most likely wrong! Assumes largely CPU layout which is probably not right
 struct CUDAObjectLayoutRulesImpl : CPUObjectLayoutRulesImpl
 {
     typedef CPUObjectLayoutRulesImpl Super;
 
+    virtual SimpleLayoutInfo GetObjectLayout(ShaderParameterKind kind) override
+    {
+        switch (kind)
+        {
+            case ShaderParameterKind::ConstantBuffer:
+                // It's a pointer to the actual uniform data
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*));
+
+            case ShaderParameterKind::MutableTexture:
+            case ShaderParameterKind::TextureUniformBuffer:
+            case ShaderParameterKind::Texture:
+                // It's a pointer to a texture interface 
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*));
+
+            case ShaderParameterKind::StructuredBuffer:
+            case ShaderParameterKind::MutableStructuredBuffer:
+                // TODO(JS): We are just storing as a pointer for now
+                // It's a ptr and a size of the amount of elements
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*));
+
+            case ShaderParameterKind::RawBuffer:
+            case ShaderParameterKind::Buffer:
+            case ShaderParameterKind::MutableRawBuffer:
+            case ShaderParameterKind::MutableBuffer:
+
+                // TODO(JS): We are storing as a pointer for now
+
+                // It's a pointer and a size in bytes
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*));
+
+            case ShaderParameterKind::SamplerState:
+                // It's a pointer
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(void*), sizeof(void*));
+
+            case ShaderParameterKind::TextureSampler:
+            case ShaderParameterKind::MutableTextureSampler:
+            case ShaderParameterKind::InputRenderTarget:
+                // TODO: how to handle these?
+            default:
+                SLANG_UNEXPECTED("unhandled shader parameter kind");
+                UNREACHABLE_RETURN(SimpleLayoutInfo());
+        }
+    }
+
 };
 
 static CPUObjectLayoutRulesImpl kCPUObjectLayoutRulesImpl;
@@ -747,10 +791,10 @@ LayoutRulesImpl kCPULayoutRulesImpl_ = {
 // CUDA
 
 static CUDAObjectLayoutRulesImpl kCUDAObjectLayoutRulesImpl;
-static CUDALayoutRulesImpl kCUALayoutRulesImpl;
+static CUDALayoutRulesImpl kCUDALayoutRulesImpl;
 
 LayoutRulesImpl kCUDALayoutRulesImpl_ = {
-    &kCPULayoutRulesFamilyImpl, &kCUALayoutRulesImpl, &kCUDAObjectLayoutRulesImpl,
+    &kCUDALayoutRulesFamilyImpl, &kCUDALayoutRulesImpl, &kCUDAObjectLayoutRulesImpl,
 };
 
 
@@ -1033,12 +1077,12 @@ LayoutRulesImpl* CPULayoutRulesFamilyImpl::getStructuredBufferRules()
 
 LayoutRulesImpl* CUDALayoutRulesFamilyImpl::getConstantBufferRules()
 {
-    return &kCPULayoutRulesImpl_;
+    return &kCUDALayoutRulesImpl_;
 }
 
 LayoutRulesImpl* CUDALayoutRulesFamilyImpl::getPushConstantBufferRules()
 {
-    return &kCPULayoutRulesImpl_;
+    return &kCUDALayoutRulesImpl_;
 }
 
 LayoutRulesImpl* CUDALayoutRulesFamilyImpl::getTextureBufferRules()
diff --git a/tests/cuda/compile-to-cuda.slang b/tests/cuda/compile-to-cuda.slang
index 6166aaf0b..be7d775bd 100644
--- a/tests/cuda/compile-to-cuda.slang
+++ b/tests/cuda/compile-to-cuda.slang
@@ -1,29 +1,19 @@
 //DISABLE_TEST(smoke):SIMPLE: -target ptx -entry computeMain -stage compute 
+//DISABLE_TEST(compute):COMPARE_COMPUTE:-cpu -compute 
+//TEST(compute):COMPARE_COMPUTE:-cuda -compute 
 
 //TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name=outputBuffer
 RWStructuredBuffer<int> outputBuffer : register(u0);
 
-int quantize(double value)
-{
-    return int(value * 256);
-}
-
-int quantize(float value)
-{
-    return int(value * 256);
-}
-
 [numthreads(4, 1, 1)]
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
-    float values[] = { -9, 9, -3, 3 };
 
 	int tid = int(dispatchThreadID.x);
-    float value = values[tid];
-    
-    outputBuffer[tid * 4] = quantize(sin(value));
-    outputBuffer[tid * 4 + 1] = quantize(cos(value));
     
-    outputBuffer[tid * 4 + 2] = quantize(sin(double(value)));
-    outputBuffer[tid * 4 + 3] = quantize(cos(double(value)));
+    outputBuffer[tid * 4] = tid;
+    outputBuffer[tid * 4 + 1] = tid + 1;
+    outputBuffer[tid * 4 + 2] = tid + 2; 
+    outputBuffer[tid * 4 + 3] = tid + 3;
+ 
 }
diff --git a/tools/render-test/bind-location.cpp b/tools/render-test/bind-location.cpp
index 6548e861c..30b9de0f8 100644
--- a/tools/render-test/bind-location.cpp
+++ b/tools/render-test/bind-location.cpp
@@ -551,6 +551,14 @@ void BindSet::getBindings(List<BindLocation>& outLocations, List<Value*>& outRes
     }
 }
 
+void BindSet::releaseValueTargets()
+{
+    for (Value* value : m_values)
+    {
+        value->m_target.setNull();
+    }
+}
+
 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! BindLocation !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
 BindLocation::BindLocation(slang::TypeLayoutReflection* typeLayout, const BindPoints& points, BindSet_Value* value) :
diff --git a/tools/render-test/bind-location.h b/tools/render-test/bind-location.h
index 0ce99731d..e4119a103 100644
--- a/tools/render-test/bind-location.h
+++ b/tools/render-test/bind-location.h
@@ -336,6 +336,9 @@ public:
         /// Get all of the bindings
     void getBindings(Slang::List<BindLocation>& outLocations, Slang::List<Value*>& outValues) const;
 
+        /// 
+    void releaseValueTargets();
+
         /// Ctor
     BindSet();
 
diff --git a/tools/render-test/cpu-compute-util.cpp b/tools/render-test/cpu-compute-util.cpp
index e94a6d6e1..2bb0baf88 100644
--- a/tools/render-test/cpu-compute-util.cpp
+++ b/tools/render-test/cpu-compute-util.cpp
@@ -15,42 +15,6 @@
 namespace renderer_test {
 using namespace Slang;
 
-/* static */SlangResult CPUComputeUtil::writeBindings(const ShaderInputLayout& layout, const List<BindSet::Value*>& buffers, const String& fileName)
-{
-    FILE * f = fopen(fileName.getBuffer(), "wb");
-    if (!f)
-    {
-        return SLANG_FAIL;
-    }
-
-    const auto& entries = layout.entries;
-
-    for (int i = 0; i < entries.getCount(); ++i)
-    {
-        const auto& entry = entries[i];
-        if (entry.isOutput)
-        {
-            BindSet::Value* buffer = buffers[i];
-
-            unsigned int* ptr = (unsigned int*)buffer->m_data;
-
-            const int size = int(entry.bufferData.getCount());
-            // Must be the same size or less than allocated buffer
-            SLANG_ASSERT(size * sizeof(unsigned int) <= buffer->m_sizeInBytes);
-
-            for (int i = 0; i < size; ++i)
-            {
-                unsigned int v = ptr[i];
-
-                fprintf(f, "%X\n", v);
-            }
-        }
-    }
-    fclose(f);
-    return SLANG_OK;
-}
-
-
 template <int COUNT>
 struct OneTexture2D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture2D
 {
@@ -109,21 +73,8 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount)
     // Okay lets iterate adding buffers
     auto outStream = StdWriters::getOut();
     SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(compilationAndLayout.layout.entries, compilationAndLayout.sourcePath, outStream, outContext.m_bindRoot));
-
-    {
-        const auto& entries = compilationAndLayout.layout.entries;
-        outContext.m_buffers.setCount(entries.getCount());
-
-        const auto& values = outContext.m_bindSet.getValues();
-        for (BindSet::Value* value : values)
-        {
-            if (value->m_userIndex >= 0)
-            {
-                outContext.m_buffers[value->m_userIndex] = value;
-            }
-        }
-    }
-
+    ShaderInputLayout::getValueBuffers(compilationAndLayout.layout.entries, outContext.m_bindSet, outContext.m_buffers);
+    
     // Okay we need to find all of the bindings and match up to those in the layout
     const ShaderInputLayout& layout = compilationAndLayout.layout;
 
diff --git a/tools/render-test/cpu-compute-util.h b/tools/render-test/cpu-compute-util.h
index 21c40ba43..e6e896b6a 100644
--- a/tools/render-test/cpu-compute-util.h
+++ b/tools/render-test/cpu-compute-util.h
@@ -58,8 +58,6 @@ struct CPUComputeUtil
     static SlangResult calcExecuteInfo(ExecuteStyle style, ISlangSharedLibrary* sharedLib, const uint32_t dispatchSize[3], const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& context, ExecuteInfo& out);
 
     static SlangResult execute(const ExecuteInfo& info);
-
-    static SlangResult writeBindings(const ShaderInputLayout& layout, const List<BindSet::Value*>& buffers, const Slang::String& fileName);
 };
 
 
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index 138f842b4..74810e675 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -6,13 +6,60 @@
 #include "../../source/core/slang-std-writers.h"
 #include "../../source/core/slang-token-reader.h"
 
+#include "../bind-location.h"
+
 #include <cuda.h>
 #include <cuda_runtime_api.h>
 
 namespace renderer_test {
 using namespace Slang;
 
-#define SLANG_CUDA_RETURN_ON_FAIL(x) { int _res = (int)(x); if (_res != 0) return SLANG_FAIL; }
+SLANG_FORCE_INLINE static bool _isError(CUresult result) { return result != 0; }
+SLANG_FORCE_INLINE static bool _isError(cudaError_t result) { return result != 0; }
+
+#if 0
+#define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) return SLANG_FAIL; }
+#else
+
+#define SLANG_CUDA_RETURN_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); return SLANG_FAIL; } }
+
+#endif
+
+#define SLANG_CUDA_ASSERT_ON_FAIL(x) { auto _res = x; if (_isError(_res)) { SLANG_ASSERT(!"Failed CUDA call"); }; }
+
+class CUDAResource : public RefObject
+{
+public:
+    typedef RefObject Super;
+
+        /// Dtor
+    CUDAResource(): m_cudaMemory(nullptr) {}
+    CUDAResource(void* cudaMemory): m_cudaMemory(cudaMemory) {}
+
+    ~CUDAResource()
+    {
+        if (m_cudaMemory)
+        {
+            SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
+        }
+    }
+
+        /// Helper function to get the cuda memory pointer when given a value
+    static void* getCUDAData(BindSet::Value* value)
+    {
+        if (value)
+        {
+            auto resource = dynamic_cast<CUDAResource*>(value->m_target.Ptr());
+            return resource ? resource->m_cudaMemory : nullptr;
+        }
+        return nullptr;
+    }
+
+    void* m_cudaMemory;
+};
+
+
+
 
 static int _calcSMCountPerMultiProcessor(int major, int minor)
 {
@@ -124,39 +171,70 @@ static SlangResult _initCuda()
     return SLANG_OK;
 }
 
-
-
-/* static */SlangResult _createDevice(CUcontext* outContext)
+class ScopeCUDAContext
 {
-    SLANG_RETURN_ON_FAIL(_initCuda());
+public:
+    ScopeCUDAContext() : m_context(nullptr) {}
+
+    SlangResult init(unsigned int flags, CUdevice device)
+    {
+        SLANG_RETURN_ON_FAIL(_initCuda());
 
-    int deviceId;
-    SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceId(&deviceId));
-    SLANG_CUDA_RETURN_ON_FAIL(cudaSetDevice(deviceId));
+        if (m_context)
+        {
+            cuCtxDestroy(m_context);
+            m_context = nullptr;
+        }
+        if (_isError(cuCtxCreate(&m_context, flags, device)))
+        {
+            return SLANG_FAIL;
+        }
+        return SLANG_OK;
+    }
 
-    CUcontext context;
+    SlangResult init(unsigned int flags)
+    {
+        SLANG_RETURN_ON_FAIL(_initCuda());
 
-    // Create context
-    SLANG_CUDA_RETURN_ON_FAIL(cuCtxCreate(&context, 0, deviceId));
+        int deviceId;
+        SLANG_RETURN_ON_FAIL(_findMaxFlopsDeviceId(&deviceId));
+        SLANG_CUDA_RETURN_ON_FAIL(cudaSetDevice(deviceId));
 
-    *outContext = context;
-    return SLANG_OK;
-}
+        if (m_context)
+        {
+            cuCtxDestroy(m_context);
+            m_context = nullptr;
+        }
+        if (_isError(cuCtxCreate(&m_context, flags, deviceId)))
+        {
+            return SLANG_FAIL;
+        }
+        return SLANG_OK;
+    }
 
-/* static */bool CUDAComputeUtil::canCreateDevice()
-{
-    CUcontext context;
-    if (SLANG_SUCCEEDED(_createDevice(&context)))
+    ~ScopeCUDAContext()
     {
-        cuCtxDestroy(context);
-        return true;
+        if (m_context)
+        {
+            cuCtxDestroy(m_context);
+        }
     }
+    SLANG_FORCE_INLINE operator CUcontext () const { return m_context; }
 
-    return false;
+    CUcontext m_context;
+};
+
+/* static */bool CUDAComputeUtil::canCreateDevice()
+{
+    ScopeCUDAContext context;
+    return SLANG_SUCCEEDED(context.init(0));
 }
 
-static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout)
+static SlangResult _compute(CUcontext context, CUmodule module, const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, CUDAComputeUtil::Context& outContext)
 {
+    auto& bindSet = outContext.m_bindSet;
+    auto& bindRoot = outContext.m_bindRoot;
+
     auto request = outputAndLayout.output.request;
     auto reflection = (slang::ShaderReflection*) spGetReflection(request);
 
@@ -170,17 +248,278 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
 
     // Get the entry point
     CUfunction kernel;
-
     SLANG_CUDA_RETURN_ON_FAIL(cuModuleGetFunction(&kernel, module, entryPointName));
 
+    // A stream of 0 means no stream
+    cudaStream_t stream = 0;
+    //SLANG_CUDA_RETURN_ON_FAIL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+    {
+        // Okay now we need to set up binding
+        bindRoot.init(&bindSet, reflection, 0);
+
+        // Will set up any root buffers
+        bindRoot.addDefaultValues();
+
+        // Now set up the Values from the test
+
+        auto outStream = StdWriters::getOut();
+        SLANG_RETURN_ON_FAIL(ShaderInputLayout::addBindSetValues(outputAndLayout.layout.entries, outputAndLayout.sourcePath, outStream, bindRoot));
+
+        ShaderInputLayout::getValueBuffers(outputAndLayout.layout.entries, bindSet, outContext.m_buffers);
+
+        // First create all of the resources for the values
+
+        {
+            const auto& values = bindSet.getValues();
+            const auto& entries = outputAndLayout.layout.entries;
+
+            for (BindSet::Value* value : values)
+            {
+                auto typeLayout = value->m_type;
+               
+                // Get the type kind, if typeLayout is not set we'll assume a 'constant buffer' will do
+                slang::TypeReflection::Kind kind = typeLayout ? typeLayout->getKind() : slang::TypeReflection::Kind::ConstantBuffer;
+               
+                // TODO(JS):
+                // Here we should be using information about what textures hold to create appropriate
+                // textures. For now we only support 2d textures that always return 1.
+                
+                switch (kind)
+                {
+                    case slang::TypeReflection::Kind::ConstantBuffer:
+                    case slang::TypeReflection::Kind::ParameterBlock:
+                    {
+                        // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
+
+                        void* cudaMem = nullptr;
+                        SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes));
+                        value->m_target = new CUDAResource(cudaMem);
+                        break;
+                    }
+                    case slang::TypeReflection::Kind::Resource:
+                    {
+                        auto type = typeLayout->getType();
+                        auto shape = type->getResourceShape();
+
+                        //auto access = type->getResourceAccess();
+
+                        switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK)
+                        {
+                            case SLANG_TEXTURE_2D:
+                            {
+                                SLANG_ASSERT(value->m_userIndex >= 0);
+                                auto& srcEntry = entries[value->m_userIndex];
+
+                                // TODO(JS):
+                                // We should use the srcEntry to determine what data to store in the texture,
+                                // it's dimensions etc. For now we just support it being 1.
+
+                                slang::TypeReflection* typeReflection = typeLayout->getResourceResultType();
+
+                                int count = 1;
+                                if (typeReflection->getKind() == slang::TypeReflection::Kind::Vector)
+                                {
+                                    count = int(typeReflection->getElementCount());
+                                }
+
+                                // TODO(JS): Should use the input setup to work how to create this texture
+                                // Store the target specific value
+                                //value->m_target = _newOneTexture2D(count);
+                                break;
+                            }
+                            case SLANG_TEXTURE_1D:
+                            case SLANG_TEXTURE_3D:
+                            case SLANG_TEXTURE_CUBE:
+                            case SLANG_TEXTURE_BUFFER:
+                            {
+                                // Need a CPU impl for these...
+                                // For now we can just leave as target will just be nullptr
+                                break;
+                            }
+
+                            case SLANG_BYTE_ADDRESS_BUFFER:
+                            case SLANG_STRUCTURED_BUFFER:
+                            {
+                                // On CPU we just use the memory in the BindSet buffer, so don't need to create anything
+
+                                void* cudaMem = nullptr;
+                                SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes));
+                                value->m_target = new CUDAResource(cudaMem);
+
+                                break;
+                            }
+                        }
+                    }
+                    default: break;
+                }
+            }
+        }
+    
+        // Now we need to go through all of the bindings and set the appropriate data
+
+        {
+            List<BindLocation> locations;
+            List<BindSet::Value*> values;
+            bindSet.getBindings(locations, values);
+
+            for (Index i = 0; i < locations.getCount(); ++i)
+            {
+                const auto& location = locations[i];
+                BindSet::Value* value = values[i];
+
+                // Okay now we need to set up the actual handles that CPU will follow.
+                auto typeLayout = location.getTypeLayout();
+
+                const auto kind = typeLayout->getKind();
+                switch (kind)
+                {
+                    case slang::TypeReflection::Kind::Array:
+                    {
+                        auto elementCount = int(typeLayout->getElementCount());
+                        if (elementCount == 0)
+                        {
+                            void** array = location.getUniform<void*>();
+                            // If set, we setup the data needed for array on CPU side
+                            if (value && array)
+                            {
+                                // TODO(JS): For now we'll just assume a pointer...
+                                *array = CUDAResource::getCUDAData(value);
+                            }
+                        }
+                        break;
+                    }
+                    case slang::TypeReflection::Kind::ConstantBuffer:
+                    case slang::TypeReflection::Kind::ParameterBlock:
+                    {
+                        // These map down to just pointers
+                        *location.getUniform<void*>() = CUDAResource::getCUDAData(value);
+                        break;
+                    }
+                    case slang::TypeReflection::Kind::Resource:
+                    {
+                        auto type = typeLayout->getType();
+                        auto shape = type->getResourceShape();
+
+                        //auto access = type->getResourceAccess();
+
+                        switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK)
+                        {
+                            case SLANG_BYTE_ADDRESS_BUFFER:
+                            case SLANG_STRUCTURED_BUFFER:
+                            {
+                                // TODO(JS): These will need bounds ... 
+                                // For the moment these are just pointers
+                                *location.getUniform<void*>() = CUDAResource::getCUDAData(value);
+                                break;
+                            }
+                        }
+                        break;
+                    }
+                    default: break;
+                }
+            }
+        }
+
+        // Okay now the memory is all set up, we can copy everything over
+        {
+            const auto& values = bindSet.getValues();
+            for (BindSet::Value* value : values)
+            {
+                void* cudaMem = CUDAResource::getCUDAData(value);
+                if (value && value->m_data && cudaMem)
+                {
+                    // Okay copy the data over...
+                    SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(cudaMem, value->m_data, value->m_sizeInBytes, cudaMemcpyHostToDevice));
+                }
+            }
+        }
+
+        // Now we can execute the kernel
+
+        {
+            // Get the max threads per block for this function
+
+            int maxTheadsPerBlock;
+            SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&maxTheadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel));
+
+            int sharedSizeInBytes;
+            SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel));
+
+            // Work out the args
+            void* uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue());
+            void* entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue());
+
+            // NOTE! These are pointers to the cuda memory pointers
+            void* args[] = { &entryPointCUDAData , &uniformCUDAData };
+
+            SlangUInt numThreadsPerAxis[3];
+            entryPoint->getComputeThreadGroupSize(3, numThreadsPerAxis);
+
+            // Launch
+            // TODO(JS): We probably want to do something a little more clever here using the maxThreadsPerBlock,
+            // but for now just launch a single block, and hope it all fits.
+
+            auto cudaLaunchResult = cuLaunchKernel(kernel,
+                1, 1, 1,                                                                                // Blocks
+                int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]),        // Threads per block
+                0,                                                                                      // Shared memory size
+                stream,                                                                                 // Stream. 0 is no stream.
+                args,                                                                                   // Args
+                nullptr);                                                                               // extra
+
+            SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
+
+            if (stream)
+            {
+                SLANG_CUDA_RETURN_ON_FAIL(cudaStreamSynchronize(stream));
+            }
+            else
+            {
+                // Do a sync here. Makes sure any issues are detected early and not on some implicit sync
+                SLANG_CUDA_RETURN_ON_FAIL(cudaDeviceSynchronize()); 
+            }
+        }
+
+        // Finally we need to copy the data back
+
+        {
+            const auto& entries = outputAndLayout.layout.entries;
+
+            for (Index i = 0; i < entries.getCount(); ++i)
+            {
+                const auto& entry = entries[i];
+                BindSet::Value* value = outContext.m_buffers[i];
+
+                if (entry.isOutput)
+                {
+                    // Copy back to CPU memory
+                    void* cudaMem = CUDAResource::getCUDAData(value);
+                    if (value && value->m_data && cudaMem)
+                    {
+                        // Okay copy the data back...
+                        SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(value->m_data, cudaMem, value->m_sizeInBytes, cudaMemcpyDeviceToHost));
+                    }
+                }
+            }
+        }
+
+        if (stream)
+        {
+            SLANG_CUDA_RETURN_ON_FAIL(cudaStreamDestroy(stream));
+        }
+    }
+
+    // Release all othe CUDA resource/allocations
+    bindSet.releaseValueTargets();
 
     return SLANG_OK;
 }
 
-/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout)
+/* static */SlangResult CUDAComputeUtil::execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, Context& outContext)
 {
-    CUcontext context;
-    SLANG_RETURN_ON_FAIL(_createDevice(&context));
+    ScopeCUDAContext cudaContext;
+    SLANG_RETURN_ON_FAIL(cudaContext.init(0));
 
     const Index index = outputAndLayout.output.findKernelDescIndex(StageType::Compute);
     if (index < 0)
@@ -193,12 +532,10 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
     CUmodule module = 0;
     SLANG_CUDA_RETURN_ON_FAIL(cuModuleLoadData(&module, kernel.codeBegin));
 
-    SLANG_RETURN_ON_FAIL(_compute(context, module, outputAndLayout));
+    SLANG_RETURN_ON_FAIL(_compute(cudaContext, module, outputAndLayout, outContext));
 
     SLANG_CUDA_RETURN_ON_FAIL(cuModuleUnload(module));
 
-    cuCtxDestroy(context);
-
     return SLANG_OK;
 }
 
diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h
index 9c7d83b1f..58ca21716 100644
--- a/tools/render-test/cuda/cuda-compute-util.h
+++ b/tools/render-test/cuda/cuda-compute-util.h
@@ -10,7 +10,16 @@ namespace renderer_test {
 
 struct CUDAComputeUtil
 {
-    static SlangResult execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout);
+    struct Context
+    {
+            /// Holds the binding information
+        BindSet m_bindSet;
+        CPULikeBindRoot m_bindRoot;
+            /// Buffers are held in same order as entries in layout (useful for dumping out bindings)
+        List<BindSet::Value*> m_buffers;
+    };
+
+    static SlangResult execute(const ShaderCompilerUtil::OutputAndLayout& outputAndLayout, Context& outContext);
 
     static bool canCreateDevice();
 };
diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp
index d91592ccf..050a6d2c8 100644
--- a/tools/render-test/render-test-main.cpp
+++ b/tools/render-test/render-test-main.cpp
@@ -583,7 +583,7 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe
             if (gOptions.outputPath)
             {
                 // Dump everything out that was written
-                SLANG_RETURN_ON_FAIL(CPUComputeUtil::writeBindings(compilationAndLayout.layout, context.m_buffers, gOptions.outputPath));
+                SLANG_RETURN_ON_FAIL(ShaderInputLayout::writeBindings(compilationAndLayout.layout, context.m_buffers, gOptions.outputPath));
 
                 // Check all execution styles produce the same result
                 SLANG_RETURN_ON_FAIL(CPUComputeUtil::checkStyleConsistency(sharedLibrary, gOptions.computeDispatchSize, compilationAndLayout));
@@ -600,10 +600,14 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe
 
 #if RENDER_TEST_CUDA
 
-        // TODO(JS):
-        // We don't know how to execute it yet..
+        CUDAComputeUtil::Context context;
+        SLANG_RETURN_ON_FAIL(CUDAComputeUtil::execute(compilationAndLayout, context));
 
-        SLANG_RETURN_ON_FAIL(CUDAComputeUtil::execute(compilationAndLayout));
+        if (gOptions.outputPath)
+        {
+            // Dump everything out that was written
+            SLANG_RETURN_ON_FAIL(ShaderInputLayout::writeBindings(compilationAndLayout.layout, context.m_buffers, gOptions.outputPath));
+        }
 
         return SLANG_OK;
 #else
diff --git a/tools/render-test/shader-input-layout.cpp b/tools/render-test/shader-input-layout.cpp
index 5ae35b90d..ee4f5fc2c 100644
--- a/tools/render-test/shader-input-layout.cpp
+++ b/tools/render-test/shader-input-layout.cpp
@@ -1,3 +1,6 @@
+// Stop warnings from Visual Studio
+#define _CRT_SECURE_NO_WARNINGS 1
+
 #include "shader-input-layout.h"
 #include "core/slang-token-reader.h"
 
@@ -676,6 +679,61 @@ namespace renderer_test
         return SLANG_OK;
     }
 
+    /* static */void ShaderInputLayout::getValueBuffers(const Slang::List<ShaderInputLayoutEntry>& entries, const BindSet& bindSet, List<BindSet::Value*>& outBuffers)
+    {
+        outBuffers.setCount(entries.getCount());
+
+        for (Index i = 0; i< outBuffers.getCount(); ++i)
+        {
+            outBuffers[i] = nullptr;
+        }
+
+        const auto& values = bindSet.getValues();
+        for (BindSet::Value* value : values)
+        {
+            if (value->m_userIndex >= 0)
+            {
+                outBuffers[value->m_userIndex] = value;
+            }
+        }
+    }
+
+
+    /* static */SlangResult ShaderInputLayout::writeBindings(const ShaderInputLayout& layout, const List<BindSet::Value*>& buffers, const String& fileName)
+    {
+        FILE * f = fopen(fileName.getBuffer(), "wb");
+        if (!f)
+        {
+            return SLANG_FAIL;
+        }
+
+        const auto& entries = layout.entries;
+
+        for (int i = 0; i < entries.getCount(); ++i)
+        {
+            const auto& entry = entries[i];
+            if (entry.isOutput)
+            {
+                BindSet::Value* buffer = buffers[i];
+
+                unsigned int* ptr = (unsigned int*)buffer->m_data;
+
+                const int size = int(entry.bufferData.getCount());
+                // Must be the same size or less than allocated buffer
+                SLANG_ASSERT(size * sizeof(unsigned int) <= buffer->m_sizeInBytes);
+
+                for (int i = 0; i < size; ++i)
+                {
+                    unsigned int v = ptr[i];
+
+                    fprintf(f, "%X\n", v);
+                }
+            }
+        }
+        fclose(f);
+        return SLANG_OK;
+    }
+
     void generateTextureData(TextureData& output, const InputTextureDesc& desc)
     {
         switch (desc.format)
diff --git a/tools/render-test/shader-input-layout.h b/tools/render-test/shader-input-layout.h
index 3e33f876e..3399df848 100644
--- a/tools/render-test/shader-input-layout.h
+++ b/tools/render-test/shader-input-layout.h
@@ -102,6 +102,12 @@ public:
         /// For buffers, the Resources will be setup with the contents of the entry.
         /// That if a resource is created that maps to an entry, the m_userData member of Resource will be set to it's index
     static SlangResult addBindSetValues(const Slang::List<ShaderInputLayoutEntry>& entries, const Slang::String& sourcePath, Slang::WriterHelper outError, BindRoot& bindRoot);
+
+        /// Put into outBuffer the value buffers that were set via addbindSetValues (which will set m_userIndex to be the entries index)
+    static void getValueBuffers(const Slang::List<ShaderInputLayoutEntry>& entries, const BindSet& bindSet, Slang::List<BindSet::Value*>& outBuffers);
+
+        /// Write bindings from values in memory from buffers
+    static SlangResult writeBindings(const ShaderInputLayout& layout, const Slang::List<BindSet::Value*>& buffers, const Slang::String& fileName);
 };
 
 void generateTextureDataRGB8(TextureData& output, const InputTextureDesc& desc);