Add a CPU renderer implementation (#1750)

* Add a CPU renderer implementation This change adds a CPU back-end to `gfx` and ensures that most of our existing CPU tests pass when using it. Detailed notes: * Most of the CPU renderer implementation is copy-pasted from the CUDA case, so they share a lot of similar logic * The main addition to the CPU renderer is a semi-complete implementation of host-memory textures. The logic here handles all the main shapes (Buffer, 1D, 2D, 3D, Cube) and all the currently-supported `Format`s that are sample-able as-is (no D24S8). The implementation is not intended to be fast, and it currently only does nearest-neighbor sampling, but otherwise it tries to avoid cutting too many corners and should be a reasonable starting point for a more complete (but not performance-oriented) implementation. * Refactored the CPU prelude `IRWTexture` interface to inherit from `ITexture`, since in most cases a single type will end up implementing both. It might be worth it to collapse it all down to a single interface later. * Changed the CPU prelude `ITexture`/`IRWTexture` interface so that it takes both a pointer *and* a size for output arguments. This change seems necessary to allow a shader variable declared as a `Texture2D<float>` to fetch a single `float` when the underlying texture might be using RGBA32F. * Added to the `IComponentType` public API so that we can query a "host callable" for an entry point and not just a binary. * Turned off the `-shaderobj` flag on two tests that weren't yet compatible with shader objects but still had the flag left in on the path (since previously the CPU path always used the non-`gfx` non-shader-object logic anyway) * Disabled one test (`dynamic-dispatch-11`) that relied on the `ConstantBuffer<IInterface>` idiom that we know we are planning to change soon anyway. * Made a few changes to the CUDA path to bring it into line with what I added for the CPU path. These were mostly bug fixes around indexing logic for sub-objects and resources. * fixup
author: Tim Foley <tfoleyNV@users.noreply.github.com> 2021-03-12 11:58:14 -0800
committer: GitHub <noreply@github.com> 2021-03-12 11:58:14 -0800
commit: d6a37a0f151e390808f196998c48a341bc4c7b60 (patch)
tree: c1c6e3af434cb3627af67ecc8706124e4b8c7fb1
parent: 9ffe2f3ef245034a2dae42017a9059dfe4d02647 (diff)
19 files changed, 2011 insertions, 104 deletions
diff --git a/build/visual-studio/gfx/gfx.vcxproj b/build/visual-studio/gfx/gfx.vcxproj
index 5f05d7586..08786b5cf 100644
--- a/build/visual-studio/gfx/gfx.vcxproj
+++ b/build/visual-studio/gfx/gfx.vcxproj
@@ -181,6 +181,7 @@
   <ItemGroup>
     <ClInclude Include="..\..\..\slang-gfx.h" />
     <ClInclude Include="..\..\..\tools\gfx\command-writer.h" />
+    <ClInclude Include="..\..\..\tools\gfx\cpu\render-cpu.h" />
     <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h" />
     <ClInclude Include="..\..\..\tools\gfx\d3d\d3d-swapchain.h" />
     <ClInclude Include="..\..\..\tools\gfx\d3d\d3d-util.h" />
@@ -206,6 +207,7 @@
     <ClInclude Include="..\..\..\tools\gfx\vulkan\vk-util.h" />
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\..\tools\gfx\cpu\render-cpu.cpp" />
     <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp" />
     <ClCompile Include="..\..\..\tools\gfx\d3d\d3d-swapchain.cpp" />
     <ClCompile Include="..\..\..\tools\gfx\d3d\d3d-util.cpp" />
diff --git a/build/visual-studio/gfx/gfx.vcxproj.filters b/build/visual-studio/gfx/gfx.vcxproj.filters
index c7836d62f..cff8cc95a 100644
--- a/build/visual-studio/gfx/gfx.vcxproj.filters
+++ b/build/visual-studio/gfx/gfx.vcxproj.filters
@@ -15,6 +15,9 @@
     <ClInclude Include="..\..\..\tools\gfx\command-writer.h">
       <Filter>Header Files</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\..\tools\gfx\cpu\render-cpu.h">
+      <Filter>Header Files</Filter>
+    </ClInclude>
     <ClInclude Include="..\..\..\tools\gfx\cuda\render-cuda.h">
       <Filter>Header Files</Filter>
     </ClInclude>
@@ -86,6 +89,9 @@
     </ClInclude>
   </ItemGroup>
   <ItemGroup>
+    <ClCompile Include="..\..\..\tools\gfx\cpu\render-cpu.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\..\..\tools\gfx\cuda\render-cuda.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index 9c8fb3dec..99e8f5097 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -446,9 +446,9 @@ struct TextureDimensions
 struct ITexture
 {
     virtual TextureDimensions GetDimensions(int mipLevel = -1) = 0;
-    virtual void Load(const int* v, void* out) = 0;
-    virtual void Sample(SamplerState samplerState, const float* loc, void* out) = 0;
-    virtual void SampleLevel(SamplerState samplerState, const float* loc, float level, void* out) = 0;
+    virtual void Load(const int32_t* v, void* outData, size_t dataSize) = 0;
+    virtual void Sample(SamplerState samplerState, const float* loc, void* outData, size_t dataSize) = 0;
+    virtual void SampleLevel(SamplerState samplerState, const float* loc, float level, void* outData, size_t dataSize) = 0;
 };
 
 template <typename T>
@@ -470,9 +470,9 @@ struct Texture1D
         *outNumberOfLevels = dims.numberOfLevels; 
     }
     
-    T Load(const int2& loc) const { T out; texture->Load(&loc.x, &out); return out; }
-    T Sample(SamplerState samplerState, float loc) const { T out; texture->Sample(samplerState, &loc, &out); return out; }
-    T SampleLevel(SamplerState samplerState, float loc, float level) { T out; texture->SampleLevel(samplerState, &loc, level, &out); return out; }
+    T Load(const int2& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
+    T Sample(SamplerState samplerState, float loc) const { T out; texture->Sample(samplerState, &loc, &out, sizeof(out)); return out; }
+    T SampleLevel(SamplerState samplerState, float loc, float level) { T out; texture->SampleLevel(samplerState, &loc, level, &out, sizeof(out)); return out; }
     
     ITexture* texture;              
 };
@@ -507,9 +507,9 @@ struct Texture2D
         *outNumberOfLevels = dims.numberOfLevels;
     }
     
-    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out); return out; }
-    T Sample(SamplerState samplerState, const float2& loc) const { T out; texture->Sample(samplerState, &loc.x, &out); return out; }
-    T SampleLevel(SamplerState samplerState, const float2& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out); return out; }
+    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
+    T Sample(SamplerState samplerState, const float2& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
+    T SampleLevel(SamplerState samplerState, const float2& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
     
     ITexture* texture;              
 };
@@ -548,9 +548,9 @@ struct Texture3D
         *outNumberOfLevels = dims.numberOfLevels;
     }
     
-    T Load(const int4& loc) const { T out; texture->Load(&loc.x, &out); return out; }
-    T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out); return out; }
-    T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out); return out; }
+    T Load(const int4& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
+    T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
+    T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
     
     ITexture* texture;              
 };
@@ -585,8 +585,8 @@ struct TextureCube
         *outNumberOfLevels = dims.numberOfLevels;
     }
     
-    T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out); return out; }
-    T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out); return out; }
+    T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
+    T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
     
     ITexture* texture;              
 };
@@ -611,9 +611,9 @@ struct Texture1DArray
         *outElements = dims.arrayElementCount; 
     }
     
-    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out); return out; }
-    T Sample(SamplerState samplerState, const float2& loc) const { T out; texture->Sample(samplerState, &loc.x, &out); return out; }
-    T SampleLevel(SamplerState samplerState, const float2& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out); return out; }
+    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
+    T Sample(SamplerState samplerState, const float2& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
+    T SampleLevel(SamplerState samplerState, const float2& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
     
     ITexture* texture;              
 };
@@ -653,9 +653,9 @@ struct Texture2DArray
         *outNumberOfLevels = dims.numberOfLevels;
     }
     
-    T Load(const int4& loc) const { T out; texture->Load(&loc.x, &out); return out; }
-    T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out); return out; }
-    T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out); return out; }
+    T Load(const int4& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
+    T Sample(SamplerState samplerState, const float3& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
+    T SampleLevel(SamplerState samplerState, const float3& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
     
     ITexture* texture;              
 };
@@ -695,20 +695,16 @@ struct TextureCubeArray
         *outNumberOfLevels = dims.numberOfLevels;
     }
     
-    T Sample(SamplerState samplerState, const float4& loc) const { T out; texture->Sample(samplerState, &loc.x, &out); return out; }
-    T SampleLevel(SamplerState samplerState, const float4& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out); return out; }
+    T Sample(SamplerState samplerState, const float4& loc) const { T out; texture->Sample(samplerState, &loc.x, &out, sizeof(out)); return out; }
+    T SampleLevel(SamplerState samplerState, const float4& loc, float level) { T out; texture->SampleLevel(samplerState, &loc.x, level, &out, sizeof(out)); return out; }
     
     ITexture* texture;              
 };
 
 /* !!!!!!!!!!!!!!!!!!!!!!!!!!! RWTexture !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */
 
-struct IRWTexture
+struct IRWTexture : ITexture
 {
-    virtual TextureDimensions GetDimensions(int mipLevel = -1) = 0;
-    
-        /// Load at specified location. 
-    virtual void Load(const int32_t* loc, void* out) = 0;
         /// Get the reference to the element at loc. 
     virtual void* refAt(const uint32_t* loc) = 0;
 };
@@ -722,7 +718,7 @@ struct RWTexture1D
     void GetDimensions(float* outWidth) { *outWidth = texture->GetDimensions().width; }
     void GetDimensions(uint32_t mipLevel, float* outWidth, float* outNumberOfLevels) { auto dims = texture->GetDimensions(mipLevel); *outWidth = dims.width; *outNumberOfLevels = dims.numberOfLevels; }
     
-    T Load(int32_t loc) const { T out; texture->Load(&loc, &out); return out; }
+    T Load(int32_t loc) const { T out; texture->Load(&loc, &out, sizeof(out)); return out; }
     T& operator[](uint32_t loc) { return *(T*)texture->refAt(&loc); }
     IRWTexture* texture;              
 };
@@ -757,7 +753,7 @@ struct RWTexture2D
         *outNumberOfLevels = dims.numberOfLevels;
     }
     
-    T Load(const int2& loc) const { T out; texture->Load(&loc.x, &out); return out; }
+    T Load(const int2& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
     T& operator[](const uint2& loc) { return *(T*)texture->refAt(&loc.x); }
     IRWTexture* texture;
 };
@@ -796,7 +792,7 @@ struct RWTexture3D
         *outNumberOfLevels = dims.numberOfLevels;
     }
     
-    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out); return out; }
+    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
     T& operator[](const uint3& loc) { return *(T*)texture->refAt(&loc.x); }
     IRWTexture* texture;
 };
@@ -832,7 +828,7 @@ struct RWTexture1DArray
         *outNumberOfLevels = dims.numberOfLevels;
     }
     
-    T Load(int2 loc) const { T out; texture->Load(&loc.x, &out); return out; }
+    T Load(int2 loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
     T& operator[](uint2 loc) { return *(T*)texture->refAt(&loc.x); }
 
     IRWTexture* texture;
@@ -872,7 +868,7 @@ struct RWTexture2DArray
         *outNumberOfLevels = dims.numberOfLevels;
     }
     
-    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out); return out; }
+    T Load(const int3& loc) const { T out; texture->Load(&loc.x, &out, sizeof(out)); return out; }
     T& operator[](const uint3& loc) { return *(T*)texture->refAt(&loc.x); }
 
     IRWTexture* texture;
diff --git a/premake5.lua b/premake5.lua
index a5a94971d..13debb57b 100644
--- a/premake5.lua
+++ b/premake5.lua
@@ -767,6 +767,7 @@ tool "gfx"
     files {"slang-gfx.h"}
 
     -- Will compile across targets
+    addSourceDir "tools/gfx/cpu"
     addSourceDir "tools/gfx/nvapi"
 
     -- To special case that we may be building using cygwin on windows. If 'true windows' we build for dx12/vk and run the script
diff --git a/slang.h b/slang.h
index f0abc135a..b354e6b12 100644
--- a/slang.h
+++ b/slang.h
@@ -4055,6 +4055,23 @@ namespace slang
             virtual SLANG_NO_THROW SlangResult SLANG_MCALL link(
                 IComponentType**            outLinkedComponentType,
                 ISlangBlob**                outDiagnostics = nullptr) = 0;
+
+                /** Get entry point 'callable' functions accessible through the ISlangSharedLibrary interface.
+
+                The functions remain in scope as long as the ISlangSharedLibrary interface is in scope.
+
+                NOTE! Requires a compilation target of SLANG_HOST_CALLABLE.
+    
+                @param entryPointIndex  The index of the entry point to get code for.
+                @param targetIndex      The index of the target to get code for (default: zero).
+                @param outSharedLibrary A pointer to an ISlangSharedLibrary interface on which functions can be queried.
+                @returns                A `SlangResult` to indicate success or failure.
+                */
+            virtual SLANG_NO_THROW SlangResult SLANG_MCALL getEntryPointHostCallable(
+                int                     entryPointIndex,
+                int                     targetIndex,
+                ISlangSharedLibrary**   outSharedLibrary,
+                slang::IBlob**          outDiagnostics = 0) = 0;
     };
     #define SLANG_UUID_IComponentType { 0x5bc42be8, 0x5c50, 0x4929, { 0x9e, 0x5e, 0xd1, 0x5e, 0x7c, 0x24, 0x1, 0x5f } };
 
diff --git a/source/slang/slang-compiler.h b/source/slang/slang-compiler.h
index 0eb6f992d..01bdd8502 100755
--- a/source/slang/slang-compiler.h
+++ b/source/slang/slang-compiler.h
@@ -294,6 +294,11 @@ namespace Slang
         SLANG_NO_THROW SlangResult SLANG_MCALL link(
             slang::IComponentType**         outLinkedComponentType,
             ISlangBlob**                    outDiagnostics) SLANG_OVERRIDE;
+        SLANG_NO_THROW SlangResult SLANG_MCALL getEntryPointHostCallable(
+            int                     entryPointIndex,
+            int                     targetIndex,
+            ISlangSharedLibrary**   outSharedLibrary,
+            slang::IBlob**          outDiagnostics) SLANG_OVERRIDE;
 
             /// Get the linkage (aka "session" in the public API) for this component type.
         Linkage* getLinkage() { return m_linkage; }
@@ -705,6 +710,15 @@ namespace Slang
                 outDiagnostics);
         }
 
+        SLANG_NO_THROW SlangResult SLANG_MCALL getEntryPointHostCallable(
+            int                     entryPointIndex,
+            int                     targetIndex,
+            ISlangSharedLibrary**   outSharedLibrary,
+            slang::IBlob**          outDiagnostics) SLANG_OVERRIDE
+        {
+            return Super::getEntryPointHostCallable(entryPointIndex, targetIndex, outSharedLibrary, outDiagnostics);
+        }
+
             /// Create an entry point that refers to the given function.
         static RefPtr<EntryPoint> create(
             Linkage*            linkage,
@@ -912,6 +926,15 @@ namespace Slang
                 outDiagnostics);
         }
 
+        SLANG_NO_THROW SlangResult SLANG_MCALL getEntryPointHostCallable(
+            int                     entryPointIndex,
+            int                     targetIndex,
+            ISlangSharedLibrary**   outSharedLibrary,
+            slang::IBlob**          outDiagnostics) SLANG_OVERRIDE
+        {
+            return Super::getEntryPointHostCallable(entryPointIndex, targetIndex, outSharedLibrary, outDiagnostics);
+        }
+
         SLANG_NO_THROW SlangResult SLANG_MCALL findEntryPointByName(
             char const*             name,
             slang::IEntryPoint**     outEntryPoint) SLANG_OVERRIDE
diff --git a/source/slang/slang.cpp b/source/slang/slang.cpp
index 2ad5d25f8..ae3a1f419 100644
--- a/source/slang/slang.cpp
+++ b/source/slang/slang.cpp
@@ -2733,6 +2733,33 @@ SLANG_NO_THROW SlangResult SLANG_MCALL ComponentType::getEntryPointCode(
     return SLANG_OK;
 }
 
+SLANG_NO_THROW SlangResult SLANG_MCALL ComponentType::getEntryPointHostCallable(
+    int                     entryPointIndex,
+    int                     targetIndex,
+    ISlangSharedLibrary**   outSharedLibrary,
+    slang::IBlob**          outDiagnostics)
+{
+    auto linkage = getLinkage();
+    if(targetIndex < 0 || targetIndex >= linkage->targets.getCount())
+        return SLANG_E_INVALID_ARG;
+    auto target = linkage->targets[targetIndex];
+
+    auto targetProgram = getTargetProgram(target);
+
+    DiagnosticSink sink(linkage->getSourceManager(), Lexer::sourceLocationLexer);
+    auto& entryPointResult = targetProgram->getOrCreateEntryPointResult(entryPointIndex, &sink);
+    sink.getBlobIfNeeded(outDiagnostics);
+
+    if(entryPointResult.format == ResultFormat::None )
+        return SLANG_FAIL;
+
+    ComPtr<ISlangSharedLibrary> sharedLibrary;
+    SLANG_RETURN_ON_FAIL(entryPointResult.getSharedLibrary(sharedLibrary));
+
+    *outSharedLibrary = sharedLibrary.detach();
+    return SLANG_OK;
+}
+
 RefPtr<ComponentType> ComponentType::specialize(
     SpecializationArg const*    inSpecializationArgs,
     SlangInt                    specializationArgCount,
@@ -4385,7 +4412,10 @@ SlangReflection* EndToEndCompileRequest::getReflection()
 
     auto targetReq = linkage->targets[targetIndex];
     auto targetProgram = program->getTargetProgram(targetReq);
-    auto programLayout = targetProgram->getExistingLayout();
+
+
+    DiagnosticSink sink(linkage->getSourceManager(), Lexer::sourceLocationLexer);
+    auto programLayout = targetProgram->getOrCreateLayout(&sink);
 
     return (SlangReflection*)programLayout;
 }
diff --git a/tests/compute/dynamic-dispatch-11.slang b/tests/compute/dynamic-dispatch-11.slang
index 964431aaf..d6f64aa99 100644
--- a/tests/compute/dynamic-dispatch-11.slang
+++ b/tests/compute/dynamic-dispatch-11.slang
@@ -1,8 +1,12 @@
 // Test using interface typed shader parameters with dynamic dispatch.
 
+// TODO: This test has been disabled because it relies on
+// `ConstantBuffer<IInterface>` which we expect to change
+// implementation approaches for soon.
+
 //DISABLE_TEST(compute):COMPARE_COMPUTE:-dx11 -shaderobj
 //DISABLE_TEST(compute):COMPARE_COMPUTE:-vk -shaderobj
-//TEST(compute):COMPARE_COMPUTE:-cpu -xslang -disable-specialization -shaderobj
+//DISABLE_TEST(compute):COMPARE_COMPUTE:-cpu -xslang -disable-specialization -shaderobj
 //DISABLE_TEST(compute):COMPARE_COMPUTE:-cuda -xslang -disable-specialization -shaderobj
 
 [anyValueSize(8)]
diff --git a/tests/compute/performance-profile.slang b/tests/compute/performance-profile.slang
index d8b9e31ae..24b0d04bd 100644
--- a/tests/compute/performance-profile.slang
+++ b/tests/compute/performance-profile.slang
@@ -1,5 +1,5 @@
 //TEST(compute):PERFORMANCE_PROFILE:-cpu -compute -compile-arg -O3 -compute-dispatch 256,1,1  -shaderobj
-//TEST(compute):PERFORMANCE_PROFILE:-cpu -compute -source-language cpp -compile-arg -O3 -compute-dispatch 256,1,1  -shaderobj
+//TEST(compute):PERFORMANCE_PROFILE:-cpu -compute -source-language cpp -compile-arg -O3 -compute-dispatch 256,1,1
 //TEST(compute):PERFORMANCE_PROFILE:-slang -compute -compute-dispatch 256,1,1 -shaderobj
 //TEST(compute):PERFORMANCE_PROFILE:-slang -compute -dx12 -compute-dispatch 256,1,1 -shaderobj
 //TEST(compute, vulkan):PERFORMANCE_PROFILE:-vk -compute -compute-dispatch 256,1,1 -shaderobj
diff --git a/tests/compute/unbounded-array-of-array-syntax.slang b/tests/compute/unbounded-array-of-array-syntax.slang
index 6a5f4ea6e..08ed17106 100644
--- a/tests/compute/unbounded-array-of-array-syntax.slang
+++ b/tests/compute/unbounded-array-of-array-syntax.slang
@@ -1,5 +1,5 @@
 //IGNORE_TEST:CPU_REFLECTION: -profile cs_5_0 -entry computeMain -target cpp
-//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
+//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute
 //TEST:CROSS_COMPILE:-target dxbc-assembly -entry computeMain -profile cs_5_1
 //TEST:CROSS_COMPILE:-target spirv-assembly -entry computeMain -profile cs_5_1
 //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
diff --git a/tools/gfx/cpu/render-cpu.cpp b/tools/gfx/cpu/render-cpu.cpp
new file mode 100644
index 000000000..faa6b3092
--- /dev/null
+++ b/tools/gfx/cpu/render-cpu.cpp
@@ -0,0 +1,1796 @@
+// render-cpu.cpp
+#include "render-cpu.h"
+
+#include "slang.h"
+#include "slang-com-ptr.h"
+#include "slang-com-helper.h"
+#include "core/slang-basic.h"
+#include "core/slang-blob.h"
+
+#include "../command-writer.h"
+#include "../renderer-shared.h"
+#include "../slang-context.h"
+
+#define SLANG_PRELUDE_NAMESPACE slang_prelude
+#include "prelude/slang-cpp-types.h"
+
+namespace gfx
+{
+using namespace Slang;
+
+class CPUBufferResource : public BufferResource
+{
+public:
+    CPUBufferResource(const Desc& _desc)
+        : BufferResource(_desc)
+    {}
+
+    ~CPUBufferResource()
+    {
+        if (m_data)
+        {
+            free(m_data);
+        }
+    }
+
+    SlangResult init()
+    {
+        m_data = malloc(m_desc.sizeInBytes);
+        if(!m_data) return SLANG_E_OUT_OF_MEMORY;
+        return SLANG_OK;
+    }
+
+    SlangResult setData(size_t offset, size_t size, void const* data)
+    {
+        memcpy((char*)m_data + offset, data, size);
+        return SLANG_OK;
+    }
+
+    void* m_data = nullptr;
+};
+
+struct CPUTextureBaseShapeInfo
+{
+    int32_t rank;
+    int32_t baseCoordCount;
+    int32_t implicitArrayElementCount;
+};
+
+static const CPUTextureBaseShapeInfo kCPUTextureBaseShapeInfos[(int)ITextureResource::Type::CountOf] =
+{
+    /* Unknown */       { 0, 0, 0 },
+    /* Buffer */        { 1, 1, 1 },
+    /* Texture1D */     { 1, 1, 1 },
+    /* Texture2D */     { 2, 2, 1 },
+    /* Texture3D */     { 3, 3, 1 },
+    /* TextureCube */   { 2, 3, 6 },
+};
+
+static CPUTextureBaseShapeInfo const* _getBaseShapeInfo(ITextureResource::Type baseShape)
+{
+    return &kCPUTextureBaseShapeInfos[(int)baseShape];
+}
+
+typedef void (*CPUTextureUnpackFunc)(void const* texelData, void* outData, size_t outSize);
+
+struct CPUTextureFormatInfo
+{
+    CPUTextureUnpackFunc unpackFunc;
+};
+
+template<int N>
+void _unpackFloatTexel(void const* texelData, void* outData, size_t outSize)
+{
+    auto input = (float const*) texelData;
+
+    float temp[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+    for(int i = 0; i < N; ++i)
+        temp[i] = input[i];
+
+    memcpy(outData, temp, outSize);
+}
+
+static inline float _unpackUnorm8Value(uint8_t value)
+{
+    return value / 255.0f;
+}
+
+template<int N>
+void _unpackUnorm8Texel(void const* texelData, void* outData, size_t outSize)
+{
+    auto input = (uint8_t const*) texelData;
+
+    float temp[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+    for(int i = 0; i < N; ++i)
+        temp[i] = _unpackUnorm8Value(input[i]);
+
+    memcpy(outData, temp, outSize);
+}
+
+void _unpackUnormBGRA8Texel(void const* texelData, void* outData, size_t outSize)
+{
+    auto input = (uint8_t const*) texelData;
+
+    float temp[4];
+    temp[0] = _unpackUnorm8Value(input[2]);
+    temp[1] = _unpackUnorm8Value(input[1]);
+    temp[2] = _unpackUnorm8Value(input[0]);
+    temp[3] = _unpackUnorm8Value(input[3]);
+
+    memcpy(outData, temp, outSize);
+}
+
+template<int N>
+void _unpackUInt16Texel(void const* texelData, void* outData, size_t outSize)
+{
+    auto input = (uint16_t const*) texelData;
+
+    uint32_t temp[4] = { 0, 0, 0, 0 };
+    for(int i = 0; i < N; ++i)
+        temp[i] = input[i];
+
+    memcpy(outData, temp, outSize);
+}
+
+template<int N>
+void _unpackUInt32Texel(void const* texelData, void* outData, size_t outSize)
+{
+    auto input = (uint32_t const*) texelData;
+
+    uint32_t temp[4] = { 0, 0, 0, 0 };
+    for(int i = 0; i < N; ++i)
+        temp[i] = input[i];
+
+    memcpy(outData, temp, outSize);
+}
+
+#define TEXTURE_FORMAT_INFO(FORMAT) static const CPUTextureFormatInfo kCPUTextureFormatInfo_##FORMAT
+
+TEXTURE_FORMAT_INFO(RGBA_Float32)      = { &_unpackFloatTexel<4> };
+TEXTURE_FORMAT_INFO(RGB_Float32)       = { &_unpackFloatTexel<3> };
+TEXTURE_FORMAT_INFO(RG_Float32)        = { &_unpackFloatTexel<2> };
+TEXTURE_FORMAT_INFO(R_Float32)         = { &_unpackFloatTexel<1> };
+TEXTURE_FORMAT_INFO(RGBA_Unorm_UInt8)  = { &_unpackUnorm8Texel<4> };
+TEXTURE_FORMAT_INFO(BGRA_Unorm_UInt8)  = { &_unpackUnormBGRA8Texel };
+TEXTURE_FORMAT_INFO(R_UInt16)          = { &_unpackUInt16Texel<1> };
+TEXTURE_FORMAT_INFO(R_UInt32)          = { &_unpackUInt32Texel<1> };
+TEXTURE_FORMAT_INFO(D_Float32)         = { &_unpackFloatTexel<1> };
+
+#undef TEXTURE_FORMAT_INFO
+
+static CPUTextureFormatInfo const* _getFormatInfo(Format format)
+{
+    switch(format)
+    {
+    case Format::D_Unorm24_S8:
+    default:
+        return nullptr;
+
+
+#define CASE(FORMAT) case Format::FORMAT: return &kCPUTextureFormatInfo_##FORMAT;
+    CASE(RGBA_Float32)
+    CASE(RGB_Float32)
+    CASE(RG_Float32)
+    CASE(R_Float32)
+    CASE(RGBA_Unorm_UInt8)
+    CASE(BGRA_Unorm_UInt8)
+    CASE(R_UInt16)
+    CASE(R_UInt32)
+    CASE(D_Float32)
+
+#undef CASE
+    }
+}
+
+class CPUTextureResource : public TextureResource
+{
+    enum { kMaxRank = 3 };
+
+public:
+    CPUTextureResource(const TextureResource::Desc& desc)
+        : TextureResource(desc)
+    {}
+    ~CPUTextureResource()
+    {
+    }
+
+    Result init(ITextureResource::SubresourceData const* initData)
+    {
+        auto desc = m_desc;
+
+        // The format of the texture will determine the
+        // size of the texels we allocate.
+        //
+        // TODO: Compressed formats usually work in terms
+        // of a fixed block size, so that we cannot actually
+        // compute a simple `texelSize` like this. Instead
+        // we should be computing a `blockSize` and then
+        // a `blockExtents` value that gives the extent
+        // in texels of each block. For uncompressed formats
+        // the block extents would be 1 along each axis.
+        //
+        auto format = desc.format;
+        auto texelSize = gfxGetFormatSize(format);
+        m_texelSize = (int32_t) texelSize;
+
+        int32_t formatBlockSize[kMaxRank] = { 1, 1, 1 };
+
+        auto baseShapeInfo = _getBaseShapeInfo(desc.type);
+        m_baseShape = baseShapeInfo;
+        if(!baseShapeInfo)
+            return SLANG_FAIL;
+
+        auto formatInfo = _getFormatInfo(desc.format);
+        m_formatInfo = formatInfo;
+        if(!formatInfo)
+            return SLANG_FAIL;
+
+        int32_t rank = baseShapeInfo->rank;
+        int32_t effectiveArrayElementCount = desc.arraySize ? desc.arraySize : 1;
+        effectiveArrayElementCount *= baseShapeInfo->implicitArrayElementCount;
+        m_effectiveArrayElementCount = effectiveArrayElementCount;
+
+        int32_t extents[kMaxRank];
+        extents[0] = desc.size.width;
+        extents[1] = desc.size.height;
+        extents[2] = desc.size.depth;
+
+        for(int32_t axis = rank; axis < kMaxRank; ++axis)
+            extents[axis] = 1;
+
+        int32_t levelCount = desc.numMipLevels;
+
+        m_mipLevels.setCount(levelCount);
+
+        int64_t totalDataSize = 0;
+        for( int32_t levelIndex = 0; levelIndex < levelCount; ++levelIndex )
+        {
+            auto& level = m_mipLevels[levelIndex];
+
+            for( int32_t axis = 0; axis < kMaxRank; ++axis )
+            {
+                int32_t extent = extents[axis] >> levelIndex;
+                if(extent < 1) extent = 1;
+                level.extents[axis] = extent;
+            }
+
+            level.strides[0] = texelSize;
+            for( int32_t axis = 1; axis < kMaxRank+1; ++axis)
+            {
+                level.strides[axis] = level.strides[axis-1]*level.extents[axis-1];
+            }
+
+            int64_t levelDataSize = texelSize;
+            levelDataSize *= effectiveArrayElementCount;
+            for( int32_t axis = 0; axis < rank; ++axis)
+                levelDataSize *= int64_t(level.extents[axis]);
+
+            level.offset = totalDataSize;
+            totalDataSize += levelDataSize;
+        }
+
+        void* textureData = malloc(totalDataSize);
+        m_data = textureData;
+
+        if( initData )
+        {
+            int32_t subResourceCounter = 0;
+            for(int32_t arrayElementIndex = 0; arrayElementIndex < effectiveArrayElementCount; ++arrayElementIndex)
+            {
+                for(int32_t mipLevel = 0; mipLevel < m_desc.numMipLevels; ++mipLevel)
+                {
+                    int32_t subResourceIndex = subResourceCounter++;
+
+                    auto dstRowStride = m_mipLevels[mipLevel].strides[1];
+                    auto dstLayerStride = m_mipLevels[mipLevel].strides[2];
+                    auto dstArrayStride = m_mipLevels[mipLevel].strides[3];
+
+                    auto textureRowSize = m_mipLevels[mipLevel].extents[0]*texelSize;
+
+                    auto rowCount = m_mipLevels[mipLevel].extents[1];
+                    auto depthLayerCount = m_mipLevels[mipLevel].extents[2];
+
+                    auto& srcImage = initData[subResourceIndex];
+                    ptrdiff_t srcRowStride = ptrdiff_t(srcImage.strideY);
+                    ptrdiff_t srcLayerStride = ptrdiff_t(srcImage.strideZ);
+
+                    char* dstLevel = (char*)textureData + m_mipLevels[mipLevel].offset;
+                    char* dstImage = dstLevel + dstArrayStride*arrayElementIndex;
+
+                    const char* srcLayer = (const char*) srcImage.data;
+                    char* dstLayer = dstImage;
+
+                    for(int32_t depthLayer = 0; depthLayer < depthLayerCount; ++depthLayer)
+                    {
+                        const char* srcRow = srcLayer;
+                        char* dstRow = dstLayer;
+
+                        for(int32_t row = 0; row < rowCount; ++row)
+                        {
+                            memcpy(dstRow, srcRow, textureRowSize);
+
+                            srcRow += srcRowStride;
+                            dstRow += dstRowStride;
+                        }
+
+                        srcLayer += srcLayerStride;
+                        dstLayer += dstLayerStride;
+                    }
+                }
+            }
+        }
+
+        return SLANG_OK;
+    }
+
+    /// Returns the texture description stored in `m_desc`.
+    Desc const& _getDesc()
+    {
+        return m_desc;
+    }
+    /// Returns the `format` field of the stored texture description.
+    Format getFormat()
+    {
+        auto const& description = m_desc;
+        return description.format;
+    }
+    /// Returns the `rank` recorded in the base-shape info (`m_baseShape`).
+    int32_t getRank()
+    {
+        auto const* shapeInfo = m_baseShape;
+        return shapeInfo->rank;
+    }
+
+    // Shape information for this texture; texture views read its `rank` and
+    // `baseCoordCount` fields when turning coordinates into a texel address.
+    CPUTextureBaseShapeInfo const* m_baseShape;
+    // Format information; texture views call its `unpackFunc` to convert a
+    // stored texel into the caller's output buffer.
+    CPUTextureFormatInfo const* m_formatInfo;
+    // Texture views clamp the array element index to `[0, count-1]` using
+    // this value.
+    int32_t m_effectiveArrayElementCount = 0;
+    // NOTE(review): presumably the size in bytes of one texel -- confirm
+    // where it is assigned.
+    int32_t m_texelSize = 0;
+
+    // Addressing information for one mip level inside `m_data`.
+    struct MipLevel
+    {
+        // Size of the level along each axis, in texels.
+        int32_t extents[kMaxRank];
+        // Byte strides. Each stride is the previous stride times the previous
+        // extent; the initialization code uses [1] as the row stride, [2] as
+        // the depth-layer stride and [3] as the array-element stride.
+        int64_t strides[kMaxRank+1];
+        // Byte offset of this level from the start of `m_data`.
+        int64_t offset;
+    };
+    // One entry per mip level, indexed by mip level.
+    List<MipLevel>  m_mipLevels;
+    // Start of the host-memory block that holds the texels of every mip level.
+    void*           m_data = nullptr;
+};
+
+/// Common base class for the resource views of the CPU renderer.
+///
+/// Stores the view description together with a `Kind` tag that each
+/// subclass supplies at construction time.
+class CPUResourceView : public IResourceView, public RefObject
+{
+public:
+    /// Identifies which kind of resource a view refers to.
+    enum class Kind
+    {
+        Buffer,
+        Texture,
+    };
+
+    SLANG_REF_OBJECT_IUNKNOWN_ALL
+
+    /// Returns this object as `IResourceView` when `guid` names
+    /// `ISlangUnknown` or `IResourceView`, and null for any other GUID.
+    IResourceView* getInterface(const Guid& guid)
+    {
+        const bool isSupported =
+            guid == GfxGUID::IID_ISlangUnknown ||
+            guid == GfxGUID::IID_IResourceView;
+        return isSupported ? static_cast<IResourceView*>(this) : nullptr;
+    }
+
+    /// Returns the kind tag supplied at construction.
+    Kind getViewKind() const
+    {
+        return m_kind;
+    }
+
+    /// Returns the view description supplied at construction.
+    Desc const& getDesc() const
+    {
+        return m_desc;
+    }
+
+protected:
+    // Construction is reserved for subclasses, which pass their own `Kind`.
+    CPUResourceView(Kind kind, Desc const& desc)
+        : m_kind(kind), m_desc(desc)
+    {
+    }
+
+private:
+    Kind m_kind;
+    Desc m_desc;
+};
+
+/// Resource view that refers to a `CPUBufferResource`.
+///
+/// The view holds a `RefPtr` to the buffer and tags itself as `Kind::Buffer`.
+class CPUBufferView : public CPUResourceView
+{
+public:
+    CPUBufferView(Desc const& desc, CPUBufferResource* buffer)
+        : CPUResourceView(Kind::Buffer, desc), m_buffer(buffer)
+    {
+    }
+
+    /// Returns the viewed buffer as a raw pointer.
+    CPUBufferResource* getBuffer() const
+    {
+        CPUBufferResource* viewedBuffer = m_buffer;
+        return viewedBuffer;
+    }
+
+private:
+    RefPtr<CPUBufferResource> m_buffer;
+};
+
+/// Texture view for the CPU renderer.
+///
+/// Besides being a `CPUResourceView`, this class implements the CPU prelude's
+/// `slang_prelude::IRWTexture` interface on top of the host-memory texels held
+/// by a `CPUTextureResource`. Sampling is nearest-neighbor only.
+class CPUTextureView : public CPUResourceView, public slang_prelude::IRWTexture
+{
+public:
+    CPUTextureView(Desc const& desc, CPUTextureResource* texture)
+        : CPUResourceView(Kind::Texture, desc)
+        , m_texture(texture)
+    {}
+
+    /// Returns the viewed texture resource as a raw pointer.
+    CPUTextureResource* getTexture() const { return m_texture; }
+
+    //
+    // ITexture interface
+    //
+
+    /// Fills a `TextureDimensions` from the texture description.
+    ///
+    /// NOTE: `mipLevel` is currently not consulted; width/height/depth are
+    /// copied straight from `desc.size` regardless of the requested level.
+    slang_prelude::TextureDimensions GetDimensions(int mipLevel = -1) SLANG_OVERRIDE
+    {
+        slang_prelude::TextureDimensions dimensions = {};
+
+        CPUTextureResource* texture = m_texture;
+        auto& desc = texture->_getDesc();
+        auto baseShape = texture->m_baseShape;
+
+        dimensions.arrayElementCount = desc.arraySize;
+        dimensions.numberOfLevels = desc.numMipLevels;
+        dimensions.shape = baseShape->rank;
+        dimensions.width = desc.size.width;
+        dimensions.height = desc.size.height;
+        dimensions.depth = desc.size.depth;
+
+        return dimensions;
+    }
+
+    /// Reads the texel addressed by integer `texelCoords` and unpacks it into
+    /// `outData` (at most `dataSize` bytes are passed on to the format's
+    /// unpack function).
+    void Load(const int32_t* texelCoords, void* outData, size_t dataSize) SLANG_OVERRIDE
+    {
+        void* texelPtr = _getTexelPtr(texelCoords);
+
+        m_texture->m_formatInfo->unpackFunc(texelPtr, outData, dataSize);
+    }
+
+    /// Samples the texture at mip level zero; see `SampleLevel`.
+    void Sample(slang_prelude::SamplerState samplerState, const float* coords, void* outData, size_t dataSize) SLANG_OVERRIDE
+    {
+        // We have no access to information from fragment quads, so we cannot
+        // compute the finite-difference derivatives needed from `coords`.
+        //
+        // The only reasonable thing to do is to sample mip level zero.
+        //
+        SampleLevel(samplerState, coords, 0.0f, outData, dataSize);
+    }
+
+    /// Nearest-neighbor sample at an explicit mip level.
+    ///
+    /// `level` is rounded to the nearest integer and clamped to the valid mip
+    /// range. For array and cube textures the element index is read from
+    /// `coords[baseCoordCount]`, rounded and clamped. Each of the first `rank`
+    /// coordinates is scaled by `extent-1`, rounded, and clamped to the level.
+    /// `samplerState` is not consulted.
+    void SampleLevel(slang_prelude::SamplerState samplerState, const float* coords, float level, void* outData, size_t dataSize) SLANG_OVERRIDE
+    {
+        CPUTextureResource* texture = m_texture;
+        auto baseShape = texture->m_baseShape;
+        auto& desc = texture->_getDesc();
+        int32_t rank = baseShape->rank;
+        int32_t baseCoordCount = baseShape->baseCoordCount;
+
+        int32_t integerMipLevel = int32_t(level + 0.5f);
+        if(integerMipLevel >= desc.numMipLevels) integerMipLevel = desc.numMipLevels-1;
+        if(integerMipLevel < 0) integerMipLevel = 0;
+
+        auto& mipLevelInfo = texture->m_mipLevels[integerMipLevel];
+
+        bool isArray = (desc.arraySize != 0) || (desc.type == ITextureResource::Type::TextureCube);
+        int32_t effectiveArrayElementCount = texture->m_effectiveArrayElementCount;
+        int32_t coordIndex = baseCoordCount;
+        int32_t elementIndex = 0;
+        if( isArray )
+        {
+            elementIndex = int32_t(coords[coordIndex++] + 0.5f);
+        }
+        if(elementIndex >= effectiveArrayElementCount) elementIndex = effectiveArrayElementCount-1;
+        if(elementIndex < 0) elementIndex = 0;
+
+        // Note: for now we are just going to do nearest-neighbor sampling
+        //
+        int64_t texelOffset = mipLevelInfo.offset;
+        texelOffset += elementIndex * mipLevelInfo.strides[3];
+        for(int32_t axis = 0; axis < rank; ++axis)
+        {
+            int32_t extent = mipLevelInfo.extents[axis];
+
+            float coord = coords[axis];
+
+            // TODO: deal with wrap/clamp/repeat if `coord < 0` or `coord > 1`
+
+            int32_t integerCoord = int32_t(coord*(extent-1) + 0.5f);
+
+            if(integerCoord >= extent) integerCoord = extent-1;
+            if(integerCoord < 0) integerCoord = 0;
+
+            texelOffset += integerCoord * mipLevelInfo.strides[axis];
+        }
+
+        auto texelPtr = (char const*)texture->m_data + texelOffset;
+
+        m_texture->m_formatInfo->unpackFunc(texelPtr, outData, dataSize);
+    }
+
+    //
+    // IRWTexture interface
+    //
+
+    /// Returns a pointer to the stored texel addressed by `texelCoords`.
+    void* refAt(const uint32_t* texelCoords) SLANG_OVERRIDE
+    {
+        return _getTexelPtr((int32_t const*)texelCoords);
+    }
+
+private:
+    RefPtr<CPUTextureResource> m_texture;
+
+    /// Computes the address of the texel selected by integer coordinates.
+    ///
+    /// The first `rank` entries of `texelCoords` are per-axis texel indices;
+    /// an array element index follows for array/cube textures. Every index is
+    /// clamped to its valid range so that the returned pointer always lies
+    /// inside the selected mip level.
+    void* _getTexelPtr(int32_t const* texelCoords)
+    {
+        CPUTextureResource* texture = m_texture;
+        auto baseShape = texture->m_baseShape;
+        auto& desc = texture->_getDesc();
+
+        int32_t rank = baseShape->rank;
+        int32_t baseCoordCount = baseShape->baseCoordCount;
+
+        bool isArray = (desc.arraySize != 0) || (desc.type == ITextureResource::Type::TextureCube);
+        bool isMultisample = desc.sampleDesc.numSamples > 1;
+        bool isBuffer = desc.type == ITextureResource::Type::Buffer;
+        bool hasMipLevels = !(isMultisample || isBuffer);
+
+        int32_t effectiveArrayElementCount = texture->m_effectiveArrayElementCount;
+
+        int32_t coordIndex = baseCoordCount;
+        int32_t elementIndex = 0;
+        if( isArray )
+        {
+            elementIndex = texelCoords[coordIndex++];
+        }
+        if(elementIndex >= effectiveArrayElementCount) elementIndex = effectiveArrayElementCount-1;
+        if(elementIndex < 0) elementIndex = 0;
+
+        // NOTE(review): a mip level is only read from the coordinates when the
+        // texture has *no* mip levels (multisample or buffer), which looks
+        // inverted. It is left unchanged here because `refAt` callers pass
+        // coordinates without a trailing mip level -- confirm the intended
+        // coordinate layout before changing this condition.
+        int32_t mipLevel = 0;
+        if(!hasMipLevels)
+        {
+            mipLevel = texelCoords[coordIndex++];
+        }
+        if(mipLevel >= desc.numMipLevels) mipLevel = desc.numMipLevels-1;
+        if(mipLevel < 0) mipLevel = 0;
+
+        auto& mipLevelInfo = texture->m_mipLevels[mipLevel];
+
+        int64_t texelOffset = mipLevelInfo.offset;
+        texelOffset += elementIndex * mipLevelInfo.strides[3];
+        for(int32_t axis = 0; axis < rank; ++axis)
+        {
+            int32_t coord = texelCoords[axis];
+            if(coord >= mipLevelInfo.extents[axis]) coord = mipLevelInfo.extents[axis]-1;
+            if(coord < 0) coord = 0;
+
+            // Use the clamped coordinate, not the raw input, so out-of-range
+            // coordinates cannot address memory outside of the mip level.
+            texelOffset += coord * mipLevelInfo.strides[axis];
+        }
+
+        return (char*)texture->m_data + texelOffset;
+    }
+};
+
+/// Layout information the CPU renderer keeps for one shader object type.
+///
+/// The constructor walks the binding ranges and sub-object ranges reported by
+/// Slang reflection and assigns every range a flat base index, counted
+/// separately for sub-objects and for resources.
+class CPUShaderObjectLayout : public ShaderObjectLayoutBase
+{
+public:
+
+    // TODO: Once memory lifetime stuff is handled, there is
+    // no specific need to even track binding or sub-object
+    // ranges for CPU.
+
+    /// Per-binding-range bookkeeping.
+    struct BindingRangeInfo
+    {
+        slang::BindingType bindingType;
+        Index count;
+        Index baseIndex; // Flat index for sub-objects
+
+        // TODO: The `uniformOffset` field should be removed,
+        // since it cannot be supported by the Slang reflection
+        // API once we fix some design issues.
+        //
+        // It is only being used today for pre-allocation of sub-objects
+        // for constant buffers and parameter blocks (which should be
+        // deprecated/removed anyway).
+        //
+        // Note: We would need to bring this field back, plus
+        // a lot of other complexity, if we ever want to support
+        // setting of resources/buffers directly by a binding
+        // range index and array index.
+        //
+        Index uniformOffset; // Uniform offset for a resource typed field.
+    };
+
+    /// Per-sub-object-range bookkeeping; `layout` stays null for ranges of
+    /// existential type.
+    struct SubObjectRangeInfo
+    {
+        RefPtr<CPUShaderObjectLayout> layout;
+        Index bindingRangeIndex;
+    };
+
+    size_t m_size = 0;
+    List<SubObjectRangeInfo> subObjectRanges;
+    List<BindingRangeInfo> m_bindingRanges;
+
+    Index m_subObjectCount = 0;
+    Index m_resourceCount = 0;
+
+    CPUShaderObjectLayout(RendererBase* renderer, slang::TypeLayoutReflection* layout)
+    {
+        initBase(renderer, layout);
+
+        Index subObjectTotal = 0;
+        Index resourceTotal = 0;
+
+        m_elementTypeLayout = _unwrapParameterGroups(layout);
+        m_size = m_elementTypeLayout->getSize();
+
+        // First pass: record one `BindingRangeInfo` per binding range. These
+        // ranges describe the logical contents of the object in memory; they
+        // relate to the descriptor ranges of the various sets, but not always
+        // one-to-one.
+
+        SlangInt rangeCount = m_elementTypeLayout->getBindingRangeCount();
+        for (SlangInt rangeIndex = 0; rangeIndex < rangeCount; ++rangeIndex)
+        {
+            slang::BindingType rangeType = m_elementTypeLayout->getBindingRangeType(rangeIndex);
+            SlangInt bindingCount = m_elementTypeLayout->getBindingRangeBindingCount(rangeIndex);
+
+            // The leaf type layout is queried here but not used in this pass.
+            slang::TypeLayoutReflection* leafTypeLayout =
+                m_elementTypeLayout->getBindingRangeLeafTypeLayout(rangeIndex);
+
+            SlangInt setIndex = m_elementTypeLayout->getBindingRangeDescriptorSetIndex(rangeIndex);
+            SlangInt firstRangeInSet =
+                m_elementTypeLayout->getBindingRangeFirstDescriptorRangeIndex(rangeIndex);
+
+            // TODO: This logic assumes that for any binding range that might consume
+            // multiple kinds of resources, the descriptor range for its uniform
+            // usage will be the first one in the range.
+            //
+            // We need to decide whether that assumption is one we intend to support
+            // applications making, or whether they should be forced to perform a
+            // linear search over the descriptor ranges for a specific binding range.
+            //
+            auto uniformOffset = m_elementTypeLayout->getDescriptorSetDescriptorRangeIndexOffset(
+                setIndex, firstRangeInSet);
+
+            // Constant buffers, parameter blocks and existential values are
+            // counted as sub-objects; every other binding type is a resource.
+            const bool countsAsSubObject =
+                rangeType == slang::BindingType::ConstantBuffer ||
+                rangeType == slang::BindingType::ParameterBlock ||
+                rangeType == slang::BindingType::ExistentialValue;
+            Index& runningTotal = countsAsSubObject ? subObjectTotal : resourceTotal;
+
+            BindingRangeInfo rangeInfo;
+            rangeInfo.bindingType = rangeType;
+            rangeInfo.count = bindingCount;
+            rangeInfo.baseIndex = runningTotal;
+            rangeInfo.uniformOffset = uniformOffset;
+            m_bindingRanges.add(rangeInfo);
+
+            runningTotal += bindingCount;
+        }
+
+        m_subObjectCount = subObjectTotal;
+        m_resourceCount = resourceTotal;
+
+        // Second pass: record one `SubObjectRangeInfo` per sub-object range.
+
+        SlangInt subRangeCount = m_elementTypeLayout->getSubObjectRangeCount();
+        for (SlangInt subRangeIndex = 0; subRangeIndex < subRangeCount; ++subRangeIndex)
+        {
+            SlangInt bindingRangeIndex =
+                m_elementTypeLayout->getSubObjectRangeBindingRangeIndex(subRangeIndex);
+            auto rangeType = m_elementTypeLayout->getBindingRangeType(bindingRangeIndex);
+            slang::TypeLayoutReflection* leafTypeLayout =
+                m_elementTypeLayout->getBindingRangeLeafTypeLayout(bindingRangeIndex);
+
+            // A sub-object range can either represent a sub-object of a known
+            // type, like a `ConstantBuffer<Foo>` or `ParameterBlock<Foo>`
+            // (in which case we can pre-compute a layout to use, based on
+            // the type `Foo`) *or* it can represent a sub-object of some
+            // existential type (e.g., `IBar`) in which case we cannot
+            // know the appropriate type/layout of sub-object to allocate.
+            //
+            SubObjectRangeInfo subRangeInfo;
+            subRangeInfo.bindingRangeIndex = bindingRangeIndex;
+            if (rangeType != slang::BindingType::ExistentialValue)
+            {
+                subRangeInfo.layout =
+                    new CPUShaderObjectLayout(renderer, leafTypeLayout->getElementTypeLayout());
+            }
+            subObjectRanges.add(subRangeInfo);
+        }
+    }
+
+    /// Size reported by the element type layout, cached at construction.
+    size_t getSize()
+    {
+        return m_size;
+    }
+
+    /// Total binding count over all resource binding ranges.
+    Index getResourceCount() const
+    {
+        return m_resourceCount;
+    }
+
+    /// Total binding count over all sub-object binding ranges.
+    Index getSubObjectCount() const
+    {
+        return m_subObjectCount;
+    }
+};
+
+/// Shader object layout built from an entry point's type layout.
+///
+/// Keeps the `slang::EntryPointLayout` it was built from so that the entry
+/// point name can be queried later.
+class CPUEntryPointLayout : public CPUShaderObjectLayout
+{
+private:
+    slang::EntryPointLayout* m_entryPointLayout = nullptr;
+
+public:
+    CPUEntryPointLayout(RendererBase* renderer, slang::EntryPointLayout* entryPointLayout)
+        : CPUShaderObjectLayout(renderer, entryPointLayout->getTypeLayout())
+        , m_entryPointLayout(entryPointLayout)
+    {
+    }
+
+    /// Returns the name reported by the entry point's reflection object.
+    const char* getEntryPointName()
+    {
+        return m_entryPointLayout->getName();
+    }
+};
+
+/// Shader object layout for a whole program.
+///
+/// The base layout is built from the program's global-parameters type layout,
+/// and one `CPUEntryPointLayout` is created per entry point of the program.
+class CPUProgramLayout : public CPUShaderObjectLayout
+{
+public:
+    slang::ProgramLayout* m_programLayout = nullptr;
+    List<RefPtr<CPUEntryPointLayout>> m_entryPointLayouts;
+
+    CPUProgramLayout(RendererBase* renderer, slang::ProgramLayout* programLayout)
+        : CPUShaderObjectLayout(renderer, programLayout->getGlobalParamsTypeLayout())
+        , m_programLayout(programLayout)
+    {
+        for (UInt entryPointIndex = 0;
+             entryPointIndex < programLayout->getEntryPointCount();
+             entryPointIndex++)
+        {
+            auto entryPointReflection = programLayout->getEntryPointByIndex(entryPointIndex);
+            m_entryPointLayouts.add(new CPUEntryPointLayout(renderer, entryPointReflection));
+        }
+    }
+
+    /// Returns the index of the first entry point whose name equals
+    /// `kernelName`, or -1 when no entry point matches.
+    int getKernelIndex(UnownedStringSlice kernelName)
+    {
+        int matchIndex = -1;
+        auto entryPointCount = (int) m_programLayout->getEntryPointCount();
+        for (int candidate = 0; candidate < entryPointCount && matchIndex < 0; candidate++)
+        {
+            auto entryPoint = m_programLayout->getEntryPointByIndex(candidate);
+            if (kernelName == entryPoint->getName())
+                matchIndex = candidate;
+        }
+        return matchIndex;
+    }
+
+    /// Writes the compute thread-group size of entry point `kernelIndex`
+    /// into `threadGroupSizes` (three axes are requested).
+    void getKernelThreadGroupSize(int kernelIndex, UInt* threadGroupSizes)
+    {
+        m_programLayout->getEntryPointByIndex(kernelIndex)
+            ->getComputeThreadGroupSize(3, threadGroupSizes);
+    }
+
+    /// Returns the layout created for entry point `index`.
+    CPUEntryPointLayout* getEntryPoint(Index index)
+    {
+        return m_entryPointLayouts[index];
+    }
+};
+
+/// Shader object for the CPU renderer.
+///
+/// The object's ordinary data lives in the block pointed to by `m_data`
+/// (released with `free` in the destructor). Bound sub-objects and resource
+/// views are kept alive in `m_objects` / `m_resources`, both indexed by the
+/// flat index `bindingRange.baseIndex + bindingArrayIndex`, and the pointers
+/// the compiled code needs are additionally written into `m_data`.
+class CPUShaderObject : public ShaderObjectBase
+{
+public:
+    void* m_data = nullptr;
+
+    ~CPUShaderObject()
+    {
+        free(m_data);
+    }
+
+    List<RefPtr<CPUShaderObject>> m_objects;
+    List<RefPtr<CPUResourceView>> m_resources;
+
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        init(IDevice* device, CPUShaderObjectLayout* typeLayout);
+
+    /// Returns this object's layout downcast to `CPUShaderObjectLayout`.
+    CPUShaderObjectLayout* getLayout()
+    {
+        return static_cast<CPUShaderObjectLayout*>(m_layout.Ptr());
+    }
+
+#if 0
+    virtual SLANG_NO_THROW Result SLANG_MCALL initBuffer(IDevice* device, size_t bufferSize)
+    {
+        BufferResource::Desc bufferDesc;
+        bufferDesc.init(bufferSize);
+        bufferDesc.cpuAccessFlags |= IResource::AccessFlag::Write;
+        ComPtr<IBufferResource> constantBuffer;
+        SLANG_RETURN_ON_FAIL(renderer->createBufferResource(
+            IResource::Usage::ConstantBuffer, bufferDesc, nullptr, constantBuffer.writeRef()));
+        bufferResource = dynamic_cast<MemoryCUDAResource*>(constantBuffer.get());
+        return SLANG_OK;
+    }
+#endif
+
+#if 0
+    virtual SLANG_NO_THROW void* SLANG_MCALL getBuffer()
+    {
+        return bufferResource ? bufferResource->m_cudaMemory : nullptr;
+    }
+
+    virtual SLANG_NO_THROW size_t SLANG_MCALL getBufferSize()
+    {
+        return bufferResource ? bufferResource->getDesc()->sizeInBytes : 0;
+    }
+#endif
+
+    virtual SLANG_NO_THROW slang::TypeLayoutReflection* SLANG_MCALL getElementTypeLayout() override
+    {
+        return getLayout()->getElementTypeLayout();
+    }
+
+    /// A plain shader object has no entry points.
+    virtual SLANG_NO_THROW UInt SLANG_MCALL getEntryPointCount() override { return 0; }
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        getEntryPoint(UInt index, IShaderObject** outEntryPoint) override
+    {
+        *outEntryPoint = nullptr;
+        return SLANG_OK;
+    }
+
+    /// Copies `size` bytes from `data` into this object's data block at
+    /// `offset.uniformOffset`.
+    ///
+    /// The copy is truncated so that it never runs past the size reported by
+    /// the layout; an offset at or beyond that size copies nothing. The call
+    /// always returns `SLANG_OK`.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        setData(ShaderOffset const& offset, void const* data, size_t size)
+    {
+        size_t const layoutSize = getLayout()->getSize();
+        size_t const writeOffset = size_t(offset.uniformOffset);
+
+        // Guard against the unsigned subtraction below wrapping around, which
+        // would turn an out-of-range offset into an out-of-bounds write.
+        if (writeOffset >= layoutSize)
+            return SLANG_OK;
+
+        size = Math::Min(size, layoutSize - writeOffset);
+        memcpy((char*)m_data + writeOffset, data, size);
+        return SLANG_OK;
+    }
+
+    /// Returns (with an added reference) the sub-object stored for the
+    /// binding range / array index named by `offset`.
+    virtual SLANG_NO_THROW Result SLANG_MCALL getObject(
+        ShaderOffset const& offset,
+        IShaderObject**     outObject)
+    {
+        auto layout = getLayout();
+
+        auto bindingRangeIndex = offset.bindingRangeIndex;
+        SLANG_ASSERT(bindingRangeIndex >= 0);
+        SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount());
+
+        auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex];
+        auto subObjectIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
+        CPUShaderObject* subObject = m_objects[subObjectIndex];
+
+        *outObject = ComPtr<IShaderObject>(subObject).detach();
+
+        return SLANG_OK;
+    }
+
+    /// Stores `object` as the sub-object for the binding range / array index
+    /// named by `offset`, and writes the data the compiled code expects into
+    /// this object's data block: a pointer to the sub-object's data for
+    /// ordinary ranges, or the existential tuple fields for interface-typed
+    /// ranges.
+    virtual SLANG_NO_THROW Result SLANG_MCALL setObject(
+        ShaderOffset const& offset,
+        IShaderObject*      object)
+    {
+        auto layout = getLayout();
+
+        auto bindingRangeIndex = offset.bindingRangeIndex;
+        SLANG_ASSERT(bindingRangeIndex >= 0);
+        SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount());
+
+        auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex];
+        auto subObjectIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
+
+        CPUShaderObject* subObject = static_cast<CPUShaderObject*>(object);
+        m_objects[subObjectIndex] = subObject;
+
+        switch( bindingRange.bindingType )
+        {
+        default:
+            SLANG_RETURN_ON_FAIL(setData(offset, &subObject->m_data, sizeof(void*)));
+            break;
+
+        // If the range being assigned into represents an interface/existential-type leaf field,
+        // then we need to consider how the `object` being assigned here affects specialization.
+        // We may also need to assign some data from the sub-object into the ordinary data
+        // buffer for the parent object.
+        //
+        case slang::BindingType::ExistentialValue:
+            {
+                auto renderer = getRenderer();
+
+                ComPtr<slang::ISession> slangSession;
+                SLANG_RETURN_ON_FAIL(renderer->getSlangSession(slangSession.writeRef()));
+
+                // A leaf field of interface type is laid out inside of the parent object
+                // as a tuple of `(RTTI, WitnessTable, Payload)`. The layout of these fields
+                // is a contract between the compiler and any runtime system, so we will
+                // need to rely on details of the binary layout.
+
+                // We start by querying the layout/type of the concrete value that the application
+                // is trying to store into the field, and also the layout/type of the leaf
+                // existential-type field itself.
+                //
+                auto concreteTypeLayout = subObject->getElementTypeLayout();
+                auto concreteType = concreteTypeLayout->getType();
+                //
+                auto existentialTypeLayout = layout->getElementTypeLayout()->getBindingRangeLeafTypeLayout(bindingRangeIndex);
+                auto existentialType = existentialTypeLayout->getType();
+
+                // The first field of the tuple (offset zero) is the run-time type information (RTTI)
+                // ID for the concrete type being stored into the field.
+                //
+                // TODO: We need to be able to gather the RTTI type ID from `object` and then
+                // use `setData(offset, &TypeID, sizeof(TypeID))`.
+
+                // The second field of the tuple (offset 8) is the ID of the "witness" for the
+                // conformance of the concrete type to the interface used by this field.
+                //
+                auto witnessTableOffset = offset;
+                witnessTableOffset.uniformOffset += 8;
+                //
+                // Conformances of a type to an interface are computed and then stored by the
+                // Slang runtime, so we can look up the ID for this particular conformance (which
+                // will create it on demand).
+                //
+                // Note: If the type doesn't actually conform to the required interface for
+                // this sub-object range, then this is the point where we will detect that
+                // fact and error out.
+                //
+                uint32_t conformanceID = 0xFFFFFFFF;
+                SLANG_RETURN_ON_FAIL(slangSession->getTypeConformanceWitnessSequentialID(
+                    concreteType, existentialType, &conformanceID));
+                //
+                // Once we have the conformance ID, then we can write it into the object
+                // at the required offset.
+                //
+                SLANG_RETURN_ON_FAIL(setData(witnessTableOffset, &conformanceID, sizeof(conformanceID)));
+
+                // The third field of the tuple (offset 16) is the "payload" that is supposed to
+                // hold the data for a value of the given concrete type.
+                //
+                auto payloadOffset = offset;
+                payloadOffset.uniformOffset += 16;
+
+                // There are two cases we need to consider here for how the payload might be used:
+                //
+                // * If the concrete type of the value being bound is one that can "fit" into the
+                //   available payload space,  then it should be stored in the payload.
+                //
+                // * If the concrete type of the value cannot fit in the payload space, then it
+                //   will need to be stored somewhere else.
+                //
+                if(_doesValueFitInExistentialPayload(concreteTypeLayout, existentialTypeLayout))
+                {
+                    // If the value can fit in the payload area, then we will go ahead and copy
+                    // its bytes into that area.
+                    //
+                    auto valueSize = concreteTypeLayout->getSize();
+                    SLANG_RETURN_ON_FAIL(setData(payloadOffset, subObject->m_data, valueSize));
+                }
+                else
+                {
+                    // If the value cannot fit in the payload area, then we will pass a pointer
+                    // to the sub-object instead.
+                    //
+                    // Note: The Slang compiler does not currently emit code that handles the
+                    // pointer case, but that is the expected implementation for values
+                    // that do not fit into the fixed-size payload.
+                    //
+                    SLANG_RETURN_ON_FAIL(setData(payloadOffset, &subObject->m_data, sizeof(void*)));
+                }
+            }
+            break;
+        }
+
+        return SLANG_OK;
+    }
+
+    /// Stores `inView` as the resource for the binding range / array index
+    /// named by `offset` and writes its CPU-side representation into this
+    /// object's data block:
+    ///
+    /// * texture views: a `slang_prelude::IRWTexture*` pointing at the view;
+    /// * buffer views: the buffer's data pointer followed by a count, which is
+    ///   `sizeInBytes` divided by `elementSize` when `elementSize > 1`.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        setResource(ShaderOffset const& offset, IResourceView* inView)
+    {
+        auto layout = getLayout();
+
+        auto bindingRangeIndex = offset.bindingRangeIndex;
+        SLANG_ASSERT(bindingRangeIndex >= 0);
+        SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount());
+
+        auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex];
+        auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
+
+
+        auto view = static_cast<CPUResourceView*>(inView);
+        m_resources[viewIndex] = view;
+
+        switch( view->getViewKind() )
+        {
+        case CPUResourceView::Kind::Texture:
+            {
+                auto textureView = static_cast<CPUTextureView*>(view);
+
+                slang_prelude::IRWTexture* textureObj = textureView;
+                SLANG_RETURN_ON_FAIL(setData(offset, &textureObj, sizeof(textureObj)));
+            }
+            break;
+
+        case CPUResourceView::Kind::Buffer:
+            {
+                auto bufferView = static_cast<CPUBufferView*>(view);
+                auto buffer = bufferView->getBuffer();
+                auto desc = *buffer->getDesc();
+
+                void* dataPtr = buffer->m_data;
+                size_t size = desc.sizeInBytes;
+                if (desc.elementSize > 1)
+                    size /= desc.elementSize;
+
+                auto ptrOffset = offset;
+                SLANG_RETURN_ON_FAIL(setData(ptrOffset, &dataPtr, sizeof(dataPtr)));
+
+                auto sizeOffset = offset;
+                sizeOffset.uniformOffset += sizeof(dataPtr);
+                SLANG_RETURN_ON_FAIL(setData(sizeOffset, &size, sizeof(size)));
+            }
+            break;
+        }
+
+        return SLANG_OK;
+    }
+
+    /// Samplers are ignored by this implementation; always succeeds.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        setSampler(ShaderOffset const& offset, ISamplerState* sampler)
+    {
+        SLANG_UNUSED(sampler);
+        SLANG_UNUSED(offset);
+        return SLANG_OK;
+    }
+
+    /// Binds only the texture half of a combined texture/sampler pair (the
+    /// sampler is ignored) and reports any failure from `setResource`.
+    virtual SLANG_NO_THROW Result SLANG_MCALL setCombinedTextureSampler(
+        ShaderOffset const& offset, IResourceView* textureView, ISamplerState* sampler)
+    {
+        SLANG_UNUSED(sampler);
+        SLANG_RETURN_ON_FAIL(setResource(offset, textureView));
+        return SLANG_OK;
+    }
+
+    // Appends all types that are used to specialize the element type of this shader object in `args` list.
+    virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override
+    {
+        // TODO: the logic here is a copy-paste of `GraphicsCommonShaderObject::collectSpecializationArgs`,
+        // consider moving the implementation to `ShaderObjectBase` and share the logic among different implementations.
+
+        auto& subObjectRanges = getLayout()->subObjectRanges;
+        // The following logic is built on the assumption that all fields that involve existential types (and
+        // therefore require specialization) will results in a sub-object range in the type layout.
+        // This allows us to simply scan the sub-object ranges to find out all specialization arguments.
+        for (Index subObjIndex = 0; subObjIndex < subObjectRanges.getCount(); subObjIndex++)
+        {
+            // Retrieve the corresponding binding range of the sub object.
+            auto const& bindingRange = getLayout()->m_bindingRanges[subObjectRanges[subObjIndex].bindingRangeIndex];
+
+            // `m_objects` is indexed by the flat sub-object index (the same
+            // `baseIndex` used by `getObject`/`setObject`), which differs from
+            // the sub-object *range* index whenever an earlier range holds
+            // more than one object.
+            auto const objectIndex = bindingRange.baseIndex;
+
+            switch (bindingRange.bindingType)
+            {
+            case slang::BindingType::ExistentialValue:
+            {
+                // A binding type of `ExistentialValue` means the sub-object represents a interface-typed field.
+                // In this case the specialization argument for this field is the actual specialized type of the bound
+                // shader object. If the shader object's type is an ordinary type without existential fields, then the
+                // type argument will simply be the ordinary type. But if the sub object's type is itself a specialized
+                // type, we need to make sure to use that type as the specialization argument.
+
+                // TODO: need to implement the case where the field is an array of existential values.
+                SLANG_ASSERT(bindingRange.count == 1);
+                ExtendedShaderObjectType specializedSubObjType;
+                SLANG_RETURN_ON_FAIL(m_objects[objectIndex]->getSpecializedShaderObjectType(&specializedSubObjType));
+                args.add(specializedSubObjType);
+                break;
+            }
+            case slang::BindingType::ParameterBlock:
+            case slang::BindingType::ConstantBuffer:
+                // Currently we only handle the case where the field's type is
+                // `ParameterBlock<SomeStruct>` or `ConstantBuffer<SomeStruct>`, where `SomeStruct` is a struct type
+                // (not directly an interface type). In this case, we just recursively collect the specialization arguments
+                // from the bound sub object.
+                SLANG_RETURN_ON_FAIL(m_objects[objectIndex]->collectSpecializationArgs(args));
+                // TODO: we need to handle the case where the field is of the form `ParameterBlock<IFoo>`. We should treat
+                // this case the same way as the `ExistentialValue` case here, but currently we lack a mechanism to distinguish
+                // the two scenarios.
+                break;
+            }
+            // TODO: need to handle another case where specialization happens on resources fields e.g. `StructuredBuffer<IFoo>`.
+        }
+        return SLANG_OK;
+    }
+};
+
+/// Shader object variant whose layout is a `CPUEntryPointLayout`.
+class CPUEntryPointShaderObject : public CPUShaderObject
+{
+public:
+    /// Returns this object's layout downcast to `CPUEntryPointLayout`.
+    CPUEntryPointLayout* getLayout()
+    {
+        auto* baseLayout = m_layout.Ptr();
+        return static_cast<CPUEntryPointLayout*>(baseLayout);
+    }
+};
+
+/// Root shader object: a shader object for a program's global parameters
+/// that additionally owns one `CPUEntryPointShaderObject` per entry in
+/// `m_entryPoints`.
+class CPURootShaderObject : public CPUShaderObject
+{
+public:
+    SlangResult init(IDevice* device, CPUProgramLayout* programLayout);
+
+    /// Returns this object's layout downcast to `CPUProgramLayout`.
+    CPUProgramLayout* getLayout()
+    {
+        auto* baseLayout = m_layout.Ptr();
+        return static_cast<CPUProgramLayout*>(baseLayout);
+    }
+
+    /// Returns the entry-point object stored at `index` (no added reference).
+    CPUEntryPointShaderObject* getEntryPoint(Index index)
+    {
+        return m_entryPoints[index];
+    }
+
+    List<RefPtr<CPUEntryPointShaderObject>> m_entryPoints;
+
+    virtual SLANG_NO_THROW UInt SLANG_MCALL getEntryPointCount() override
+    {
+        return m_entryPoints.getCount();
+    }
+
+    /// Hands out the entry-point object at `index` with an added reference.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        getEntryPoint(UInt index, IShaderObject** outEntryPoint) override
+    {
+        ComPtr<IShaderObject> entryPoint(m_entryPoints[index]);
+        *outEntryPoint = entryPoint.detach();
+        return SLANG_OK;
+    }
+
+    /// Collects specialization arguments from the root object itself first,
+    /// then from each entry point in order, stopping at the first failure.
+    virtual Result collectSpecializationArgs(ExtendedShaderObjectTypeList& args) override
+    {
+        SLANG_RETURN_ON_FAIL(CPUShaderObject::collectSpecializationArgs(args));
+        for (Index entryPointIndex = 0; entryPointIndex < m_entryPoints.getCount(); ++entryPointIndex)
+        {
+            SLANG_RETURN_ON_FAIL(m_entryPoints[entryPointIndex]->collectSpecializationArgs(args));
+        }
+        return SLANG_OK;
+    }
+};
+
+/// Shader program for the CPU renderer; holds a reference to the
+/// `CPUProgramLayout` describing its parameters.
+class CPUShaderProgram : public ShaderProgramBase
+{
+public:
+    RefPtr<CPUProgramLayout> layout;
+
+    // Nothing to release explicitly; members clean up after themselves.
+    ~CPUShaderProgram() {}
+};
+
+/// Pipeline state for the CPU renderer; only compute pipelines are set up here.
+class CPUPipelineState : public PipelineStateBase
+{
+public:
+    /// Returns the pipeline's program downcast to `CPUShaderProgram`.
+    CPUShaderProgram* getProgram()
+    {
+        auto* program = m_program.get();
+        return static_cast<CPUShaderProgram*>(program);
+    }
+
+    /// Wraps `inDesc` in a `PipelineStateDesc` tagged as a compute pipeline
+    /// and hands it to `initializeBase`.
+    void init(const ComputePipelineStateDesc& inDesc)
+    {
+        PipelineStateDesc wrappedDesc;
+        wrappedDesc.type = PipelineType::Compute;
+        wrappedDesc.compute = inDesc;
+        initializeBase(wrappedDesc);
+    }
+};
+
+class CPUDevice : public RendererBase
+{
+private:
+    // NOTE(review): within this class these two references are only ever reset
+    // in the destructor; the live bindings are tracked by `CommandQueueImpl`.
+    RefPtr<CPUPipelineState> m_currentPipeline = nullptr;
+    RefPtr<CPURootShaderObject> m_currentRootObject = nullptr;
+    // Filled in by `initialize()` and returned from `getDeviceInfo()`.
+    DeviceInfo m_info;
+
+    // Forward declaration; the queue is defined after the command buffer.
+    class CommandQueueImpl;
+
+    class CommandBufferImpl
+        : public ICommandBuffer
+        , public CommandWriter
+        , public RefObject
+    {
+    public:
+        SLANG_REF_OBJECT_IUNKNOWN_ALL
+        // Interface lookup: answers for `ISlangUnknown` and `ICommandBuffer`,
+        // returns null for any other GUID.
+        ICommandBuffer* getInterface(const Guid& guid)
+        {
+            const bool isSupported =
+                guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandBuffer;
+            return isSupported ? static_cast<ICommandBuffer*>(this) : nullptr;
+        }
+    public:
+        // Render encoding is not provided by this back-end: both inputs are
+        // ignored and the caller receives a null encoder.
+        virtual SLANG_NO_THROW void SLANG_MCALL encodeRenderCommands(
+            IRenderPassLayout* renderPass,
+            IFramebuffer* framebuffer,
+            IRenderCommandEncoder** outEncoder) override
+        {
+            *outEncoder = nullptr;
+            SLANG_UNUSED(framebuffer);
+            SLANG_UNUSED(renderPass);
+        }
+
+        // Compute encoder that forwards every call to a `CommandWriter`
+        // (the command buffer that handed it out), which records the command.
+        class ComputeCommandEncoderImpl
+            : public IComputeCommandEncoder
+        {
+        public:
+            // Answers for `ISlangUnknown` and `IComputeCommandEncoder`; any
+            // other UUID yields a null object and `SLANG_E_NO_INTERFACE`.
+            virtual SLANG_NO_THROW SlangResult SLANG_MCALL
+                queryInterface(SlangUUID const& uuid, void** outObject) override
+            {
+                const bool isKnown = uuid == GfxGUID::IID_ISlangUnknown ||
+                                     uuid == GfxGUID::IID_IComputeCommandEncoder;
+                if (!isKnown)
+                {
+                    *outObject = nullptr;
+                    return SLANG_E_NO_INTERFACE;
+                }
+                *outObject = static_cast<IComputeCommandEncoder*>(this);
+                return SLANG_OK;
+            }
+            // Not reference counted: both calls just report a count of 1.
+            virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() { return 1; }
+            virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() { return 1; }
+
+        public:
+            // Destination for recorded commands; set by `init`.
+            CommandWriter* m_writer;
+
+            // Nothing to flush when encoding ends.
+            virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
+
+            // Points this encoder at the command buffer it records into.
+            void init(CommandBufferImpl* cmdBuffer) { m_writer = cmdBuffer; }
+
+            virtual SLANG_NO_THROW void SLANG_MCALL setPipelineState(IPipelineState* state) override
+            {
+                m_writer->setPipelineState(state);
+            }
+
+            // Records a root-object binding for the compute pipeline type.
+            virtual SLANG_NO_THROW void SLANG_MCALL
+                bindRootShaderObject(IShaderObject* object) override
+            {
+                m_writer->bindRootShaderObject(PipelineType::Compute, object);
+            }
+
+            // Records a descriptor-set binding for the compute pipeline type.
+            virtual SLANG_NO_THROW void SLANG_MCALL setDescriptorSet(
+                IPipelineLayout* layout,
+                UInt index,
+                IDescriptorSet* descriptorSet) override
+            {
+                m_writer->setDescriptorSet(PipelineType::Compute, layout, index, descriptorSet);
+            }
+
+            virtual SLANG_NO_THROW void SLANG_MCALL dispatchCompute(int x, int y, int z) override
+            {
+                m_writer->dispatchCompute(x, y, z);
+            }
+        };
+
+        ComputeCommandEncoderImpl m_computeCommandEncoder;
+        // Hands out the embedded compute encoder after (re)attaching it to
+        // this command buffer.
+        virtual SLANG_NO_THROW void SLANG_MCALL
+            encodeComputeCommands(IComputeCommandEncoder** outEncoder) override
+        {
+            auto* encoder = &m_computeCommandEncoder;
+            encoder->init(this);
+            *outEncoder = encoder;
+        }
+
+        // Resource encoder that forwards buffer copy/upload calls to a
+        // `CommandWriter` (the command buffer that handed it out).
+        class ResourceCommandEncoderImpl
+            : public IResourceCommandEncoder
+        {
+        public:
+            // Answers for `ISlangUnknown` and `IResourceCommandEncoder`; any
+            // other UUID yields a null object and `SLANG_E_NO_INTERFACE`.
+            virtual SLANG_NO_THROW SlangResult SLANG_MCALL
+                queryInterface(SlangUUID const& uuid, void** outObject) override
+            {
+                const bool isKnown = uuid == GfxGUID::IID_ISlangUnknown ||
+                                     uuid == GfxGUID::IID_IResourceCommandEncoder;
+                if (!isKnown)
+                {
+                    *outObject = nullptr;
+                    return SLANG_E_NO_INTERFACE;
+                }
+                *outObject = static_cast<IResourceCommandEncoder*>(this);
+                return SLANG_OK;
+            }
+            // Not reference counted: both calls just report a count of 1.
+            virtual SLANG_NO_THROW uint32_t SLANG_MCALL addRef() { return 1; }
+            virtual SLANG_NO_THROW uint32_t SLANG_MCALL release() { return 1; }
+
+        public:
+            // Destination for recorded commands; set by `init`.
+            CommandWriter* m_writer;
+
+            // Points this encoder at the command buffer it records into.
+            void init(CommandBufferImpl* cmdBuffer) { m_writer = cmdBuffer; }
+
+            // Nothing to flush when encoding ends.
+            virtual SLANG_NO_THROW void SLANG_MCALL endEncoding() override {}
+
+            // Records a copy of `size` bytes from `src` + `srcOffset` to `dst` + `dstOffset`.
+            virtual SLANG_NO_THROW void SLANG_MCALL copyBuffer(
+                IBufferResource* dst,
+                size_t dstOffset,
+                IBufferResource* src,
+                size_t srcOffset,
+                size_t size) override
+            {
+                m_writer->copyBuffer(dst, dstOffset, src, srcOffset, size);
+            }
+
+            // Records an upload of `size` bytes from `data` into `dst` at `offset`.
+            virtual SLANG_NO_THROW void SLANG_MCALL
+                uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
+            {
+                m_writer->uploadBufferData(dst, offset, size, data);
+            }
+        };
+
+        ResourceCommandEncoderImpl m_resourceCommandEncoder;
+
+        // Hands out the embedded resource encoder after (re)attaching it to
+        // this command buffer.
+        virtual SLANG_NO_THROW void SLANG_MCALL
+            encodeResourceCommands(IResourceCommandEncoder** outEncoder) override
+        {
+            auto* encoder = &m_resourceCommandEncoder;
+            encoder->init(this);
+            *outEncoder = encoder;
+        }
+
+        // Closing is a no-op for this command buffer.
+        virtual SLANG_NO_THROW void SLANG_MCALL close() override {}
+    };
+
+    class CommandQueueImpl
+        : public ICommandQueue
+        , public RefObject
+    {
+    public:
+        SLANG_REF_OBJECT_IUNKNOWN_ALL
+        // Interface lookup: answers for `ISlangUnknown` and `ICommandQueue`,
+        // returns null for any other GUID.
+        ICommandQueue* getInterface(const Guid& guid)
+        {
+            const bool isSupported =
+                guid == GfxGUID::IID_ISlangUnknown || guid == GfxGUID::IID_ICommandQueue;
+            return isSupported ? static_cast<ICommandQueue*>(this) : nullptr;
+        }
+
+    public:
+        RefPtr<CPUPipelineState> currentPipeline;
+        RefPtr<CPURootShaderObject> currentRootObject;
+        RefPtr<CPUDevice> renderer;
+        Desc m_desc;
+    public:
+        // Attaches the queue to its owning device (held by strong reference)
+        // and marks the queue description as a graphics-type queue.
+        void init(CPUDevice* inRenderer)
+        {
+            m_desc.type = ICommandQueue::QueueType::Graphics;
+            renderer = inRenderer;
+        }
+        // Drops the bound pipeline first and the bound root object second,
+        // before the remaining members are destroyed.
+        ~CommandQueueImpl()
+        {
+            currentPipeline = nullptr;
+            currentRootObject = nullptr;
+        }
+
+    public:
+        // Returns the queue description populated by `init`.
+        virtual SLANG_NO_THROW const Desc& SLANG_MCALL getDesc() override
+        {
+            return m_desc;
+        }
+        // Allocates an empty command buffer and transfers the reference to
+        // the caller through `outCommandBuffer`.
+        virtual SLANG_NO_THROW Result SLANG_MCALL
+            createCommandBuffer(ICommandBuffer** outCommandBuffer) override
+        {
+            RefPtr<CommandBufferImpl> commandBuffer = new CommandBufferImpl();
+            *outCommandBuffer = commandBuffer.detach();
+            return SLANG_OK;
+        }
+
+        // Replays the first `count` command buffers, in order, before
+        // returning; each one is treated as a `CommandBufferImpl`.
+        virtual SLANG_NO_THROW void SLANG_MCALL
+            executeCommandBuffers(uint32_t count, ICommandBuffer* const* commandBuffers) override
+        {
+            ICommandBuffer* const* const end = commandBuffers + count;
+            for (ICommandBuffer* const* cursor = commandBuffers; cursor != end; ++cursor)
+            {
+                execute(static_cast<CommandBufferImpl*>(*cursor));
+            }
+        }
+
+        // Nothing to wait for: `executeCommandBuffers` runs each buffer
+        // before it returns.
+        virtual SLANG_NO_THROW void SLANG_MCALL wait() override
+        {}
+
+    public:
+        // Makes `state` the current pipeline; it is treated as a
+        // `CPUPipelineState` (unchecked downcast).
+        void setPipelineState(IPipelineState* state)
+        {
+            currentPipeline = static_cast<CPUPipelineState*>(state);
+        }
+
+        // Makes `object` the current root shader object (unchecked downcast to
+        // `CPURootShaderObject`). `pipelineType` is not consulted.
+        // Returns `SLANG_E_INVALID_ARG` when the resulting binding is null.
+        Result bindRootShaderObject(PipelineType pipelineType, IShaderObject* object)
+        {
+            currentRootObject = static_cast<CPURootShaderObject*>(object);
+            if (!currentRootObject)
+                return SLANG_E_INVALID_ARG;
+            return SLANG_OK;
+        }
+
+        // Runs entry point 0 of the current pipeline's program on the host.
+        //
+        // The kernel is looked up by name in the host-callable shared library
+        // of the (possibly specialized) program and invoked once with start
+        // group ID (0,0,0) and end group ID (x,y,z), the entry-point
+        // parameter data and the global parameter data.
+        //
+        // The function cannot report errors, so every precondition that would
+        // otherwise end in a null dereference or a call through a null function
+        // pointer turns the dispatch into a no-op instead.
+        void dispatchCompute(int x, int y, int z)
+        {
+            int entryPointIndex = 0;
+            int targetIndex = 0;
+
+            // Both a pipeline and a root object must have been bound.
+            if (!currentPipeline || !currentRootObject)
+                return;
+
+            // The root object must actually carry the entry point we run.
+            if (currentRootObject->m_entryPoints.getCount() <= entryPointIndex)
+                return;
+
+            // Specialize the compute kernel based on the shader object bindings.
+            // If no pipeline comes back, keep the previous binding untouched.
+            RefPtr<PipelineStateBase> newPipeline;
+            renderer->maybeSpecializePipeline(currentPipeline, currentRootObject, newPipeline);
+            if (!newPipeline)
+                return;
+            currentPipeline = static_cast<CPUPipelineState*>(newPipeline.Ptr());
+
+            auto program = currentPipeline->getProgram();
+            if (!program || !program->slangProgram)
+                return;
+
+            auto entryPointLayout = currentRootObject->getLayout()->getEntryPoint(entryPointIndex);
+            auto entryPointName = entryPointLayout->getEntryPointName();
+
+            auto entryPointObject = currentRootObject->getEntryPoint(entryPointIndex);
+
+            // Code generation can fail, in which case no library is produced.
+            ComPtr<ISlangSharedLibrary> sharedLibrary;
+            program->slangProgram->getEntryPointHostCallable(entryPointIndex, targetIndex, sharedLibrary.writeRef());
+            if (!sharedLibrary)
+                return;
+
+            auto func = (slang_prelude::ComputeFunc) sharedLibrary->findSymbolAddressByName(entryPointName);
+            if (!func)
+                return;
+
+            slang_prelude::ComputeVaryingInput varyingInput;
+            varyingInput.startGroupID.x = 0;
+            varyingInput.startGroupID.y = 0;
+            varyingInput.startGroupID.z = 0;
+            varyingInput.endGroupID.x = x;
+            varyingInput.endGroupID.y = y;
+            varyingInput.endGroupID.z = z;
+
+            auto globalParamsData = currentRootObject->m_data;
+            auto entryPointParamsData = entryPointObject->m_data;
+            func(&varyingInput, entryPointParamsData, globalParamsData);
+        }
+
+        // Copies `size` bytes from `src` + `srcOffset` to `dst` + `dstOffset`
+        // with `memcpy`. Both buffers are treated as `CPUBufferResource`;
+        // offsets and size are not validated here.
+        void copyBuffer(
+            IBufferResource* dst,
+            size_t dstOffset,
+            IBufferResource* src,
+            size_t srcOffset,
+            size_t size)
+        {
+            auto target = static_cast<CPUBufferResource*>(dst);
+            auto source = static_cast<CPUBufferResource*>(src);
+
+            uint8_t* targetBytes = (uint8_t*)target->m_data + dstOffset;
+            uint8_t* sourceBytes = (uint8_t*)source->m_data + srcOffset;
+            memcpy(targetBytes, sourceBytes, size);
+        }
+
+        // Copies `size` bytes from `data` into `dst` at byte `offset` with
+        // `memcpy`; `dst` is treated as a `CPUBufferResource` and the range is
+        // not validated here.
+        void uploadBufferData(IBufferResource* dst, size_t offset, size_t size, void* data)
+        {
+            auto target = static_cast<CPUBufferResource*>(dst);
+            uint8_t* targetBytes = (uint8_t*)target->m_data + offset;
+            memcpy(targetBytes, data, size);
+        }
+
+        // Replays the commands recorded in `commandBuffer` in order, decoding
+        // each command's operands and calling the matching queue method.
+        // Command names without a case below are ignored.
+        void execute(CommandBufferImpl* commandBuffer)
+        {
+            for (auto& cmd : commandBuffer->m_commands)
+            {
+                auto& ops = cmd.operands;
+                switch (cmd.name)
+                {
+                case CommandName::SetPipelineState:
+                    {
+                        auto state = commandBuffer->getObject<IPipelineState>(ops[0]);
+                        setPipelineState(state);
+                    }
+                    break;
+                case CommandName::BindRootShaderObject:
+                    {
+                        auto rootObject = commandBuffer->getObject<IShaderObject>(ops[1]);
+                        bindRootShaderObject((PipelineType)ops[0], rootObject);
+                    }
+                    break;
+                case CommandName::DispatchCompute:
+                    dispatchCompute(int(ops[0]), int(ops[1]), int(ops[2]));
+                    break;
+                case CommandName::CopyBuffer:
+                    {
+                        auto dstBuffer = commandBuffer->getObject<IBufferResource>(ops[0]);
+                        auto srcBuffer = commandBuffer->getObject<IBufferResource>(ops[2]);
+                        copyBuffer(dstBuffer, ops[1], srcBuffer, ops[3], ops[4]);
+                    }
+                    break;
+                case CommandName::UploadBufferData:
+                    {
+                        auto dstBuffer = commandBuffer->getObject<IBufferResource>(ops[0]);
+                        auto payload = commandBuffer->getData<uint8_t>(ops[3]);
+                        uploadBufferData(dstBuffer, ops[1], ops[2], payload);
+                    }
+                    break;
+                default:
+                    break;
+                }
+            }
+        }
+    };
+
+public:
+    // Drops the pipeline reference first and the root-object reference second,
+    // before the remaining members are destroyed.
+    ~CPUDevice()
+    {
+        m_currentPipeline = nullptr;
+        m_currentRootObject = nullptr;
+    }
+
+    // Sets up the Slang context for host-callable code generation, runs the
+    // shared `RendererBase` initialization, and then fills in the device info
+    // reported by `getDeviceInfo()`. Fails with the first error encountered.
+    virtual SLANG_NO_THROW SlangResult SLANG_MCALL initialize(const Desc& desc) override
+    {
+        SLANG_RETURN_ON_FAIL(slangContext.initialize(desc.slang, SLANG_HOST_CALLABLE, "sm_5_1"));
+        SLANG_RETURN_ON_FAIL(RendererBase::initialize(desc));
+
+        // 4x4 identity, copied verbatim into the projection-matrix field.
+        static const float kIdentity[] = {
+            1, 0, 0, 0,
+            0, 1, 0, 0,
+            0, 0, 1, 0,
+            0, 0, 0, 1};
+        ::memcpy(m_info.identityProjectionMatrix, kIdentity, sizeof(kIdentity));
+
+        m_info.apiName = "CPU";
+        m_info.adapterName = "CPU";
+        m_info.deviceType = DeviceType::CPU;
+        m_info.bindingStyle = BindingStyle::CUDA;
+        m_info.projectionStyle = ProjectionStyle::DirectX;
+
+        return SLANG_OK;
+    }
+
+    // Creates a host-memory texture described by `desc` and initializes it
+    // from `initData`. `initialUsage` is not consulted. On failure nothing is
+    // written to `outResource`.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createTextureResource(
+        IResource::Usage initialUsage,
+        const ITextureResource::Desc& desc,
+        const ITextureResource::SubresourceData* initData,
+        ITextureResource** outResource) override
+    {
+        RefPtr<CPUTextureResource> newTexture = new CPUTextureResource(desc);
+        SLANG_RETURN_ON_FAIL(newTexture->init(initData));
+
+        *outResource = newTexture.detach();
+        return SLANG_OK;
+    }
+
+    // Creates a host-memory buffer described by `desc`. When `initData` is
+    // non-null, `desc.sizeInBytes` bytes are copied from it to offset 0.
+    // `initialUsage` is not consulted. On failure nothing is written to
+    // `outResource`.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createBufferResource(
+        IResource::Usage initialUsage,
+        const IBufferResource::Desc& desc,
+        const void* initData,
+        IBufferResource** outResource) override
+    {
+        RefPtr<CPUBufferResource> newBuffer = new CPUBufferResource(desc);
+        SLANG_RETURN_ON_FAIL(newBuffer->init());
+
+        if (initData != nullptr)
+        {
+            SLANG_RETURN_ON_FAIL(newBuffer->setData(0, desc.sizeInBytes, initData));
+        }
+
+        *outResource = newBuffer.detach();
+        return SLANG_OK;
+    }
+
+    // Wraps `inTexture` (treated as a `CPUTextureResource`) in a new view
+    // described by `desc`; always succeeds.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createTextureView(
+        ITextureResource* inTexture, IResourceView::Desc const& desc, IResourceView** outView) override
+    {
+        RefPtr<CPUTextureView> textureView =
+            new CPUTextureView(desc, static_cast<CPUTextureResource*>(inTexture));
+        *outView = textureView.detach();
+        return SLANG_OK;
+    }
+
+    // Wraps `inBuffer` (treated as a `CPUBufferResource`) in a new view
+    // described by `desc`; always succeeds.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createBufferView(
+        IBufferResource* inBuffer, IResourceView::Desc const& desc, IResourceView** outView) override
+    {
+        RefPtr<CPUBufferView> bufferView =
+            new CPUBufferView(desc, static_cast<CPUBufferResource*>(inBuffer));
+        *outView = bufferView.detach();
+        return SLANG_OK;
+    }
+
+    // Builds the CPU-specific shader object layout for `typeLayout` and
+    // transfers the reference to the caller; always succeeds.
+    virtual Result createShaderObjectLayout(
+        slang::TypeLayoutReflection*    typeLayout,
+        ShaderObjectLayoutBase**        outLayout) override
+    {
+        RefPtr<CPUShaderObjectLayout> newLayout = new CPUShaderObjectLayout(this, typeLayout);
+        *outLayout = newLayout.detach();
+        return SLANG_OK;
+    }
+
+    // Creates a shader object for `layout` (treated as a
+    // `CPUShaderObjectLayout`). On failure nothing is written to `outObject`.
+    virtual Result createShaderObject(
+        ShaderObjectLayoutBase* layout,
+        IShaderObject**         outObject) override
+    {
+        RefPtr<CPUShaderObject> shaderObject = new CPUShaderObject();
+        SLANG_RETURN_ON_FAIL(
+            shaderObject->init(this, static_cast<CPUShaderObjectLayout*>(layout)));
+
+        *outObject = shaderObject.detach();
+        return SLANG_OK;
+    }
+
+    // Creates the root shader object for `program` (treated as a
+    // `CPUShaderProgram`).
+    //
+    // A program can exist without a layout (one created without a Slang
+    // program attached); initializing a root object from a null layout would
+    // dereference it, so that case - like a null `program` - is rejected with
+    // `SLANG_E_INVALID_ARG`. On failure nothing is written to `outObject`.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createRootShaderObject(IShaderProgram* program, IShaderObject** outObject) override
+    {
+        auto cpuProgram = static_cast<CPUShaderProgram*>(program);
+        if (!cpuProgram || !cpuProgram->layout)
+            return SLANG_E_INVALID_ARG;
+
+        CPUProgramLayout* cpuProgramLayout = cpuProgram->layout;
+
+        RefPtr<CPURootShaderObject> result = new CPURootShaderObject();
+        SLANG_RETURN_ON_FAIL(result->init(this, cpuProgramLayout));
+        *outObject = result.detach();
+        return SLANG_OK;
+    }
+
+    // Creates a program object.
+    //
+    // * `kernelCount == 0`: delegated to `createProgramFromSlang`.
+    // * `kernelCount == 1`: a `CPUShaderProgram` is created; when
+    //   `desc.slangProgram` is set, the program keeps it and gets a layout
+    //   built from its reflection (`SLANG_FAIL` if no reflection layout is
+    //   available).
+    // * anything else: `SLANG_E_INVALID_ARG`.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createProgram(const IShaderProgram::Desc& desc, IShaderProgram** outProgram) override
+    {
+        if (desc.kernelCount == 0)
+            return createProgramFromSlang(this, desc, outProgram);
+
+        if (desc.kernelCount != 1)
+            return SLANG_E_INVALID_ARG;
+
+        RefPtr<CPUShaderProgram> newProgram = new CPUShaderProgram();
+
+        // TODO: stuff?
+
+        if (auto slangProgram = desc.slangProgram)
+        {
+            newProgram->slangProgram = slangProgram;
+
+            auto reflectionLayout = slangProgram->getLayout();
+            if (!reflectionLayout)
+                return SLANG_FAIL;
+
+            RefPtr<CPUProgramLayout> programLayout = new CPUProgramLayout(this, reflectionLayout);
+            programLayout->m_programLayout = reflectionLayout;
+            newProgram->layout = programLayout;
+        }
+
+        *outProgram = newProgram.detach();
+        return SLANG_OK;
+    }
+
+    // Creates a compute pipeline state initialized from `desc`; always
+    // reports success.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createComputePipelineState(
+        const ComputePipelineStateDesc& desc, IPipelineState** outState) override
+    {
+        RefPtr<CPUPipelineState> pipeline = new CPUPipelineState();
+        pipeline->init(desc);
+        *outState = pipeline.detach();
+        return SLANG_OK;
+    }
+
+    // Returns the device description filled in by `initialize()`.
+    virtual SLANG_NO_THROW const DeviceInfo& SLANG_MCALL getDeviceInfo() const override
+    {
+        return m_info;
+    }
+
+public:
+    // Creates a command queue bound to this device; `desc` is not consulted
+    // and the call always succeeds.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createCommandQueue(const ICommandQueue::Desc& desc, ICommandQueue** outQueue) override
+    {
+        RefPtr<CommandQueueImpl> newQueue = new CommandQueueImpl();
+        newQueue->init(this);
+
+        *outQueue = newQueue.detach();
+        return SLANG_OK;
+    }
+    // Swapchains are not provided by this device: always returns `SLANG_FAIL`
+    // and leaves `outSwapchain` untouched.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createSwapchain(
+        const ISwapchain::Desc& desc, WindowHandle window, ISwapchain** outSwapchain) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(window);
+        SLANG_UNUSED(outSwapchain);
+        return SLANG_FAIL;
+    }
+    // Framebuffer layouts are not provided by this device: always returns
+    // `SLANG_FAIL` and leaves `outLayout` untouched.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createFramebufferLayout(
+        const IFramebufferLayout::Desc& desc, IFramebufferLayout** outLayout) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(outLayout);
+        return SLANG_FAIL;
+    }
+    // Framebuffers are not provided by this device: always returns
+    // `SLANG_FAIL` and leaves `outFramebuffer` untouched.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createFramebuffer(const IFramebuffer::Desc& desc, IFramebuffer** outFramebuffer) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(outFramebuffer);
+        return SLANG_FAIL;
+    }
+    // Render pass layouts are not provided by this device: always returns
+    // `SLANG_FAIL` and leaves `outRenderPassLayout` untouched.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createRenderPassLayout(
+        const IRenderPassLayout::Desc& desc,
+        IRenderPassLayout** outRenderPassLayout) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(outRenderPassLayout);
+        return SLANG_FAIL;
+    }
+    // No sampler object is created: the call reports success but hands back a
+    // null sampler, so callers must tolerate `*outSampler == nullptr`.
+    // NOTE(review): presumably success is reported so that callers which
+    // create samplers unconditionally keep working - confirm.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createSamplerState(ISamplerState::Desc const& desc, ISamplerState** outSampler) override
+    {
+        SLANG_UNUSED(desc);
+        *outSampler = nullptr;
+        return SLANG_OK;
+    }
+    
+    // Input layouts are not available on this device: always returns
+    // `SLANG_E_NOT_AVAILABLE` and leaves `outLayout` untouched.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createInputLayout(
+        const InputElementDesc* inputElements,
+        UInt inputElementCount,
+        IInputLayout** outLayout) override
+    {
+        SLANG_UNUSED(inputElements);
+        SLANG_UNUSED(inputElementCount);
+        SLANG_UNUSED(outLayout);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    // Descriptor set layouts are not available on this device: always returns
+    // `SLANG_E_NOT_AVAILABLE` and leaves `outLayout` untouched.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createDescriptorSetLayout(
+        const IDescriptorSetLayout::Desc& desc, IDescriptorSetLayout** outLayout) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(outLayout);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    // Pipeline layouts are not available on this device: always returns
+    // `SLANG_E_NOT_AVAILABLE` and leaves `outLayout` untouched.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createPipelineLayout(const IPipelineLayout::Desc& desc, IPipelineLayout** outLayout) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(outLayout);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    // Descriptor sets are not available on this device: always returns
+    // `SLANG_E_NOT_AVAILABLE` and leaves `outDescriptorSet` untouched.
+    virtual SLANG_NO_THROW Result SLANG_MCALL
+        createDescriptorSet(IDescriptorSetLayout* layout, IDescriptorSet::Flag::Enum flags, IDescriptorSet** outDescriptorSet) override
+    {
+        SLANG_UNUSED(layout);
+        SLANG_UNUSED(flags);
+        SLANG_UNUSED(outDescriptorSet);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    // Graphics pipelines are not available on this device: always returns
+    // `SLANG_E_NOT_AVAILABLE` and leaves `outState` untouched.
+    virtual SLANG_NO_THROW Result SLANG_MCALL createGraphicsPipelineState(
+        const GraphicsPipelineStateDesc& desc, IPipelineState** outState) override
+    {
+        SLANG_UNUSED(desc);
+        SLANG_UNUSED(outState);
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    // Texture read-back is not implemented: always returns
+    // `SLANG_E_NOT_AVAILABLE` and writes none of the output parameters.
+    virtual SLANG_NO_THROW SlangResult SLANG_MCALL readTextureResource(
+        ITextureResource* texture,
+        ResourceState state,
+        ISlangBlob** outBlob,
+        size_t* outRowPitch,
+        size_t* outPixelSize) override
+    {
+        SLANG_UNUSED(texture);
+        SLANG_UNUSED(outBlob);
+        SLANG_UNUSED(outRowPitch);
+        SLANG_UNUSED(outPixelSize);
+
+        return SLANG_E_NOT_AVAILABLE;
+    }
+    // Copies `size` bytes starting at byte `offset` of `buffer` (treated as a
+    // `CPUBufferResource`) into a freshly allocated blob and hands the blob to
+    // the caller. The requested range is not validated against the buffer.
+    virtual SLANG_NO_THROW Result SLANG_MCALL readBufferResource(
+        IBufferResource* buffer,
+        size_t offset,
+        size_t size,
+        ISlangBlob** outBlob) override
+    {
+        auto source = static_cast<CPUBufferResource*>(buffer);
+
+        RefPtr<ListBlob> contents = new ListBlob();
+        contents->m_data.setCount((Index)size);
+
+        const uint8_t* sourceBytes = (uint8_t*)source->m_data + offset;
+        memcpy(contents->m_data.getBuffer(), sourceBytes, size);
+
+        *outBlob = contents.detach();
+        return SLANG_OK;
+    }
+};
+
+// Initializes this shader object for `typeLayout`:
+//   1. allocates zero-filled host storage for the uniform data,
+//   2. sizes the resource and sub-object arrays,
+//   3. creates and binds a sub-object for every sub-object range whose layout
+//      is known up front.
+//
+// Returns `SLANG_E_OUT_OF_MEMORY` if the uniform storage cannot be allocated,
+// or the first failure reported while creating/binding a sub-object.
+SlangResult CPUShaderObject::init(IDevice* device, CPUShaderObjectLayout* typeLayout)
+{
+    m_layout = typeLayout;
+
+    // If the layout tells us that there is any uniform data, then we need
+    // host memory to hold that data.
+    //
+    // The storage is zero-filled rather than left uninitialized: slots that
+    // nobody binds (e.g. existential-typed fields skipped below, or resources
+    // that are never set) would otherwise expose arbitrary heap contents to
+    // the kernel.
+    //
+    auto slangLayout = getLayout()->getElementTypeLayout();
+    size_t uniformSize = slangLayout->getSize();
+    if (uniformSize)
+    {
+        m_data = calloc(1, uniformSize);
+        if (!m_data)
+            return SLANG_E_OUT_OF_MEMORY;
+    }
+
+    // If the layout specifies that we have any resources or sub-objects,
+    // then we need to size the appropriate arrays to account for them.
+    //
+    // Note: the counts here are the *total* number of resources/sub-objects
+    // and not just the number of resource/sub-object ranges.
+    //
+    m_resources.setCount(typeLayout->getResourceCount());
+    m_objects.setCount(typeLayout->getSubObjectCount());
+
+    for (const auto& subObjectRange : getLayout()->subObjectRanges)
+    {
+        RefPtr<CPUShaderObjectLayout> subObjectLayout = subObjectRange.layout;
+
+        // In the case where the sub-object range represents an
+        // existential-type leaf field (e.g., an `IBar`), we
+        // cannot pre-allocate the object(s) to go into that
+        // range, since we can't possibly know what to allocate
+        // at this point.
+        //
+        if (!subObjectLayout)
+            continue;
+        //
+        // Otherwise, we will allocate a sub-object to fill
+        // in each entry in this range, based on the layout
+        // information we already have.
+
+        auto& bindingRangeInfo = getLayout()->m_bindingRanges[subObjectRange.bindingRangeIndex];
+        for (Index i = 0; i < bindingRangeInfo.count; ++i)
+        {
+            RefPtr<CPUShaderObject> subObject = new CPUShaderObject();
+            SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout));
+
+            // Each array element of the range occupies one pointer-sized slot
+            // in the parent's uniform data.
+            ShaderOffset offset;
+            offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i;
+            offset.bindingRangeIndex = subObjectRange.bindingRangeIndex;
+            offset.bindingArrayIndex = i;
+
+            SLANG_RETURN_ON_FAIL(setObject(offset, subObject));
+        }
+    }
+    return SLANG_OK;
+}
+
+// Initializes the global-scope part through the base class and then creates
+// one entry-point shader object per entry-point layout, appending each to
+// `m_entryPoints`. Stops at (and returns) the first failure.
+SlangResult CPURootShaderObject::init(IDevice* device, CPUProgramLayout* programLayout)
+{
+    SLANG_RETURN_ON_FAIL(CPUShaderObject::init(device, programLayout));
+
+    for (auto& entryPointLayout : programLayout->m_entryPointLayouts)
+    {
+        RefPtr<CPUEntryPointShaderObject> entryPointObject = new CPUEntryPointShaderObject();
+        SLANG_RETURN_ON_FAIL(entryPointObject->init(device, entryPointLayout));
+        m_entryPoints.add(entryPointObject);
+    }
+
+    return SLANG_OK;
+}
+
+// Factory for the CPU device.
+//
+// Returns `SLANG_E_INVALID_ARG` when `desc` or `outDevice` is null (both are
+// dereferenced below), or the failure reported by `CPUDevice::initialize`.
+// On success the caller receives ownership of one reference through
+// `outDevice`; on failure `outDevice` is not written.
+SlangResult SLANG_MCALL createCPUDevice(const IDevice::Desc* desc, IDevice** outDevice)
+{
+    if (!desc || !outDevice)
+        return SLANG_E_INVALID_ARG;
+
+    RefPtr<CPUDevice> result = new CPUDevice();
+    SLANG_RETURN_ON_FAIL(result->initialize(*desc));
+    *outDevice = result.detach();
+    return SLANG_OK;
+}
+
+}
diff --git a/tools/gfx/cpu/render-cpu.h b/tools/gfx/cpu/render-cpu.h
new file mode 100644
index 000000000..fca57aa4d
--- /dev/null
+++ b/tools/gfx/cpu/render-cpu.h
@@ -0,0 +1,11 @@
+// render-cpu.h
+#pragma once
+
+#include "../renderer-shared.h"
+
+namespace gfx
+{
+
+SlangResult SLANG_MCALL createCPUDevice(const IDevice::Desc* desc, IDevice** outDevice);
+
+}
diff --git a/tools/gfx/cuda/render-cuda.cpp b/tools/gfx/cuda/render-cuda.cpp
index d13045359..89aaa33aa 100644
--- a/tools/gfx/cuda/render-cuda.cpp
+++ b/tools/gfx/cuda/render-cuda.cpp
@@ -278,36 +278,17 @@ public:
     List<SubObjectRangeInfo> subObjectRanges;
     List<BindingRangeInfo> m_bindingRanges;
 
-    slang::TypeLayoutReflection* unwrapParameterGroups(slang::TypeLayoutReflection* typeLayout)
-    {
-        for (;;)
-        {
-            if (!typeLayout->getType())
-            {
-                if (auto elementTypeLayout = typeLayout->getElementTypeLayout())
-                    typeLayout = elementTypeLayout;
-            }
-
-            switch (typeLayout->getKind())
-            {
-            default:
-                return typeLayout;
-
-            case slang::TypeReflection::Kind::ConstantBuffer:
-            case slang::TypeReflection::Kind::ParameterBlock:
-                typeLayout = typeLayout->getElementTypeLayout();
-                continue;
-            }
-        }
-    }
+    Index m_subObjectCount = 0;
+    Index m_resourceCount = 0;
 
     CUDAShaderObjectLayout(RendererBase* renderer, slang::TypeLayoutReflection* layout)
     {
         initBase(renderer, layout);
 
         Index subObjectCount = 0;
+        Index resourceCount = 0;
 
-        m_elementTypeLayout = unwrapParameterGroups(layout);
+        m_elementTypeLayout = _unwrapParameterGroups(layout);
 
         // Compute the binding ranges that are used to store
         // the logical contents of the object in memory. These will relate
@@ -348,6 +329,8 @@ public:
                 break;
 
             default:
+                baseIndex = resourceCount;
+                resourceCount += count;
                 break;
             }
 
@@ -359,6 +342,9 @@ public:
             m_bindingRanges.add(bindingRangeInfo);
         }
 
+        m_subObjectCount = subObjectCount;
+        m_resourceCount = resourceCount;
+
         SlangInt subObjectRangeCount = m_elementTypeLayout->getSubObjectRangeCount();
         for (SlangInt r = 0; r < subObjectRangeCount; ++r)
         {
@@ -387,6 +373,9 @@ public:
             subObjectRanges.add(subObjectRange);
         }
     }
+
+    Index getResourceCount() const { return m_resourceCount; }
+    Index getSubObjectCount() const { return m_subObjectCount; }
 };
 
 class CUDAProgramLayout : public CUDAShaderObjectLayout
@@ -503,6 +492,11 @@ public:
     {
         auto subObjectIndex =
             getLayout()->m_bindingRanges[offset.bindingRangeIndex].baseIndex + offset.bindingArrayIndex;
+
+        SLANG_ASSERT(subObjectIndex < objects.getCount());
+        if(subObjectIndex >= objects.getCount())
+            return SLANG_E_INVALID_ARG;
+
         if (subObjectIndex >= objects.getCount())
         {
             *object = nullptr;
@@ -525,8 +519,6 @@ public:
 
         auto subObjectIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
         auto subObject = dynamic_cast<CUDAShaderObject*>(object);
-        if (subObjectIndex >= objects.getCount())
-            objects.setCount(subObjectIndex + 1);
 
         // TODO: We should really not need to retain the objects here
         objects[subObjectIndex] = subObject;
@@ -635,10 +627,19 @@ public:
     virtual SLANG_NO_THROW Result SLANG_MCALL
         setResource(ShaderOffset const& offset, IResourceView* resourceView)
     {
+        auto layout = getLayout();
+
+        auto bindingRangeIndex = offset.bindingRangeIndex;
+        SLANG_ASSERT(bindingRangeIndex >= 0);
+        SLANG_ASSERT(bindingRangeIndex < layout->m_bindingRanges.getCount());
+
+        auto& bindingRange = layout->m_bindingRanges[bindingRangeIndex];
+
+        auto viewIndex = bindingRange.baseIndex + offset.bindingArrayIndex;
         auto cudaView = dynamic_cast<CUDAResourceView*>(resourceView);
-        if (offset.bindingRangeIndex >= resources.getCount())
-            resources.setCount(offset.bindingRangeIndex + 1);
-        resources[offset.bindingRangeIndex] = cudaView;
+
+        resources[viewIndex] = cudaView;
+
         if (cudaView->textureResource)
         {
             if (cudaView->desc.type == IResourceView::Type::UnorderedAccess)
@@ -2059,9 +2060,15 @@ SlangResult CUDAShaderObject::init(IDevice* device, CUDAShaderObjectLayout* type
         initBuffer(device, uniformSize);
     }
 
-    // If the layout specifies that we have any sub-objects, then
-    // we need to size the array to account for them.
+    // If the layout specifies that we have any resources or sub-objects,
+    // then we need to size the appropriate arrays to account for them.
+    //
+    // Note: the counts here are the *total* number of resources/sub-objects
+    // and not just the number of resource/sub-object ranges.
     //
+    resources.setCount(typeLayout->getResourceCount());
+    objects.setCount(typeLayout->getSubObjectCount());
+
     Index subObjectCount = slangLayout->getSubObjectRangeCount();
     objects.setCount(subObjectCount);
 
@@ -2087,11 +2094,13 @@ SlangResult CUDAShaderObject::init(IDevice* device, CUDAShaderObjectLayout* type
         {
             RefPtr<CUDAShaderObject> subObject = new CUDAShaderObject();
             SLANG_RETURN_ON_FAIL(subObject->init(device, subObjectLayout));
-            objects[bindingRangeInfo.baseIndex + i] = subObject;
+
             ShaderOffset offset;
             offset.uniformOffset = bindingRangeInfo.uniformOffset + sizeof(void*) * i;
-            if (subObject->bufferResource)
-                SLANG_RETURN_ON_FAIL(setData(offset, &subObject->bufferResource->m_cudaMemory, sizeof(void*)));
+            offset.bindingRangeIndex = subObjectRange.bindingRangeIndex;
+            offset.bindingArrayIndex = i;
+
+            SLANG_RETURN_ON_FAIL(setObject(offset, subObject));
         }
     }
     return SLANG_OK;
diff --git a/tools/gfx/render-graphics-common.cpp b/tools/gfx/render-graphics-common.cpp
index 10713e92b..5ae148ea0 100644
--- a/tools/gfx/render-graphics-common.cpp
+++ b/tools/gfx/render-graphics-common.cpp
@@ -109,29 +109,6 @@ public:
             }
         }
 
-        slang::TypeLayoutReflection* unwrapParameterGroups(slang::TypeLayoutReflection* typeLayout)
-        {
-            for (;;)
-            {
-                if (!typeLayout->getType())
-                {
-                    if (auto elementTypeLayout = typeLayout->getElementTypeLayout())
-                        typeLayout = elementTypeLayout;
-                }
-
-                switch (typeLayout->getKind())
-                {
-                default:
-                    return typeLayout;
-
-                case slang::TypeReflection::Kind::ConstantBuffer:
-                case slang::TypeReflection::Kind::ParameterBlock:
-                    typeLayout = typeLayout->getElementTypeLayout();
-                    continue;
-                }
-            }
-        }
-
         void _addDescriptorSets(
             slang::TypeLayoutReflection* typeLayout,
             slang::VariableLayoutReflection* varLayout = nullptr)
@@ -178,7 +155,7 @@ public:
 
         Result setElementTypeLayout(slang::TypeLayoutReflection* typeLayout)
         {
-            typeLayout = unwrapParameterGroups(typeLayout);
+            typeLayout = _unwrapParameterGroups(typeLayout);
 
             m_elementTypeLayout = typeLayout;
 
@@ -414,8 +391,8 @@ public:
 
     struct Builder : Super::Builder
     {
-        Builder(IDevice* renderer)
-            : Super::Builder(static_cast<RendererBase*>(renderer))
+        Builder(IDevice* device) // Constructs the builder by handing `device` to the base-class builder.
+            : Super::Builder(static_cast<RendererBase*>(device)) // NOTE(review): unchecked static_cast — assumes every IDevice passed here is a RendererBase; confirm.
         {}
 
         Result build(EntryPointLayout** outLayout)
@@ -1249,7 +1226,7 @@ protected:
             return SLANG_OK;
 
         // Once we have computed how large the buffer should be, we can allocate
-        // it using the existing public `IRenderer` API.
+        // it using the existing public `IDevice` API.
         //
         IDevice* device = getRenderer();
         IBufferResource::Desc bufferDesc;
diff --git a/tools/gfx/render.cpp b/tools/gfx/render.cpp
index 4ecb52287..e7d30b728 100644
--- a/tools/gfx/render.cpp
+++ b/tools/gfx/render.cpp
@@ -7,6 +7,7 @@
 #include "open-gl/render-gl.h"
 #include "vulkan/render-vk.h"
 #include "cuda/render-cuda.h"
+#include "cpu/render-cpu.h"
 #include <cstring>
 
 namespace gfx {
@@ -97,6 +98,11 @@ extern "C"
                 return createVKDevice(desc, outDevice);
             }
 #endif
+        case DeviceType::CPU:
+            {
+                return createCPUDevice(desc, outDevice);
+            }
+            break;
 
         default:
             return SLANG_FAIL;
@@ -154,5 +160,4 @@ extern "C"
     }
 }
 
-
 } // renderer_test
diff --git a/tools/gfx/renderer-shared.h b/tools/gfx/renderer-shared.h
index 9fe9768f4..2a77dcb93 100644
--- a/tools/gfx/renderer-shared.h
+++ b/tools/gfx/renderer-shared.h
@@ -149,6 +149,30 @@ protected:
     slang::TypeLayoutReflection* m_elementTypeLayout = nullptr;
     ShaderComponentID m_componentID = 0;
 
+    static slang::TypeLayoutReflection* _unwrapParameterGroups(slang::TypeLayoutReflection* typeLayout) // Peels ConstantBuffer / ParameterBlock wrappers off `typeLayout` and returns the first layout of any other kind.
+    {
+        for (;;) // Only exit is the `default` case of the switch below.
+        {
+            if (!typeLayout->getType()) // Layout reports no type: step to its element layout, but only when one exists.
+            {
+                if (auto elementTypeLayout = typeLayout->getElementTypeLayout())
+                    typeLayout = elementTypeLayout;
+            }
+
+            switch (typeLayout->getKind())
+            {
+            default: // Any kind other than the two listed below ends the unwrapping.
+                return typeLayout;
+
+            case slang::TypeReflection::Kind::ConstantBuffer:
+            case slang::TypeReflection::Kind::ParameterBlock:
+                typeLayout = typeLayout->getElementTypeLayout(); // NOTE(review): unlike the branch above, this result is not null-checked before the next iteration uses it — confirm it cannot be null for these kinds.
+                continue;
+            }
+        }
+    }
+
+
 public:
     RendererBase* getDevice() { return m_renderer; }
 
diff --git a/tools/render-test/cpu-compute-util.cpp b/tools/render-test/cpu-compute-util.cpp
index 7c9103cb3..6682eef1a 100644
--- a/tools/render-test/cpu-compute-util.cpp
+++ b/tools/render-test/cpu-compute-util.cpp
@@ -52,15 +52,15 @@ struct ValueTexture : public CPUComputeUtil::Resource, public CPPPrelude::ITextu
     {
         return _calcMipDims(mipLevel, m_dims);
     }
-    virtual void Load(const int32_t* loc, void* out) SLANG_OVERRIDE
+    virtual void Load(const int32_t* loc, void* out, size_t dataSize) SLANG_OVERRIDE // Ignores `loc` and `dataSize`; the result is whatever `_set` writes to `out`. NOTE(review): `dataSize` is not forwarded — confirm `_set` cannot write past the caller's buffer.
     {
         _set(out);
     }
-    virtual void Sample(CPPPrelude::SamplerState samplerState, const float* loc, void* out) SLANG_OVERRIDE
+    virtual void Sample(CPPPrelude::SamplerState samplerState, const float* loc, void* out, size_t dataSize) SLANG_OVERRIDE // Ignores the sampler, `loc` and `dataSize`; only calls `_set(out)`. NOTE(review): `dataSize` is not forwarded — confirm `_set` cannot write past the caller's buffer.
     {
         _set(out);
     }
-    virtual void SampleLevel(CPPPrelude::SamplerState samplerState, const float* loc, float level, void* out) SLANG_OVERRIDE
+    virtual void SampleLevel(CPPPrelude::SamplerState samplerState, const float* loc, float level, void* out, size_t dataSize) SLANG_OVERRIDE // Ignores the sampler, `loc`, `level` and `dataSize`; only calls `_set(out)`. NOTE(review): `dataSize` is not forwarded — confirm `_set` cannot write past the caller's buffer.
     {
         _set(out);
     }
@@ -201,9 +201,15 @@ struct FloatRWTexture : public CPUComputeUtil::Resource, public CPPPrelude::IRWT
     {
         return _calcMipDims(mipLevel, m_dims);
     }
-    virtual void Load(const int32_t* loc, void* out) SLANG_OVERRIDE { m_data.getAt((const uint32_t*)loc, (float*)out); }
+    virtual void Load(const int32_t* loc, void* out, size_t dataSize) SLANG_OVERRIDE { m_data.getAt((const uint32_t*)loc, (float*)out); } // Delegates to m_data.getAt with `loc` cast to unsigned and `out` cast to float*; `dataSize` is not consulted. NOTE(review): confirm callers always pass a buffer large enough for what getAt writes.
     virtual void* refAt(const uint32_t* loc) SLANG_OVERRIDE { return m_data.getAt(loc); }
 
+    virtual void Sample(CPPPrelude::SamplerState samplerState, const float* loc, void* out, size_t dataSize) SLANG_OVERRIDE // Override with an empty body: no argument is read and `out` is never written.
+    {} // NOTE(review): presumably a stub so the class satisfies the interface — confirm nothing samples a FloatRWTexture and then reads `out`.
+
+    virtual void SampleLevel(CPPPrelude::SamplerState samplerState, const float* loc, float level, void* out, size_t dataSize) SLANG_OVERRIDE // Override with an empty body: no argument is read and `out` is never written.
+    {} // NOTE(review): presumably a stub so the class satisfies the interface — confirm nothing samples a FloatRWTexture and then reads `out`.
+
     FloatRWTexture(int elementCount, const CPPPrelude::TextureDimensions& inDims, float initialValue):
         m_dims(inDims)
     {
diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp
index e13642c5c..15100e2a5 100644
--- a/tools/render-test/render-test-main.cpp
+++ b/tools/render-test/render-test-main.cpp
@@ -1294,7 +1294,7 @@ static SlangResult _innerMain(Slang::StdWriters* stdWriters, SlangSession* sessi
     }
 
     // If it's CPU testing we don't need a window or a renderer
-    if (options.deviceType == DeviceType::CPU)
+    if (options.deviceType == DeviceType::CPU && !options.useShaderObjects)
     {
         // Check we have all the required features
         for (const auto& renderFeature : options.renderFeatures)
diff --git a/tools/render-test/shader-renderer-util.h b/tools/render-test/shader-renderer-util.h
index ecb8fc8bb..9d583331f 100644
--- a/tools/render-test/shader-renderer-util.h
+++ b/tools/render-test/shader-renderer-util.h
@@ -73,13 +73,13 @@ struct ShaderRendererUtil
         bool isOutput,
         size_t bufferSize,
         const void* initData,
-        IDevice* renderer,
+        IDevice* device,
         ComPtr<IBufferResource>& bufferOut);
 
         /// Create BindingState::Desc from the contents of layout
     static Slang::Result createBindingState(
         const ShaderInputLayout& layout,
-        IDevice* renderer,
+        IDevice* device,
         IBufferResource* addedConstantBuffer,
         BindingStateImpl** outBindingState);
 };
author	Tim Foley <tfoleyNV@users.noreply.github.com>	2021-03-12 11:58:14 -0800
committer	GitHub <noreply@github.com>	2021-03-12 11:58:14 -0800
commit	d6a37a0f151e390808f196998c48a341bc4c7b60 (patch)
tree	c1c6e3af434cb3627af67ecc8706124e4b8c7fb1
parent	9ffe2f3ef245034a2dae42017a9059dfe4d02647 (diff)