11 files changed, 261 insertions, 37 deletions
diff --git a/docs/cuda-target.md b/docs/cuda-target.md
index 01803c145..9c82b1dc9 100644
--- a/docs/cuda-target.md
+++ b/docs/cuda-target.md
@@ -97,7 +97,26 @@ The UniformState and UniformEntryPointParams struct typically vary by shader. Un
 
 ## Unsized arrays
 
-WIP: Not implemented yet.
+Unsized arrays are supported. They are declared as an array with no size, as in `[]`. For example:
+
+```
+    RWStructuredBuffer<int> arrayOfArrays[];
+```
+
+With normal 'sized' arrays, the elements are stored contiguously in place, wherever the array is defined. An unsized array instead maps to `Array<T>`, which is...
+
+```
+    T* data;
+    size_t count;
+```    
+
+Note that there is no method in the shader source to get the `count`, even though on the CUDA target it is stored and easily available. This is because of how unsized arrays behave on other GPU targets:
+
+* The count has to be stored elsewhere (unlike with CUDA)
+* On some GPU targets there is no bounds checking - accessing outside the bounds can cause *undefined behavior*
+* The elements may be laid out *contiguously* on GPU
+
+In practice this means that if you want to access the `count` in shader code, it will need to be passed by another mechanism - such as within a constant buffer. It is possible that support may be added in the future to allow direct access to `count` that works across targets transparently.
 
 ## Prelude
 
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 63388c7f3..7e6e5957d 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -38,6 +38,18 @@ struct FixedArray
     T m_data[SIZE];
 };
 
+// An array declared with no size becomes an 'Array'. It stores the element count alongside the data pointer
+// so that accesses can be bounds checked (see SLANG_CUDA_BOUND_CHECK in operator[]).
+template <typename T>
+struct Array
+{
+    SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
+    SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; }
+    
+    T* data;            // Pointer to the first element
+    size_t count;       // Number of elements; the bound passed to SLANG_CUDA_BOUND_CHECK
+};
+
 // Typically defined in cuda.h, but we can't ship/rely on that, so just define here
 typedef unsigned long long CUtexObject;                   
 typedef unsigned long long CUsurfObject;                  
@@ -49,6 +61,11 @@ typedef unsigned long long CUsurfObject;
 struct SamplerStateUnused;
 typedef SamplerStateUnused* SamplerState;
 
+
+// TODO(JS): It is not yet clear whether NonUniformResourceIndex can simply be ignored on CUDA.
+// For now, just map it to the index type.
+typedef size_t NonUniformResourceIndex;
+
 // Code generator will generate the specific type
 template <typename T, int ROWS, int COLS>
 struct Matrix;
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index 66e5714c3..2a216994b 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -518,6 +518,7 @@ String emitEntryPointSourceFromIR(
 
         case SourceStyle::CPP:
         case SourceStyle::C:
+        case SourceStyle::CUDA:
             linkingAndOptimizationOptions.shouldLegalizeExistentialAndResourceTypes = false;
             break;
         }
diff --git a/source/slang/slang-type-layout.cpp b/source/slang/slang-type-layout.cpp
index 2eec26ee6..08c789609 100644
--- a/source/slang/slang-type-layout.cpp
+++ b/source/slang/slang-type-layout.cpp
@@ -112,8 +112,9 @@ struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl
         return arrayInfo;
     }
 
-    SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo elementInfo, size_t elementCount) override
+    SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) override
     {
+        SLANG_UNUSED(elementType);
         SimpleLayoutInfo vectorInfo;
         vectorInfo.kind = elementInfo.kind;
         vectorInfo.size = elementInfo.size * elementCount;
@@ -121,7 +122,7 @@ struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl
         return vectorInfo;
     }
 
-    SimpleArrayLayoutInfo GetMatrixLayout(SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) override
+    SimpleArrayLayoutInfo GetMatrixLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) override
     {
         // The default behavior here is to lay out a matrix
         // as an array of row vectors (that is row-major).
@@ -131,7 +132,7 @@ struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl
         // to get layouts with a different convention.
         //
         return GetArrayLayout(
-            GetVectorLayout(elementInfo, columnCount),
+            GetVectorLayout(elementType, elementInfo, columnCount),
             rowCount);
     }
 
@@ -204,8 +205,9 @@ struct GLSLBaseLayoutRulesImpl : DefaultLayoutRulesImpl
 {
     typedef DefaultLayoutRulesImpl Super;
 
-    SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo elementInfo, size_t elementCount) override
+    SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) override
     {
+        SLANG_UNUSED(elementType);
         // The `std140` and `std430` rules require vectors to be aligned to the next power of
         // two up from their size (so a `float2` is 8-byte aligned, and a `float3` is
         // 16-byte aligned).
@@ -224,7 +226,7 @@ struct GLSLBaseLayoutRulesImpl : DefaultLayoutRulesImpl
         return vectorInfo;
     }
 
-    SimpleArrayLayoutInfo GetArrayLayout( SimpleLayoutInfo elementInfo, LayoutSize elementCount) override
+    SimpleArrayLayoutInfo GetArrayLayout(SimpleLayoutInfo elementInfo, LayoutSize elementCount) override
     {
         // The size of an array must be rounded up to be a multiple of its alignment.
         //
@@ -376,7 +378,7 @@ struct CPULayoutRulesImpl : DefaultLayoutRulesImpl
 
             // So it is actually a Array<T> on CPU which is a pointer and a size
             info.size = sizeof(void*) * 2;
-            info.alignment = sizeof(void*);
+            info.alignment = SLANG_ALIGN_OF(void*);
 
             return info;
         }
@@ -398,12 +400,115 @@ struct CPULayoutRulesImpl : DefaultLayoutRulesImpl
     }
 };
 
-// TODO(JS): Most likely wrong. For layout for CUDA, we'll just do the default to get things up and running
 struct CUDALayoutRulesImpl : DefaultLayoutRulesImpl
 {
     typedef DefaultLayoutRulesImpl Super;
-};
 
+    SimpleLayoutInfo GetScalarLayout(BaseType baseType) override
+    {
+        switch (baseType)
+        {
+            case BaseType::Bool:
+            {
+                // In memory a scalar bool is a single byte. BUT inside a vector or matrix each element is laid out as an int32_t
+                return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(uint8_t), SLANG_ALIGN_OF(uint8_t));
+            }
+
+            default: return Super::GetScalarLayout(baseType);
+        }
+    }
+
+    SimpleArrayLayoutInfo GetArrayLayout(SimpleLayoutInfo elementInfo, LayoutSize elementCount) override
+    {
+        SLANG_RELEASE_ASSERT(elementInfo.size.isFinite());
+        auto elementSize = elementInfo.size.getFiniteValue();
+        auto elementAlignment = elementInfo.alignment;
+        auto elementStride = RoundToAlignment(elementSize, elementAlignment);
+
+        if (elementCount.isInfinite())
+        {
+            // This is an unsized array: get the layout for a single element, then override its size and alignment
+            auto info = Super::GetArrayLayout(elementInfo, LayoutSize(1));
+
+            // An unsized array is actually an Array<T> on CUDA, which is a pointer and a count
+            info.size = sizeof(void*) * 2;  // NOTE(review): this is the compiler host's pointer size - confirm it matches the CUDA target
+            info.alignment = SLANG_ALIGN_OF(void*);
+            return info;
+        }
+
+        // An array with no elements will have zero size.
+        //
+        LayoutSize arraySize = 0;
+        //
+        // Any array with a non-zero number of elements will need
+        // to have space for N elements of size `elementSize`, with
+        // the constraint that there must be `elementStride` bytes
+        // between consecutive elements.
+        //
+        if (elementCount > 0)
+        {
+            // We can think of this as either allocating (N-1)
+            // chunks of size `elementStride` (for most of the elements)
+            // and then one final chunk of size `elementSize` for
+            // the last element, or equivalently as allocating
+            // N chunks of size `elementStride` and then "giving back"
+            // the final `elementStride - elementSize` bytes.
+            //
+            arraySize = (elementStride * (elementCount - 1)) + elementSize;
+        }
+
+        SimpleArrayLayoutInfo arrayInfo;
+        arrayInfo.kind = elementInfo.kind;
+        arrayInfo.size = arraySize;
+        arrayInfo.alignment = elementAlignment;
+        arrayInfo.elementStride = elementStride;
+        return arrayInfo;
+    }
+
+    SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) override
+    {
+        // Special case bool: lay the vector out as if its elements were int32_t
+        if (elementType == BaseType::Bool)
+        {
+            SimpleLayoutInfo fixInfo(elementInfo);
+            fixInfo.size = sizeof(int32_t);
+            fixInfo.alignment = SLANG_ALIGN_OF(int32_t);
+            return GetVectorLayout(BaseType::Int, fixInfo, elementCount);
+        }
+
+        SimpleLayoutInfo vectorInfo;
+        vectorInfo.kind = elementInfo.kind;
+        vectorInfo.size = elementInfo.size * elementCount;
+        vectorInfo.alignment = elementInfo.alignment;
+    
+        return vectorInfo;
+    }
+
+    SimpleArrayLayoutInfo GetMatrixLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) override
+    {
+        // Special case bool: lay the matrix out as if its elements were int32_t
+        if (elementType == BaseType::Bool)
+        {
+            SimpleLayoutInfo fixInfo(elementInfo);
+            fixInfo.size = sizeof(int32_t);
+            fixInfo.alignment = SLANG_ALIGN_OF(int32_t);
+            return GetMatrixLayout(BaseType::Int, fixInfo, rowCount, columnCount);
+        }
+
+        return Super::GetMatrixLayout(elementType, elementInfo, rowCount, columnCount);
+    }
+
+    UniformLayoutInfo BeginStructLayout() override
+    {
+        return Super::BeginStructLayout();  // No CUDA-specific setup here; defer to the base rules
+    }
+
+    void EndStructLayout(UniformLayoutInfo* ioStructInfo) override
+    {
+        // Conform to CUDA/C/C++, where the size of a struct is rounded up to a multiple of its alignment
+        ioStructInfo->size = RoundToAlignment(ioStructInfo->size, ioStructInfo->alignment);
+    }
+};
 
 struct HLSLStructuredBufferLayoutRulesImpl : DefaultLayoutRulesImpl
 {
@@ -436,8 +541,9 @@ struct DefaultVaryingLayoutRulesImpl : DefaultLayoutRulesImpl
             1);
     }
 
-    SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo, size_t) override
+    SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo, size_t) override
     {
+        SLANG_UNUSED(elementType);
         // Vectors take up one slot by default
         //
         // TODO: some platforms may decide that vectors of `double` need
@@ -479,8 +585,9 @@ struct GLSLSpecializationConstantLayoutRulesImpl : DefaultLayoutRulesImpl
             1);
     }
 
-    SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo, size_t elementCount) override
+    SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo, size_t elementCount) override
     {
+        SLANG_UNUSED(elementType);
         // GLSL doesn't support vectors of specialization constants,
         // but we will assume that, if supported, they would use one slot per element.
         return SimpleLayoutInfo(
@@ -3052,7 +3159,13 @@ static TypeLayoutResult _createTypeLayout(
             context,
             elementType);
 
-        auto info = rules->GetVectorLayout(element.info, elementCount);
+        BaseType elementBaseType = BaseType::Void;
+        if (auto elementBasicType = as<BasicExpressionType>(elementType))
+        {
+            elementBaseType = elementBasicType->baseType;
+        }
+
+        auto info = rules->GetVectorLayout(elementBaseType, element.info, elementCount);
 
         RefPtr<VectorTypeLayout> typeLayout = new VectorTypeLayout();
         typeLayout->type = type;
@@ -3078,6 +3191,12 @@ static TypeLayoutResult _createTypeLayout(
         auto elementTypeLayout = elementResult.layout;
         auto elementInfo = elementResult.info;
 
+        BaseType elementBaseType = BaseType::Void;
+        if (auto elementBasicType = as<BasicExpressionType>(elementType))
+        {
+            elementBaseType = elementBasicType->baseType;
+        }
+
         // The `GetMatrixLayout` implementation in the layout rules
         // currently defaults to assuming row-major layout,
         // so if we want column-major layout we achieve it here by
@@ -3092,6 +3211,7 @@ static TypeLayoutResult _createTypeLayout(
             layoutMinorCount = tmp;
         }
         auto info = rules->GetMatrixLayout(
+            elementBaseType,
             elementInfo,
             layoutMajorCount,
             layoutMinorCount);
@@ -3100,6 +3220,7 @@ static TypeLayoutResult _createTypeLayout(
         RefPtr<VectorTypeLayout> rowTypeLayout = new VectorTypeLayout();
 
         auto rowInfo = rules->GetVectorLayout(
+            elementBaseType,
             elementInfo,
             colCount);
 
@@ -3680,7 +3801,7 @@ RefPtr<TypeLayout> getSimpleVaryingParameterTypeLayout(
         {
             auto varyingRuleSet = varyingRules[rr];
             auto elementInfo = varyingRuleSet->GetScalarLayout(elementBaseType);
-            auto info = varyingRuleSet->GetVectorLayout(elementInfo, elementCount);
+            auto info = varyingRuleSet->GetVectorLayout(elementBaseType, elementInfo, elementCount);
             typeLayout->addResourceUsage(info.kind, info.size);
         }
 
@@ -3735,14 +3856,14 @@ RefPtr<TypeLayout> getSimpleVaryingParameterTypeLayout(
             auto varyingRuleSet = varyingRules[rr];
             auto elementInfo = varyingRuleSet->GetScalarLayout(elementBaseType);
 
-            auto info = varyingRuleSet->GetMatrixLayout(elementInfo, layoutMajorCount, layoutMinorCount);
+            auto info = varyingRuleSet->GetMatrixLayout(elementBaseType, elementInfo, layoutMajorCount, layoutMinorCount);
             typeLayout->addResourceUsage(info.kind, info.size);
 
             if(context.matrixLayoutMode == kMatrixLayoutMode_RowMajor)
             {
                 // For row-major matrices only, we can compute an effective
                 // resource usage for the row type.
-                auto rowInfo = varyingRuleSet->GetVectorLayout(elementInfo, colCount);
+                auto rowInfo = varyingRuleSet->GetVectorLayout(elementBaseType, elementInfo, colCount);
                 rowTypeLayout->addResourceUsage(rowInfo.kind, rowInfo.size);
             }
         }
diff --git a/source/slang/slang-type-layout.h b/source/slang/slang-type-layout.h
index afca06ce1..2cc6f338c 100644
--- a/source/slang/slang-type-layout.h
+++ b/source/slang/slang-type-layout.h
@@ -814,8 +814,8 @@ struct SimpleLayoutRulesImpl
     virtual SimpleArrayLayoutInfo GetArrayLayout(SimpleLayoutInfo elementInfo, LayoutSize elementCount) = 0;
 
     // Get layout for a vector or matrix type
-    virtual SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo elementInfo, size_t elementCount) = 0;
-    virtual SimpleArrayLayoutInfo GetMatrixLayout(SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) = 0;
+    virtual SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) = 0;
+    virtual SimpleArrayLayoutInfo GetMatrixLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) = 0;
 
     // Begin doing layout on a `struct` type
     virtual UniformLayoutInfo BeginStructLayout() = 0;
@@ -851,14 +851,14 @@ struct LayoutRulesImpl
         return simpleRules->GetArrayLayout(elementInfo, elementCount);
     }
 
-    SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo elementInfo, size_t elementCount)
+    SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount)
     {
-        return simpleRules->GetVectorLayout(elementInfo, elementCount);
+        return simpleRules->GetVectorLayout(elementType, elementInfo, elementCount);
     }
 
-    SimpleArrayLayoutInfo GetMatrixLayout(SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount)
+    SimpleArrayLayoutInfo GetMatrixLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount)
     {
-        return simpleRules->GetMatrixLayout(elementInfo, rowCount, columnCount);
+        return simpleRules->GetMatrixLayout(elementType, elementInfo, rowCount, columnCount);
     }
 
     UniformLayoutInfo BeginStructLayout()
diff --git a/tests/compute/buffer-layout.slang.4.expected.txt b/tests/compute/buffer-layout.slang.4.expected.txt
new file mode 100644
index 000000000..4fcded8c6
--- /dev/null
+++ b/tests/compute/buffer-layout.slang.4.expected.txt
@@ -0,0 +1,4 @@
+13080308
+23080308
+33080308
+43080308
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index aa82d8d70..a50295063 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -572,13 +572,15 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
                         auto elementCount = int(typeLayout->getElementCount());
                         if (elementCount == 0)
                         {
-                            void** array = location.getUniform<void*>();
-                            // If set, we setup the data needed for array on CPU side
-                            if (value && array)
+                            CUDAComputeUtil::Array array = { nullptr, 0 };
+                            auto resource = CUDAResource::getCUDAResource(value);
+                            if (resource)
                             {
-                                // TODO(JS): For now we'll just assume a pointer...
-                                *array = CUDAResource::getCUDAData(value);
+                                array.data = resource->m_cudaMemory;
+                                array.count = value->m_elementCount;
                             }
+
+                            location.setUniform(&array, sizeof(array));
                         }
                         break;
                     }
diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h
index ea58b6343..f739ade91 100644
--- a/tools/render-test/cuda/cuda-compute-util.h
+++ b/tools/render-test/cuda/cuda-compute-util.h
@@ -21,6 +21,11 @@ struct CUDAComputeUtil
         void* data;
         size_t count;
     };
+    struct Array                // Same {data, count} shape as `Array<T>` in the CUDA prelude
+    {
+        void* data;             // Element storage (set from the bound resource's CUDA memory)
+        size_t count;           // Number of elements
+    };
 
     struct Context
     {
diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp
index 050a6d2c8..16f2d78d1 100644
--- a/tools/render-test/render-test-main.cpp
+++ b/tools/render-test/render-test-main.cpp
@@ -600,9 +600,17 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe
 
 #if RENDER_TEST_CUDA
 
+        const uint64_t startTicks = ProcessUtil::getClockTick();
+
         CUDAComputeUtil::Context context;
         SLANG_RETURN_ON_FAIL(CUDAComputeUtil::execute(compilationAndLayout, context));
 
+        if (gOptions.performanceProfile)
+        {
+            const uint64_t endTicks = ProcessUtil::getClockTick();
+            _outputProfileTime(startTicks, endTicks);
+        }
+
         if (gOptions.outputPath)
         {
             // Dump everything out that was written
diff --git a/tools/slang-test/options.h b/tools/slang-test/options.h
index 12869a945..ffad16fdc 100644
--- a/tools/slang-test/options.h
+++ b/tools/slang-test/options.h
@@ -86,7 +86,7 @@ struct Options
     // OpenGL is disabled for now
     // CPU is disabled by default
-    // CUDA is disabled by default
+    // CUDA is enabled by default
-    Slang::RenderApiFlags synthesizedTestApis = Slang::RenderApiFlag::AllOf & ~(Slang::RenderApiFlag::Vulkan | Slang::RenderApiFlag::OpenGl | Slang::RenderApiFlag::CPU | Slang::RenderApiFlag::CUDA);
+    Slang::RenderApiFlags synthesizedTestApis = Slang::RenderApiFlag::AllOf & ~(Slang::RenderApiFlag::Vulkan | Slang::RenderApiFlag::OpenGl | Slang::RenderApiFlag::CPU); 
 
     // The adapter to use. If empty will match first found adapter.
     Slang::String adapter;
diff --git a/tools/slang-test/slang-test-main.cpp b/tools/slang-test/slang-test-main.cpp
index ac073bcee..a6ad0cff3 100644
--- a/tools/slang-test/slang-test-main.cpp
+++ b/tools/slang-test/slang-test-main.cpp
@@ -2514,22 +2514,61 @@ bool testPassesCategoryMask(
 static void _calcSynthesizedTests(TestContext* context, RenderApiType synthRenderApiType, const List<TestDetails>& srcTests, List<TestDetails>& ioSynthTests)
 {
     // Add the explicit parameter
-    for (const auto& testDetails: srcTests)
+    for (const auto& srcTest: srcTests)
     {
-        const auto& requirements = testDetails.requirements;
+        const auto& requirements = srcTest.requirements;
 
         // Render tests use renderApis...
         // If it's an explicit test, we don't synth from it now
 
-        // TODO(JS): Arguably we should synthesize from explicit tests. In principal we can remove the explicit api apply another
-        // although that may not always work.
-        if (requirements.usedRenderApiFlags == 0 ||
-            requirements.explicitRenderApi != RenderApiType::Unknown)
+        // In the case of CUDA, we can only synth from a CPU source
+        if (synthRenderApiType == RenderApiType::CUDA)
         {
-            continue;
+            if (requirements.explicitRenderApi != RenderApiType::CPU)
+            {
+                continue;
+            }
+
+            // If the source language is explicitly specified as GLSL, C or C++, the test is not cross compiled, so don't synthesize a CUDA test from it
+
+            const Index index = srcTest.options.args.indexOf("-source-language");
+            if (index >= 0)
+            {
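+                // The language name is the argument following the option. NOTE(review): assumes a value is present - index + 1 is not bounds checked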
+                const auto& language = srcTest.options.args[index + 1];
+                SlangSourceLanguage sourceLanguage = DownstreamCompiler::getSourceLanguageFromName(language.getUnownedSlice());
+
+                bool isCrossCompile = true;
+
+                switch (sourceLanguage)
+                {
+                    case SLANG_SOURCE_LANGUAGE_GLSL:
+                    case SLANG_SOURCE_LANGUAGE_C:
+                    case SLANG_SOURCE_LANGUAGE_CPP:
+                    {
+                        isCrossCompile = false;
+                    }
+                    default: break;
+                }
+
+                if (!isCrossCompile)
+                {
+                    continue;
+                }
+            }
+        }
+        else
+        {
+            // TODO(JS): Arguably we should synthesize from explicit tests. In principal we can remove the explicit api apply another
+            // although that may not always work.
+            if (requirements.usedRenderApiFlags == 0 ||
+                requirements.explicitRenderApi != RenderApiType::Unknown)
+            {
+                continue;
+            }
         }
 
-        TestDetails synthTestDetails(testDetails.options);
+        TestDetails synthTestDetails(srcTest.options);
         TestOptions& synthOptions = synthTestDetails.options;
 
         // Mark as synthesized
@@ -2544,8 +2583,16 @@ static void _calcSynthesizedTests(TestContext* context, RenderApiType synthRende
         // If the target is vulkan remove the -hlsl option
         if (synthRenderApiType == RenderApiType::Vulkan)
         {
-            Index index = synthOptions.args.indexOf("-hlsl");
-            if (index != Index(-1))
+            const Index index = synthOptions.args.indexOf("-hlsl");
+            if (index >= 0)
+            {
+                synthOptions.args.removeAt(index);
+            }
+        }
+        else if (synthRenderApiType == RenderApiType::CUDA)
+        {
+            const Index index = synthOptions.args.indexOf("-cpu");
+            if (index >= 0)
             {
                 synthOptions.args.removeAt(index);
             }