diff options
| -rw-r--r-- | docs/cuda-target.md | 21 | ||||
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 17 | ||||
| -rw-r--r-- | source/slang/slang-emit.cpp | 1 | ||||
| -rw-r--r-- | source/slang/slang-type-layout.cpp | 149 | ||||
| -rw-r--r-- | source/slang/slang-type-layout.h | 12 | ||||
| -rw-r--r-- | tests/compute/buffer-layout.slang.4.expected.txt | 4 | ||||
| -rw-r--r-- | tools/render-test/cuda/cuda-compute-util.cpp | 12 | ||||
| -rw-r--r-- | tools/render-test/cuda/cuda-compute-util.h | 5 | ||||
| -rw-r--r-- | tools/render-test/render-test-main.cpp | 8 | ||||
| -rw-r--r-- | tools/slang-test/options.h | 2 | ||||
| -rw-r--r-- | tools/slang-test/slang-test-main.cpp | 67 |
11 files changed, 261 insertions, 37 deletions
diff --git a/docs/cuda-target.md b/docs/cuda-target.md index 01803c145..9c82b1dc9 100644 --- a/docs/cuda-target.md +++ b/docs/cuda-target.md @@ -97,7 +97,26 @@ The UniformState and UniformEntryPointParams struct typically vary by shader. Un ## Unsized arrays -WIP: Not implemented yet. +Unsized arrays can be used, which are indicated by an array with no size as in `[]`. For example + +``` + RWStructuredBuffer<int> arrayOfArrays[]; +``` + +With normal 'sized' arrays, the elements are just stored contiguously within wherever they are defined. With an unsized array they map to `Array<T>` which is... + +``` + T* data; + size_t count; +``` + +Note that there is no method in the shader source to get the `count`, even though on the CUDA target it is stored and easily available. This is because of the behavior on GPU targets + +* That the count has to be stored elsewhere (unlike with CUDA) +* On some GPU targets there is no bounds checking - accessing outside the bound values can cause *undefined behavior* +* The elements may be laid out *contiguously* on GPU + +In practice this means if you want to access the `count` in shader code it will need to be passed by another mechanism - such as within a constant buffer. It is possible in the future support may be added to allow direct access of `count` work across targets transparently. ## Prelude diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 63388c7f3..7e6e5957d 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -38,6 +38,18 @@ struct FixedArray T m_data[SIZE]; }; +// An array that has no specified size, becomes a 'Array'. This stores the size so it can potentially +// do bounds checking. +template <typename T> +struct Array +{ + SLANG_CUDA_CALL const T& operator[](size_t index) const { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; } + SLANG_CUDA_CALL T& operator[](size_t index) { SLANG_CUDA_BOUND_CHECK(index, count); return data[index]; } + + T* data; + size_t count; +}; + // Typically defined in cuda.h, but we can't ship/rely on that, so just define here typedef unsigned long long CUtexObject; typedef unsigned long long CUsurfObject; @@ -49,6 +61,11 @@ typedef unsigned long long CUsurfObject; struct SamplerStateUnused; typedef SamplerStateUnused* SamplerState; + +// TODO(JS): Not clear yet if this can be handled on CUDA, by just ignoring. +// For now, just map to the index type. +typedef size_t NonUniformResourceIndex; + // Code generator will generate the specific type template <typename T, int ROWS, int COLS> struct Matrix; diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp index 66e5714c3..2a216994b 100644 --- a/source/slang/slang-emit.cpp +++ b/source/slang/slang-emit.cpp @@ -518,6 +518,7 @@ String emitEntryPointSourceFromIR( case SourceStyle::CPP: case SourceStyle::C: + case SourceStyle::CUDA: linkingAndOptimizationOptions.shouldLegalizeExistentialAndResourceTypes = false; break; } diff --git a/source/slang/slang-type-layout.cpp b/source/slang/slang-type-layout.cpp index 2eec26ee6..08c789609 100644 --- a/source/slang/slang-type-layout.cpp +++ b/source/slang/slang-type-layout.cpp @@ -112,8 +112,9 @@ struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl return arrayInfo; } - SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo elementInfo, size_t elementCount) override + SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) override { + SLANG_UNUSED(elementType); SimpleLayoutInfo vectorInfo; vectorInfo.kind = elementInfo.kind; vectorInfo.size = elementInfo.size * elementCount; @@ -121,7 +122,7 @@ struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl return vectorInfo; } - SimpleArrayLayoutInfo GetMatrixLayout(SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) override + SimpleArrayLayoutInfo GetMatrixLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) override { // The default behavior here is to lay out a matrix // as an array of row vectors (that is row-major). @@ -131,7 +132,7 @@ struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl // to get layouts with a different convention. // return GetArrayLayout( - GetVectorLayout(elementInfo, columnCount), + GetVectorLayout(elementType, elementInfo, columnCount), rowCount); } @@ -204,8 +205,9 @@ struct GLSLBaseLayoutRulesImpl : DefaultLayoutRulesImpl { typedef DefaultLayoutRulesImpl Super; - SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo elementInfo, size_t elementCount) override + SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) override { + SLANG_UNUSED(elementType); // The `std140` and `std430` rules require vectors to be aligned to the next power of // two up from their size (so a `float2` is 8-byte aligned, and a `float3` is // 16-byte aligned). @@ -224,7 +226,7 @@ struct GLSLBaseLayoutRulesImpl : DefaultLayoutRulesImpl return vectorInfo; } - SimpleArrayLayoutInfo GetArrayLayout( SimpleLayoutInfo elementInfo, LayoutSize elementCount) override + SimpleArrayLayoutInfo GetArrayLayout(SimpleLayoutInfo elementInfo, LayoutSize elementCount) override { // The size of an array must be rounded up to be a multiple of its alignment. // @@ -376,7 +378,7 @@ struct CPULayoutRulesImpl : DefaultLayoutRulesImpl // So it is actually a Array<T> on CPU which is a pointer and a size info.size = sizeof(void*) * 2; - info.alignment = sizeof(void*); + info.alignment = SLANG_ALIGN_OF(void*); return info; } @@ -398,12 +400,115 @@ struct CPULayoutRulesImpl : DefaultLayoutRulesImpl } }; -// TODO(JS): Most likely wrong. For layout for CUDA, we'll just do the default to get things up and running struct CUDALayoutRulesImpl : DefaultLayoutRulesImpl { typedef DefaultLayoutRulesImpl Super; -}; + SimpleLayoutInfo GetScalarLayout(BaseType baseType) override + { + switch (baseType) + { + case BaseType::Bool: + { + // In memory a bool is a byte. BUT when in a vector or matrix it will actually be a int32_t + return SimpleLayoutInfo(LayoutResourceKind::Uniform, sizeof(uint8_t), SLANG_ALIGN_OF(uint8_t)); + } + + default: return Super::GetScalarLayout(baseType); + } + } + + SimpleArrayLayoutInfo GetArrayLayout(SimpleLayoutInfo elementInfo, LayoutSize elementCount) override + { + SLANG_RELEASE_ASSERT(elementInfo.size.isFinite()); + auto elementSize = elementInfo.size.getFiniteValue(); + auto elementAlignment = elementInfo.alignment; + auto elementStride = RoundToAlignment(elementSize, elementAlignment); + + if (elementCount.isInfinite()) + { + // This is an unsized array, get information for element + auto info = Super::GetArrayLayout(elementInfo, LayoutSize(1)); + + // So it is actually a Array<T> on CUDA which is a pointer and a size + info.size = sizeof(void*) * 2; + info.alignment = SLANG_ALIGN_OF(void*); + return info; + } + + // An array with no elements will have zero size. + // + LayoutSize arraySize = 0; + // + // Any array with a non-zero number of elements will need + // to have space for N elements of size `elementSize`, with + // the constraints that there must be `elementStride` bytes + // between consecutive elements. + // + if (elementCount > 0) + { + // We can think of this as either allocating (N-1) + // chunks of size `elementStride` (for most of the elements) + // and then one final chunk of size `elementSize` for + // the last element, or equivalently as allocating + // N chunks of size `elementStride` and then "giving back" + // the final `elementStride - elementSize` bytes. + // + arraySize = (elementStride * (elementCount - 1)) + elementSize; + } + + SimpleArrayLayoutInfo arrayInfo; + arrayInfo.kind = elementInfo.kind; + arrayInfo.size = arraySize; + arrayInfo.alignment = elementAlignment; + arrayInfo.elementStride = elementStride; + return arrayInfo; + } + + SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) override + { + // Special case bool + if (elementType == BaseType::Bool) + { + SimpleLayoutInfo fixInfo(elementInfo); + fixInfo.size = sizeof(int32_t); + fixInfo.alignment = SLANG_ALIGN_OF(int32_t); + return GetVectorLayout(BaseType::Int, fixInfo, elementCount); + } + + SimpleLayoutInfo vectorInfo; + vectorInfo.kind = elementInfo.kind; + vectorInfo.size = elementInfo.size * elementCount; + vectorInfo.alignment = elementInfo.alignment; + + return vectorInfo; + } + + SimpleArrayLayoutInfo GetMatrixLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) override + { + // Special case bool + if (elementType == BaseType::Bool) + { + SimpleLayoutInfo fixInfo(elementInfo); + fixInfo.size = sizeof(int32_t); + fixInfo.alignment = SLANG_ALIGN_OF(int32_t); + return GetMatrixLayout(BaseType::Int, fixInfo, rowCount, columnCount); + } + + return Super::GetMatrixLayout(elementType, elementInfo, rowCount, columnCount); + } + + UniformLayoutInfo BeginStructLayout() override + { + return Super::BeginStructLayout(); + } + + void EndStructLayout(UniformLayoutInfo* ioStructInfo) override + { + // Conform to CUDA/C/C++ size is adjusted to the largest alignment + ioStructInfo->size = RoundToAlignment(ioStructInfo->size, ioStructInfo->alignment); + } +}; struct HLSLStructuredBufferLayoutRulesImpl : DefaultLayoutRulesImpl { @@ -436,8 +541,9 @@ struct DefaultVaryingLayoutRulesImpl : DefaultLayoutRulesImpl 1); } - SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo, size_t) override + SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo, size_t) override { + SLANG_UNUSED(elementType); // Vectors take up one slot by default // // TODO: some platforms may decide that vectors of `double` need @@ -479,8 +585,9 @@ struct GLSLSpecializationConstantLayoutRulesImpl : DefaultLayoutRulesImpl 1); } - SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo, size_t elementCount) override + SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo, size_t elementCount) override { + SLANG_UNUSED(elementType); // GLSL doesn't support vectors of specialization constants, // but we will assume that, if supported, they would use one slot per element. return SimpleLayoutInfo( @@ -3052,7 +3159,13 @@ static TypeLayoutResult _createTypeLayout( context, elementType); - auto info = rules->GetVectorLayout(element.info, elementCount); + BaseType elementBaseType = BaseType::Void; + if (auto elementBasicType = as<BasicExpressionType>(elementType)) + { + elementBaseType = elementBasicType->baseType; + } + + auto info = rules->GetVectorLayout(elementBaseType, element.info, elementCount); RefPtr<VectorTypeLayout> typeLayout = new VectorTypeLayout(); typeLayout->type = type; @@ -3078,6 +3191,12 @@ static TypeLayoutResult _createTypeLayout( auto elementTypeLayout = elementResult.layout; auto elementInfo = elementResult.info; + BaseType elementBaseType = BaseType::Void; + if (auto elementBasicType = as<BasicExpressionType>(elementType)) + { + elementBaseType = elementBasicType->baseType; + } + // The `GetMatrixLayout` implementation in the layout rules // currently defaults to assuming row-major layout, // so if we want column-major layout we achieve it here by @@ -3092,6 +3211,7 @@ static TypeLayoutResult _createTypeLayout( layoutMinorCount = tmp; } auto info = rules->GetMatrixLayout( + elementBaseType, elementInfo, layoutMajorCount, layoutMinorCount); @@ -3100,6 +3220,7 @@ static TypeLayoutResult _createTypeLayout( RefPtr<VectorTypeLayout> rowTypeLayout = new VectorTypeLayout(); auto rowInfo = rules->GetVectorLayout( + elementBaseType, elementInfo, colCount); @@ -3680,7 +3801,7 @@ RefPtr<TypeLayout> getSimpleVaryingParameterTypeLayout( { auto varyingRuleSet = varyingRules[rr]; auto elementInfo = varyingRuleSet->GetScalarLayout(elementBaseType); - auto info = varyingRuleSet->GetVectorLayout(elementInfo, elementCount); + auto info = varyingRuleSet->GetVectorLayout(elementBaseType, elementInfo, elementCount); typeLayout->addResourceUsage(info.kind, info.size); } @@ -3735,14 +3856,14 @@ RefPtr<TypeLayout> getSimpleVaryingParameterTypeLayout( auto varyingRuleSet = varyingRules[rr]; auto elementInfo = varyingRuleSet->GetScalarLayout(elementBaseType); - auto info = varyingRuleSet->GetMatrixLayout(elementInfo, layoutMajorCount, layoutMinorCount); + auto info = varyingRuleSet->GetMatrixLayout(elementBaseType, elementInfo, layoutMajorCount, layoutMinorCount); typeLayout->addResourceUsage(info.kind, info.size); if(context.matrixLayoutMode == kMatrixLayoutMode_RowMajor) { // For row-major matrices only, we can compute an effective // resource usage for the row type. - auto rowInfo = varyingRuleSet->GetVectorLayout(elementInfo, colCount); + auto rowInfo = varyingRuleSet->GetVectorLayout(elementBaseType, elementInfo, colCount); rowTypeLayout->addResourceUsage(rowInfo.kind, rowInfo.size); } } diff --git a/source/slang/slang-type-layout.h b/source/slang/slang-type-layout.h index afca06ce1..2cc6f338c 100644 --- a/source/slang/slang-type-layout.h +++ b/source/slang/slang-type-layout.h @@ -814,8 +814,8 @@ struct SimpleLayoutRulesImpl virtual SimpleArrayLayoutInfo GetArrayLayout(SimpleLayoutInfo elementInfo, LayoutSize elementCount) = 0; // Get layout for a vector or matrix type - virtual SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo elementInfo, size_t elementCount) = 0; - virtual SimpleArrayLayoutInfo GetMatrixLayout(SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) = 0; + virtual SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) = 0; + virtual SimpleArrayLayoutInfo GetMatrixLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) = 0; // Begin doing layout on a `struct` type virtual UniformLayoutInfo BeginStructLayout() = 0; @@ -851,14 +851,14 @@ struct LayoutRulesImpl return simpleRules->GetArrayLayout(elementInfo, elementCount); } - SimpleLayoutInfo GetVectorLayout(SimpleLayoutInfo elementInfo, size_t elementCount) + SimpleLayoutInfo GetVectorLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t elementCount) { - return simpleRules->GetVectorLayout(elementInfo, elementCount); + return simpleRules->GetVectorLayout(elementType, elementInfo, elementCount); } - SimpleArrayLayoutInfo GetMatrixLayout(SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) + SimpleArrayLayoutInfo GetMatrixLayout(BaseType elementType, SimpleLayoutInfo elementInfo, size_t rowCount, size_t columnCount) { - return simpleRules->GetMatrixLayout(elementInfo, rowCount, columnCount); + return simpleRules->GetMatrixLayout(elementType, elementInfo, rowCount, columnCount); } UniformLayoutInfo BeginStructLayout() diff --git a/tests/compute/buffer-layout.slang.4.expected.txt b/tests/compute/buffer-layout.slang.4.expected.txt new file mode 100644 index 000000000..4fcded8c6 --- /dev/null +++ b/tests/compute/buffer-layout.slang.4.expected.txt @@ -0,0 +1,4 @@ +13080308 +23080308 +33080308 +43080308 diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp index aa82d8d70..a50295063 100644 --- a/tools/render-test/cuda/cuda-compute-util.cpp +++ b/tools/render-test/cuda/cuda-compute-util.cpp @@ -572,13 +572,15 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp auto elementCount = int(typeLayout->getElementCount()); if (elementCount == 0) { - void** array = location.getUniform<void*>(); - // If set, we setup the data needed for array on CPU side - if (value && array) + CUDAComputeUtil::Array array = { nullptr, 0 }; + auto resource = CUDAResource::getCUDAResource(value); + if (resource) { - // TODO(JS): For now we'll just assume a pointer... - *array = CUDAResource::getCUDAData(value); + array.data = resource->m_cudaMemory; + array.count = value->m_elementCount; } + + location.setUniform(&array, sizeof(array)); } break; } diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h index ea58b6343..f739ade91 100644 --- a/tools/render-test/cuda/cuda-compute-util.h +++ b/tools/render-test/cuda/cuda-compute-util.h @@ -21,6 +21,11 @@ struct CUDAComputeUtil void* data; size_t count; }; + struct Array + { + void* data; + size_t count; + }; struct Context { diff --git a/tools/render-test/render-test-main.cpp b/tools/render-test/render-test-main.cpp index 050a6d2c8..16f2d78d1 100644 --- a/tools/render-test/render-test-main.cpp +++ b/tools/render-test/render-test-main.cpp @@ -600,9 +600,17 @@ SLANG_TEST_TOOL_API SlangResult innerMain(Slang::StdWriters* stdWriters, SlangSe #if RENDER_TEST_CUDA + const uint64_t startTicks = ProcessUtil::getClockTick(); + CUDAComputeUtil::Context context; SLANG_RETURN_ON_FAIL(CUDAComputeUtil::execute(compilationAndLayout, context)); + if (gOptions.performanceProfile) + { + const uint64_t endTicks = ProcessUtil::getClockTick(); + _outputProfileTime(startTicks, endTicks); + } + if (gOptions.outputPath) { // Dump everything out that was written diff --git a/tools/slang-test/options.h b/tools/slang-test/options.h index 12869a945..ffad16fdc 100644 --- a/tools/slang-test/options.h +++ b/tools/slang-test/options.h @@ -86,7 +86,7 @@ struct Options // OpenGL is disabled for now // CPU is disabled by default // CUDA is disabled by default - Slang::RenderApiFlags synthesizedTestApis = Slang::RenderApiFlag::AllOf & ~(Slang::RenderApiFlag::Vulkan | Slang::RenderApiFlag::OpenGl | Slang::RenderApiFlag::CPU | Slang::RenderApiFlag::CUDA); + Slang::RenderApiFlags synthesizedTestApis = Slang::RenderApiFlag::AllOf & ~(Slang::RenderApiFlag::Vulkan | Slang::RenderApiFlag::OpenGl | Slang::RenderApiFlag::CPU); // The adapter to use. If empty will match first found adapter. Slang::String adapter; diff --git a/tools/slang-test/slang-test-main.cpp b/tools/slang-test/slang-test-main.cpp index ac073bcee..a6ad0cff3 100644 --- a/tools/slang-test/slang-test-main.cpp +++ b/tools/slang-test/slang-test-main.cpp @@ -2514,22 +2514,61 @@ bool testPassesCategoryMask( static void _calcSynthesizedTests(TestContext* context, RenderApiType synthRenderApiType, const List<TestDetails>& srcTests, List<TestDetails>& ioSynthTests) { // Add the explicit parameter - for (const auto& testDetails: srcTests) + for (const auto& srcTest: srcTests) { - const auto& requirements = testDetails.requirements; + const auto& requirements = srcTest.requirements; // Render tests use renderApis... // If it's an explicit test, we don't synth from it now - // TODO(JS): Arguably we should synthesize from explicit tests. In principal we can remove the explicit api apply another - // although that may not always work. - if (requirements.usedRenderApiFlags == 0 || - requirements.explicitRenderApi != RenderApiType::Unknown) + // In the case of CUDA, we can only synth from a CPU source + if (synthRenderApiType == RenderApiType::CUDA) { - continue; + if (requirements.explicitRenderApi != RenderApiType::CPU) + { + continue; + } + + // If the source language is defined, and it's + + const Index index = srcTest.options.args.indexOf("-source-language"); + if (index >= 0) + { + // + const auto& language = srcTest.options.args[index + 1]; + SlangSourceLanguage sourceLanguage = DownstreamCompiler::getSourceLanguageFromName(language.getUnownedSlice()); + + bool isCrossCompile = true; + + switch (sourceLanguage) + { + case SLANG_SOURCE_LANGUAGE_GLSL: + case SLANG_SOURCE_LANGUAGE_C: + case SLANG_SOURCE_LANGUAGE_CPP: + { + isCrossCompile = false; + } + default: break; + } + + if (!isCrossCompile) + { + continue; + } + } + } + else + { + // TODO(JS): Arguably we should synthesize from explicit tests. In principal we can remove the explicit api apply another + // although that may not always work. + if (requirements.usedRenderApiFlags == 0 || + requirements.explicitRenderApi != RenderApiType::Unknown) + { + continue; + } } - TestDetails synthTestDetails(testDetails.options); + TestDetails synthTestDetails(srcTest.options); TestOptions& synthOptions = synthTestDetails.options; // Mark as synthesized @@ -2544,8 +2583,16 @@ static void _calcSynthesizedTests(TestContext* context, RenderApiType synthRende // If the target is vulkan remove the -hlsl option if (synthRenderApiType == RenderApiType::Vulkan) { - Index index = synthOptions.args.indexOf("-hlsl"); - if (index != Index(-1)) + const Index index = synthOptions.args.indexOf("-hlsl"); + if (index >= 0) + { + synthOptions.args.removeAt(index); + } + } + else if (synthRenderApiType == RenderApiType::CUDA) + { + const Index index = synthOptions.args.indexOf("-cpu"); + if (index >= 0) { synthOptions.args.removeAt(index); } |
