diff options
| -rw-r--r-- | prelude/slang-cpp-types.h | 17 | ||||
| -rw-r--r-- | source/slang/core.meta.slang | 45 | ||||
| -rw-r--r-- | source/slang/core.meta.slang.h | 47 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 5 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang.h | 7 | ||||
| -rw-r--r-- | source/slang/slang-ir-type-set.cpp | 92 | ||||
| -rw-r--r-- | source/slang/slang-ir-type-set.h | 5 | ||||
| -rw-r--r-- | tests/compute/texture-simple.slang | 31 | ||||
| -rw-r--r-- | tests/compute/texture-simple.slang.expected.txt | 4 | ||||
| -rw-r--r-- | tools/render-test/cpu-compute-util.cpp | 110 | ||||
| -rw-r--r-- | tools/render-test/cuda/cuda-compute-util.cpp | 165 | ||||
| -rw-r--r-- | tools/render-test/cuda/cuda-compute-util.h | 10 |
12 files changed, 371 insertions, 167 deletions
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h index 67db607f6..2238727c5 100644 --- a/prelude/slang-cpp-types.h +++ b/prelude/slang-cpp-types.h @@ -228,6 +228,23 @@ struct SamplerComparisonState // Texture +struct ITexture1D +{ + virtual void Load(const int2& v, void* out) = 0; + virtual void Sample(SamplerState samplerState, float loc, void* out) = 0; + virtual void SampleLevel(SamplerState samplerState, float loc, float level, void* out) = 0; +}; + +template <typename T> +struct Texture1D +{ + T Load(const int2& v) const { T out; texture->Load(v, &out); return out; } + T Sample(SamplerState samplerState, float v) const { T out; texture->Sample(samplerState, v, &out); return out; } + T SampleLevel(SamplerState samplerState, float v, float level) { T out; texture->SampleLevel(samplerState, v, level, &out); return out; } + + ITexture1D* texture; +}; + struct ITexture2D { virtual void Load(const int3& v, void* out) = 0; diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang index 6efb383fa..450cc4512 100644 --- a/source/slang/core.meta.slang +++ b/source/slang/core.meta.slang @@ -897,19 +897,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) if( baseShape != TextureFlavor::Shape::ShapeCube ) { sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0"; - if (kBaseTextureTypes[tt].coordCount == 1) - { - sb << ", $2"; - } - else + for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) { - for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) + sb << ", ($2)"; + if (kBaseTextureTypes[tt].coordCount > 1) { - sb << ", ($2)." << char(i + 'x'); + sb << '.' << char(i + 'x'); } } sb << ")\")\n"; } + else + { + sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n"; + } sb << "T Sample(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n"; @@ -1028,7 +1029,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n"; } - + // TODO(JS): Not clear how to map this to CUDA, because in HLSL, the gradient is a vector based on + // the dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy. + // I could just assume 0 for elements not supplied, and ignore z. For now will just leave sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n"; sb << "T SampleGrad(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, "; @@ -1053,23 +1056,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) // CUDA if (!isArray) { - sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0"; - for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) + if( baseShape != TextureFlavor::Shape::ShapeCube ) { - sb << ", $2"; - if (kBaseTextureTypes[tt].coordCount > 1) + sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0"; + for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) { - sb << '.' << char(i + 'x'); + sb << ", ($2)"; + if (kBaseTextureTypes[tt].coordCount > 1) + { + sb << '.' << char(i + 'x'); + } } + sb << ", $3)\")\n"; + } + else + { + sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n"; } - sb << ", $3)\")\n"; } sb << "T SampleLevel(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, "; sb << "float level);\n"; - if( baseShape != TextureFlavor::Shape::ShapeCube ) { sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n"; @@ -1145,6 +1154,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) EMIT_LINE_DIRECTIVE(); sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n"; + if (kBaseTextureTypes[tt].coordCount == 2) + { + // Gather only works on 2D in CUDA + // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4." + sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n"; + } sb << outputType << " Gather" << componentName << "(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n"; diff --git a/source/slang/core.meta.slang.h b/source/slang/core.meta.slang.h index 5f185ca8f..cca8f2e51 100644 --- a/source/slang/core.meta.slang.h +++ b/source/slang/core.meta.slang.h @@ -918,19 +918,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) if( baseShape != TextureFlavor::Shape::ShapeCube ) { sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0"; - if (kBaseTextureTypes[tt].coordCount == 1) - { - sb << ", $2"; - } - else + for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) { - for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) + sb << ", ($2)"; + if (kBaseTextureTypes[tt].coordCount > 1) { - sb << ", ($2)." << char(i + 'x'); + sb << '.' << char(i + 'x'); } } sb << ")\")\n"; } + else + { + sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n"; + } sb << "T Sample(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n"; @@ -1049,7 +1050,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n"; } - + // TODO(JS): Not clear how to map this to CUDA, because in HLSL, the gradient is a vector based on + // the dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy. + // I could just assume 0 for elements not supplied, and ignore z. For now will just leave sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n"; sb << "T SampleGrad(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, "; @@ -1074,23 +1077,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) // CUDA if (!isArray) { - sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0"; - for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) + if( baseShape != TextureFlavor::Shape::ShapeCube ) { - sb << ", $2"; - if (kBaseTextureTypes[tt].coordCount > 1) + sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0"; + for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) { - sb << '.' << char(i + 'x'); + sb << ", ($2)"; + if (kBaseTextureTypes[tt].coordCount > 1) + { + sb << '.' << char(i + 'x'); + } } + sb << ", $3)\")\n"; + } + else + { + sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n"; } - sb << ", $3)\")\n"; } sb << "T SampleLevel(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, "; sb << "float level);\n"; - if( baseShape != TextureFlavor::Shape::ShapeCube ) { sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n"; @@ -1166,6 +1175,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) EMIT_LINE_DIRECTIVE(); sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n"; + if (kBaseTextureTypes[tt].coordCount == 2) + { + // Gather only works on 2D in CUDA + // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4." + sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n"; + } sb << outputType << " Gather" << componentName << "(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n"; @@ -1299,7 +1314,7 @@ for (auto op : binaryOps) sb << "__intrinsic_op(" << int(op.opCode) << ") matrix<" << resultType << ",N,M> operator" << op.opName << "(" << leftQual << "matrix<" << leftType << ",N,M> left, " << rightType << " right);\n"; } } -SLANG_RAW("#line 1281 \"core.meta.slang\"") +SLANG_RAW("#line 1296 \"core.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("// Specialized function\n") diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index f8ae340bc..f7707cc6d 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -347,8 +347,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> bool all(matrix<T,N,M> x) // Barrier for writes to all memory spaces (HLSL SM 5.0) __target_intrinsic(glsl, "memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()") -// TODO(JS): Doesn't seem to be weaker form of sync, so use this? -__target_intrinsic(cuda, "__syncthreads()") +__target_intrinsic(cuda, "__threadfence()") void AllMemoryBarrier(); // Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0) @@ -648,6 +647,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int> T determinant(matrix<T,N, // Barrier for device memory __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()") +__target_intrinsic(cuda, "__threadfence()") void DeviceMemoryBarrier(); __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()") @@ -814,6 +814,7 @@ float2 GetRenderTargetSamplePosition(int Index); // Group memory barrier __target_intrinsic(glsl, "groupMemoryBarrier") +__target_intrinsic(cuda, "__threadfence_block") void GroupMemoryBarrier(); diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h index 215d18670..c0b875df5 100644 --- a/source/slang/hlsl.meta.slang.h +++ b/source/slang/hlsl.meta.slang.h @@ -396,8 +396,7 @@ SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> bool all(matri SLANG_RAW("\n") SLANG_RAW("// Barrier for writes to all memory spaces (HLSL SM 5.0)\n") SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n") -SLANG_RAW("// TODO(JS): Doesn't seem to be weaker form of sync, so use this?\n") -SLANG_RAW("__target_intrinsic(cuda, \"__syncthreads()\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n") SLANG_RAW("void AllMemoryBarrier();\n") SLANG_RAW("\n") SLANG_RAW("// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)\n") @@ -724,6 +723,7 @@ SLANG_RAW("__generic<T : __BuiltinFloatingPointType, let N : int> T determinant( SLANG_RAW("\n") SLANG_RAW("// Barrier for device memory\n") SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n") SLANG_RAW("void DeviceMemoryBarrier();\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()\")\n") @@ -890,6 +890,7 @@ SLANG_RAW("float2 GetRenderTargetSamplePosition(int Index);\n") SLANG_RAW("\n") SLANG_RAW("// Group memory barrier\n") SLANG_RAW("__target_intrinsic(glsl, \"groupMemoryBarrier\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"__threadfence_block\")\n") SLANG_RAW("void GroupMemoryBarrier();\n") SLANG_RAW("\n") SLANG_RAW("\n") @@ -1641,7 +1642,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa) sb << "};\n"; } -SLANG_RAW("#line 1568 \"hlsl.meta.slang\"") +SLANG_RAW("#line 1569 \"hlsl.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("\n") diff --git a/source/slang/slang-ir-type-set.cpp b/source/slang/slang-ir-type-set.cpp index a4ebf8242..e5271698c 100644 --- a/source/slang/slang-ir-type-set.cpp +++ b/source/slang/slang-ir-type-set.cpp @@ -115,50 +115,74 @@ IRInst* IRTypeSet::cloneInst(IRInst* inst) clone = m_builder.getStringValue(stringLit->getStringSlice()); break; } - default: + case kIROp_VectorType: { - if (IRBasicType::isaImpl(inst->op)) + auto vecType = static_cast<IRVectorType*>(inst); + const Index elementCount = Index(GetIntVal(vecType->getElementCount())); + + if (elementCount <= 1) { - clone = m_builder.getType(inst->op); + clone = cloneType(vecType->getElementType()); } - else + break; + } + case kIROp_MatrixType: + { + auto matType = static_cast<IRMatrixType*>(inst); + const Index columnCount = Index(GetIntVal(matType->getColumnCount())); + const Index rowCount = Index(GetIntVal(matType->getRowCount())); + + if (columnCount <= 1 && rowCount <= 1) { - IRType* irType = dynamicCast<IRType>(inst); - if (irType) - { - auto clonedType = cloneType(inst->getFullType()); - Index operandCount = Index(inst->getOperandCount()); + clone = cloneType(matType->getElementType()); + } + break; + } + default: break; + } + + if (!clone) + { + if (IRBasicType::isaImpl(inst->op)) + { + clone = m_builder.getType(inst->op); + } + else + { + IRType* irType = dynamicCast<IRType>(inst); + if (irType) + { + auto clonedType = cloneType(inst->getFullType()); + Index operandCount = Index(inst->getOperandCount()); - List<IRInst*> cloneOperands; - cloneOperands.setCount(operandCount); + List<IRInst*> cloneOperands; + cloneOperands.setCount(operandCount); - for (Index i = 0; i < operandCount; ++i) - { - cloneOperands[i] = cloneInst(inst->getOperand(i)); - } + for (Index i = 0; i < operandCount; ++i) + { + cloneOperands[i] = cloneInst(inst->getOperand(i)); + } - //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer()); + //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer()); - UInt operandCounts[1] = { UInt(operandCount) }; - IRInst*const* listOperands[1] = { cloneOperands.getBuffer() }; + UInt operandCounts[1] = { UInt(operandCount) }; + IRInst*const* listOperands[1] = { cloneOperands.getBuffer() }; - clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands); - } - else + clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands); + } + else + { + // This cloning style only works on insts that are not unique + auto clonedType = cloneType(inst->getFullType()); + + Index operandCount = Index(inst->getOperandCount()); + clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr); + for (Index i = 0; i < operandCount; ++i) { - // This cloning style only works on insts that are not unique - auto clonedType = cloneType(inst->getFullType()); - - Index operandCount = Index(inst->getOperandCount()); - clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr); - for (Index i = 0; i < operandCount; ++i) - { - auto cloneOperand = cloneInst(inst->getOperand(i)); - clone->getOperands()[i].init(clone, cloneOperand); - } + auto cloneOperand = cloneInst(inst->getOperand(i)); + clone->getOperands()[i].init(clone, cloneOperand); } } - break; } } @@ -226,6 +250,10 @@ void IRTypeSet::getTypes(Kind kind, List<IRType*>& outTypes) const IRType* IRTypeSet::addVectorType(IRType* inElementType, int colsCount) { IRType* elementType = cloneType(inElementType); + if (colsCount == 1) + { + return elementType; + } return m_builder.getVectorType(elementType, m_builder.getIntValue(m_builder.getIntType(), colsCount)); } diff --git a/source/slang/slang-ir-type-set.h b/source/slang/slang-ir-type-set.h index 09abdf2ad..958d71cf1 100644 --- a/source/slang/slang-ir-type-set.h +++ b/source/slang/slang-ir-type-set.h @@ -34,7 +34,10 @@ works, but probably needs to be handled in a better way. The better way may invo enabled in other code generation and making de-duping possible in emit code. Note that one pro for this approach is that it does not alter the source module. That as it stands it's not necessary -for the source module to be immutable, because it is created for emitting and then discarded. +for the source module to be immutable, because it is created for emitting and then discarded. + +NOTE! That Vector<X, 1> or Matrix<X, 1, 1> will be turned into the type X. + */ class IRTypeSet { diff --git a/tests/compute/texture-simple.slang b/tests/compute/texture-simple.slang new file mode 100644 index 000000000..040af2784 --- /dev/null +++ b/tests/compute/texture-simple.slang @@ -0,0 +1,31 @@ +//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil +// TODO(JS): Doesn't work on vk currently +//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute + +//TEST_INPUT: Texture1D(size=4, content = one):name t1D +Texture1D<float> t1D; +//TEST_INPUT: Texture2D(size=4, content = one):name t2D +Texture2D<float> t2D; + +//TEST_INPUT: Sampler:name samplerState +SamplerState samplerState; + +//TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<float> outputBuffer; + +[numthreads(4, 4, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + int idx = dispatchThreadID.x; + float u = idx * (1.0f / 4); + + float val = 0.0f; + val += t1D.SampleLevel(samplerState, u, 0); + val += t2D.SampleLevel(samplerState, float2(u, u), 0); + + outputBuffer[idx] = val; +} diff --git a/tests/compute/texture-simple.slang.expected.txt b/tests/compute/texture-simple.slang.expected.txt new file mode 100644 index 000000000..f5cf6fb10 --- /dev/null +++ b/tests/compute/texture-simple.slang.expected.txt @@ -0,0 +1,4 @@ +40000000 +40000000 +40000000 +40000000 diff --git a/tools/render-test/cpu-compute-util.cpp b/tools/render-test/cpu-compute-util.cpp index 2bb0baf88..d0907482c 100644 --- a/tools/render-test/cpu-compute-util.cpp +++ b/tools/render-test/cpu-compute-util.cpp @@ -16,48 +16,106 @@ namespace renderer_test { using namespace Slang; template <int COUNT> -struct OneTexture2D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture2D +struct ValueTexture2D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture2D { - void setOne(void* out) + void set(void* out) { float* dst = (float*)out; for (int i = 0; i < COUNT; ++i) { - dst[i] = 1.0f; + dst[i] = m_value; } } virtual void Load(const CPPPrelude::int3& v, void* out) SLANG_OVERRIDE { - setOne(out); + set(out); } virtual void Sample(CPPPrelude::SamplerState samplerState, const CPPPrelude::float2& loc, void* out) SLANG_OVERRIDE { - setOne(out); + set(out); } virtual void SampleLevel(CPPPrelude::SamplerState samplerState, const CPPPrelude::float2& loc, float level, void* out) SLANG_OVERRIDE { - setOne(out); + set(out); } - OneTexture2D() + ValueTexture2D(float value): + m_value(value) { m_interface = static_cast<CPPPrelude::ITexture2D*>(this); } + + float m_value; }; -static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount) +template <int COUNT> +struct ValueTexture1D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture1D { - switch (elemCount) + void set(void* out) + { + float* dst = (float*)out; + for (int i = 0; i < COUNT; ++i) + { + dst[i] = m_value; + } + } + + virtual void Load(const CPPPrelude::int2& v, void* out) SLANG_OVERRIDE { - case 1: return new OneTexture2D<1>(); - case 2: return new OneTexture2D<2>(); - case 3: return new OneTexture2D<3>(); - case 4: return new OneTexture2D<4>(); - default: return nullptr; + set(out); } + virtual void Sample(CPPPrelude::SamplerState samplerState, float loc, void* out) SLANG_OVERRIDE + { + set(out); + } + virtual void SampleLevel(CPPPrelude::SamplerState samplerState, float loc, float level, void* out) SLANG_OVERRIDE + { + set(out); + } + + ValueTexture1D(float value) : + m_value(value) + { + m_interface = static_cast<CPPPrelude::ITexture1D*>(this); + } + + float m_value; +}; + +static CPUComputeUtil::Resource* _newValueTexture(SlangResourceShape baseShape, int elemCount, float value) +{ + switch (baseShape) + { + case SLANG_TEXTURE_1D: + { + switch (elemCount) + { + case 1: return new ValueTexture1D<1>(value); + case 2: return new ValueTexture1D<2>(value); + case 3: return new ValueTexture1D<3>(value); + case 4: return new ValueTexture1D<4>(value); + default: break; + } + break; + } + case SLANG_TEXTURE_2D: + { + switch (elemCount) + { + case 1: return new ValueTexture2D<1>(value); + case 2: return new ValueTexture2D<2>(value); + case 3: return new ValueTexture2D<3>(value); + case 4: return new ValueTexture2D<4>(value); + default: break; + } + } + default: break; + } + return nullptr; } + /* static */SlangResult CPUComputeUtil::calcBindings(const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& outContext) { auto request = compilationAndLayout.output.request; @@ -109,13 +167,16 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount) //auto access = type->getResourceAccess(); - switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK) + auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK; + switch (baseShape) { + case SLANG_TEXTURE_1D: case SLANG_TEXTURE_2D: { SLANG_ASSERT(value->m_userIndex >= 0); auto& srcEntry = layout.entries[value->m_userIndex]; + // TODO(JS): // We should use the srcEntry to determine what data to store in the texture, // it's dimensions etc. For now we just support it being 1. @@ -128,12 +189,23 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount) count = int(typeReflection->getElementCount()); } - // TODO(JS): Should use the input setup to work how to create this texture - // Store the target specific value - value->m_target = _newOneTexture2D(count); + switch (srcEntry.textureDesc.content) + { + case InputTextureContent::One: + { + value->m_target = _newValueTexture(baseShape, count, 1.0f); + break; + } + case InputTextureContent::Zero: + { + value->m_target = _newValueTexture(baseShape, count, 0.0f); + break; + } + default: break; + } break; } - case SLANG_TEXTURE_1D: + case SLANG_TEXTURE_3D: case SLANG_TEXTURE_CUBE: case SLANG_TEXTURE_BUFFER: diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp index c6862d2d3..f471c2961 100644 --- a/tools/render-test/cuda/cuda-compute-util.cpp +++ b/tools/render-test/cuda/cuda-compute-util.cpp @@ -9,6 +9,7 @@ #include "../bind-location.h" #include <cuda.h> + #include <cuda_runtime_api.h> namespace renderer_test { @@ -33,14 +34,11 @@ public: typedef RefObject Super; /// Dtor - CUDAResource(): m_cudaMemory(nullptr) {} - CUDAResource(void* cudaMemory): m_cudaMemory(cudaMemory) {} - ~CUDAResource() { if (m_cudaMemory) { - SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory)); + SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree(m_cudaMemory)); } } @@ -49,13 +47,13 @@ public: return value ? dynamic_cast<CUDAResource*>(value->m_target.Ptr()) : nullptr; } /// Helper function to get the cuda memory pointer when given a value - static void* getCUDAData(BindSet::Value* value) + static CUdeviceptr getCUDAData(BindSet::Value* value) { auto resource = getCUDAResource(value); - return resource ? resource->m_cudaMemory : nullptr; + return resource ? resource->m_cudaMemory : CUdeviceptr(); } - void* m_cudaMemory; + CUdeviceptr m_cudaMemory = CUdeviceptr(); }; class CUDATextureResource : public RefObject @@ -63,23 +61,12 @@ class CUDATextureResource : public RefObject public: typedef RefObject Super; - CUDATextureResource() {} - CUDATextureResource(CUtexObject cudaTexObj, CUdeviceptr cudaMemory, CUarray cudaArray): - m_cudaTexObj(cudaTexObj), - m_cudaMemory(cudaMemory), - m_cudaArray(cudaArray) - { - } ~CUDATextureResource() { if (m_cudaTexObj) { SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj)); } - if (m_cudaMemory) - { - SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree(m_cudaMemory)); - } if (m_cudaArray) { SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray)); @@ -98,10 +85,8 @@ public: return resource ? resource->m_cudaTexObj : CUtexObject(0); } -protected: // This is an opaque type, that's backed by a long long CUtexObject m_cudaTexObj = CUtexObject(); - CUdeviceptr m_cudaMemory = CUdeviceptr(); CUarray m_cudaArray = CUarray(); }; @@ -140,7 +125,7 @@ public: { release(); SLANG_ASSERT(m_stream == nullptr); - SLANG_CUDA_RETURN_ON_FAIL(cudaStreamCreateWithFlags(&m_stream, flags)); + SLANG_CUDA_RETURN_ON_FAIL(cuStreamCreate(&m_stream, flags)); return SLANG_OK; } @@ -148,7 +133,7 @@ public: { if (m_stream) { - SLANG_CUDA_RETURN_ON_FAIL(cudaStreamSynchronize(m_stream)); + SLANG_CUDA_RETURN_ON_FAIL(cuStreamSynchronize(m_stream)); } else { @@ -162,7 +147,7 @@ public: if (m_stream) { sync(); - SLANG_CUDA_ASSERT_ON_FAIL(cudaStreamDestroy(m_stream)); + SLANG_CUDA_ASSERT_ON_FAIL(cuStreamDestroy(m_stream)); m_stream = nullptr; } } @@ -171,9 +156,9 @@ public: ~ScopeCUDAStream() { release(); } - operator cudaStream_t () const { return m_stream; } + operator CUstream () const { return m_stream; } - cudaStream_t m_stream; + CUstream m_stream; }; @@ -408,10 +393,9 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp case slang::TypeReflection::Kind::ParameterBlock: { // We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first - - void* cudaMem = nullptr; - SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes)); - value->m_target = new CUDAResource(cudaMem); + RefPtr<CUDAResource> resource = new CUDAResource; + SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes)); + value->m_target = resource; break; } case slang::TypeReflection::Kind::Resource: @@ -419,11 +403,15 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp auto type = typeLayout->getType(); auto shape = type->getResourceShape(); - //auto access = type->getResourceAccess(); + auto access = type->getResourceAccess(); - switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK) + auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK; + + switch (baseShape) { + case SLANG_TEXTURE_1D: case SLANG_TEXTURE_2D: + case SLANG_TEXTURE_3D: { SLANG_ASSERT(value->m_userIndex >= 0); auto& srcEntry = entries[value->m_userIndex]; @@ -439,18 +427,38 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp const auto& textureDesc = srcEntry.textureDesc; int width = textureDesc.size; - int height = textureDesc.size; + int height = 1; + int depth = 1; + switch (baseShape) + { + case SLANG_TEXTURE_1D: break; + case SLANG_TEXTURE_2D: + { + height = textureDesc.size; + break; + } + case SLANG_TEXTURE_3D: + { + height = textureDesc.size; + depth = textureDesc.size; + break; + } + } + TextureData texData; generateTextureData(texData, textureDesc); + RefPtr<CUDATextureResource> tex = new CUDATextureResource; + size_t elementSize = 0; - CUarray cudaArray; { CUDA_ARRAY_DESCRIPTOR arrayDesc; arrayDesc.Width = width; - arrayDesc.Height = height; + + // Width, and Height are the width, and height of the CUDA array (in elements); the CUDA array is one-dimensional if height is 0, two-dimensional otherwise; + arrayDesc.Height = (baseShape == SLANG_TEXTURE_1D) ? 0 : height; switch (textureDesc.format) { @@ -476,37 +484,44 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp } // Allocate the array - SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&cudaArray, &arrayDesc)); - } - - CUdeviceptr cudaMemory = (CUdeviceptr)nullptr; - { - const size_t size = width * height * elementSize; - // allocate device memory for result - SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&cudaMemory, size)); + SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc)); } + switch (baseShape) { - CUDA_MEMCPY2D copyParam; - memset(©Param, 0, sizeof(copyParam)); - copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; - copyParam.dstArray = cudaArray; - copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; - copyParam.srcHost = texData.dataBuffer[0].getBuffer(); - copyParam.srcPitch = width * elementSize; - copyParam.WidthInBytes = copyParam.srcPitch; - copyParam.Height = height; - SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); + case SLANG_TEXTURE_1D: + case SLANG_TEXTURE_2D: + { + // TODO(JS): + // Not clear how the copy should be done for 1D, but seeing as it is copying to an 'array' + // doing it with cuMemcpy2D is appropriate. + // Not clear if the height should be 0 or 1. The array required it to be 0. + CUDA_MEMCPY2D copyParam; + memset(©Param, 0, sizeof(copyParam)); + copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; + copyParam.dstArray = tex->m_cudaArray; + copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; + copyParam.srcHost = texData.dataBuffer[0].getBuffer(); + copyParam.srcPitch = width * elementSize; + copyParam.WidthInBytes = copyParam.srcPitch; + copyParam.Height = height; + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(©Param)); + break; + } + case SLANG_TEXTURE_3D: + { + SLANG_ASSERT(!"Not implemented"); + break; + } } // set texture parameters - CUtexObject cudaTexObj; { CUDA_RESOURCE_DESC resDesc; memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC)); resDesc.resType = CU_RESOURCE_TYPE_ARRAY; - resDesc.res.array.hArray = cudaArray; + resDesc.res.array.hArray = tex->m_cudaArray; CUDA_TEXTURE_DESC texDesc; memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC)); @@ -516,14 +531,13 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR; texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&cudaTexObj, &resDesc, &texDesc, nullptr)); + SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr)); } - value->m_target = new CUDATextureResource(cudaTexObj, cudaMemory, cudaArray); + value->m_target = tex; break; } - case SLANG_TEXTURE_1D: - case SLANG_TEXTURE_3D: + case SLANG_TEXTURE_CUBE: case SLANG_TEXTURE_BUFFER: { @@ -536,10 +550,9 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp case SLANG_STRUCTURED_BUFFER: { // On CPU we just use the memory in the BindSet buffer, so don't need to create anything - void* cudaMem = nullptr; - SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes)); - value->m_target = new CUDAResource(cudaMem); - + RefPtr<CUDAResource> resource = new CUDAResource; + SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes)); + value->m_target = resource; break; } } @@ -572,7 +585,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp auto elementCount = int(typeLayout->getElementCount()); if (elementCount == 0) { - CUDAComputeUtil::Array array = { nullptr, 0 }; + CUDAComputeUtil::Array array = { CUdeviceptr(), 0 }; auto resource = CUDAResource::getCUDAResource(value); if (resource) { @@ -588,7 +601,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp case slang::TypeReflection::Kind::ParameterBlock: { // These map down to just pointers - *location.getUniform<void*>() = CUDAResource::getCUDAData(value); + *location.getUniform<CUdeviceptr>() = CUDAResource::getCUDAData(value); break; } case slang::TypeReflection::Kind::Resource: @@ -602,7 +615,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp { case SLANG_STRUCTURED_BUFFER: { - CUDAComputeUtil::StructuredBuffer buffer = { nullptr, 0 }; + CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 }; auto resource = CUDAResource::getCUDAResource(value); if (resource) { @@ -615,7 +628,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp } case SLANG_BYTE_ADDRESS_BUFFER: { - CUDAComputeUtil::ByteAddressBuffer buffer = { nullptr, 0 }; + CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 }; auto resource = CUDAResource::getCUDAResource(value); if (resource) @@ -649,11 +662,11 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp const auto& values = bindSet.getValues(); for (BindSet::Value* value : values) { - void* cudaMem = CUDAResource::getCUDAData(value); + CUdeviceptr cudaMem = CUDAResource::getCUDAData(value); if (value && value->m_data && cudaMem) { // Okay copy the data over... - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(cudaMem, value->m_data, value->m_sizeInBytes, cudaMemcpyHostToDevice)); + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes)); } } } @@ -670,8 +683,8 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel)); // Work out the args - void* uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue()); - void* entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue()); + CUdeviceptr uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue()); + CUdeviceptr entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue()); // NOTE! These are pointers to the cuda memory pointers void* args[] = { &entryPointCUDAData , &uniformCUDAData }; @@ -683,10 +696,10 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp auto cudaLaunchResult = cuLaunchKernel(kernel, dispatchSize[0], dispatchSize[1], dispatchSize[2], int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block - 0, // Shared memory size - cudaStream, // Stream. 0 is no stream. - args, // Args - nullptr); // extra + 0, // Shared memory size + cudaStream, // Stream. 0 is no stream. + args, // Args + nullptr); // extra SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult); @@ -707,11 +720,11 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp if (entry.isOutput) { // Copy back to CPU memory - void* cudaMem = CUDAResource::getCUDAData(value); + CUdeviceptr cudaMem = CUDAResource::getCUDAData(value); if (value && value->m_data && cudaMem) { // Okay copy the data back... - SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(value->m_data, cudaMem, value->m_sizeInBytes, cudaMemcpyDeviceToHost)); + SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes)); } } } diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h index 8965f5037..f1ca65502 100644 --- a/tools/render-test/cuda/cuda-compute-util.h +++ b/tools/render-test/cuda/cuda-compute-util.h @@ -8,22 +8,26 @@ namespace renderer_test { + struct CUDAComputeUtil { + // Define here, so we don't need to include the cude header + typedef size_t CUdeviceptr; + /// NOTE! MUST match up to definitions in the CUDA prelude struct ByteAddressBuffer { - void* data; + CUdeviceptr data; size_t sizeInBytes; }; struct StructuredBuffer { - void* data; + CUdeviceptr data; size_t count; }; struct Array { - void* data; + CUdeviceptr data; size_t count; }; |
