summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--prelude/slang-cpp-types.h17
-rw-r--r--source/slang/core.meta.slang45
-rw-r--r--source/slang/core.meta.slang.h47
-rw-r--r--source/slang/hlsl.meta.slang5
-rw-r--r--source/slang/hlsl.meta.slang.h7
-rw-r--r--source/slang/slang-ir-type-set.cpp92
-rw-r--r--source/slang/slang-ir-type-set.h5
-rw-r--r--tests/compute/texture-simple.slang31
-rw-r--r--tests/compute/texture-simple.slang.expected.txt4
-rw-r--r--tools/render-test/cpu-compute-util.cpp110
-rw-r--r--tools/render-test/cuda/cuda-compute-util.cpp165
-rw-r--r--tools/render-test/cuda/cuda-compute-util.h10
12 files changed, 371 insertions, 167 deletions
diff --git a/prelude/slang-cpp-types.h b/prelude/slang-cpp-types.h
index 67db607f6..2238727c5 100644
--- a/prelude/slang-cpp-types.h
+++ b/prelude/slang-cpp-types.h
@@ -228,6 +228,23 @@ struct SamplerComparisonState
// Texture
+struct ITexture1D
+{
+ virtual void Load(const int2& v, void* out) = 0;
+ virtual void Sample(SamplerState samplerState, float loc, void* out) = 0;
+ virtual void SampleLevel(SamplerState samplerState, float loc, float level, void* out) = 0;
+};
+
+template <typename T>
+struct Texture1D
+{
+ T Load(const int2& v) const { T out; texture->Load(v, &out); return out; }
+ T Sample(SamplerState samplerState, float v) const { T out; texture->Sample(samplerState, v, &out); return out; }
+ T SampleLevel(SamplerState samplerState, float v, float level) { T out; texture->SampleLevel(samplerState, v, level, &out); return out; }
+
+ ITexture1D* texture;
+};
+
struct ITexture2D
{
virtual void Load(const int3& v, void* out) = 0;
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 6efb383fa..450cc4512 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -897,19 +897,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
if( baseShape != TextureFlavor::Shape::ShapeCube )
{
sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0";
- if (kBaseTextureTypes[tt].coordCount == 1)
- {
- sb << ", $2";
- }
- else
+ for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
{
- for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+ sb << ", ($2)";
+ if (kBaseTextureTypes[tt].coordCount > 1)
{
- sb << ", ($2)." << char(i + 'x');
+ sb << '.' << char(i + 'x');
}
}
sb << ")\")\n";
}
+ else
+ {
+ sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
+ }
sb << "T Sample(SamplerState s, ";
sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n";
@@ -1028,7 +1029,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n";
}
-
+ // TODO(JS): Not clear how to map this to CUDA, because in HLSL, the gradient is a vector based on
+ // the dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy.
+ // I could just assume 0 for elements not supplied, and ignore z. For now will just leave
sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n";
sb << "T SampleGrad(SamplerState s, ";
sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
@@ -1053,23 +1056,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
// CUDA
if (!isArray)
{
- sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
- for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+ if( baseShape != TextureFlavor::Shape::ShapeCube )
{
- sb << ", $2";
- if (kBaseTextureTypes[tt].coordCount > 1)
+ sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
+ for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
{
- sb << '.' << char(i + 'x');
+ sb << ", ($2)";
+ if (kBaseTextureTypes[tt].coordCount > 1)
+ {
+ sb << '.' << char(i + 'x');
+ }
}
+ sb << ", $3)\")\n";
+ }
+ else
+ {
+ sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
}
- sb << ", $3)\")\n";
}
sb << "T SampleLevel(SamplerState s, ";
sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
sb << "float level);\n";
-
if( baseShape != TextureFlavor::Shape::ShapeCube )
{
sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n";
@@ -1145,6 +1154,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
EMIT_LINE_DIRECTIVE();
sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n";
+ if (kBaseTextureTypes[tt].coordCount == 2)
+ {
+ // Gather only works on 2D in CUDA
+ // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4."
+ sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n";
+ }
sb << outputType << " Gather" << componentName << "(SamplerState s, ";
sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n";
diff --git a/source/slang/core.meta.slang.h b/source/slang/core.meta.slang.h
index 5f185ca8f..cca8f2e51 100644
--- a/source/slang/core.meta.slang.h
+++ b/source/slang/core.meta.slang.h
@@ -918,19 +918,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
if( baseShape != TextureFlavor::Shape::ShapeCube )
{
sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0";
- if (kBaseTextureTypes[tt].coordCount == 1)
- {
- sb << ", $2";
- }
- else
+ for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
{
- for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+ sb << ", ($2)";
+ if (kBaseTextureTypes[tt].coordCount > 1)
{
- sb << ", ($2)." << char(i + 'x');
+ sb << '.' << char(i + 'x');
}
}
sb << ")\")\n";
}
+ else
+ {
+ sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
+ }
sb << "T Sample(SamplerState s, ";
sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n";
@@ -1049,7 +1050,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n";
}
-
+ // TODO(JS): Not clear how to map this to CUDA, because in HLSL, the gradient is a vector based on
+ // the dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy.
+ // I could just assume 0 for elements not supplied, and ignore z. For now will just leave
sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n";
sb << "T SampleGrad(SamplerState s, ";
sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
@@ -1074,23 +1077,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
// CUDA
if (!isArray)
{
- sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
- for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+ if( baseShape != TextureFlavor::Shape::ShapeCube )
{
- sb << ", $2";
- if (kBaseTextureTypes[tt].coordCount > 1)
+ sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
+ for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
{
- sb << '.' << char(i + 'x');
+ sb << ", ($2)";
+ if (kBaseTextureTypes[tt].coordCount > 1)
+ {
+ sb << '.' << char(i + 'x');
+ }
}
+ sb << ", $3)\")\n";
+ }
+ else
+ {
+ sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
}
- sb << ", $3)\")\n";
}
sb << "T SampleLevel(SamplerState s, ";
sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
sb << "float level);\n";
-
if( baseShape != TextureFlavor::Shape::ShapeCube )
{
sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n";
@@ -1166,6 +1175,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
EMIT_LINE_DIRECTIVE();
sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n";
+ if (kBaseTextureTypes[tt].coordCount == 2)
+ {
+ // Gather only works on 2D in CUDA
+ // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4."
+ sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n";
+ }
sb << outputType << " Gather" << componentName << "(SamplerState s, ";
sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n";
@@ -1299,7 +1314,7 @@ for (auto op : binaryOps)
sb << "__intrinsic_op(" << int(op.opCode) << ") matrix<" << resultType << ",N,M> operator" << op.opName << "(" << leftQual << "matrix<" << leftType << ",N,M> left, " << rightType << " right);\n";
}
}
-SLANG_RAW("#line 1281 \"core.meta.slang\"")
+SLANG_RAW("#line 1296 \"core.meta.slang\"")
SLANG_RAW("\n")
SLANG_RAW("\n")
SLANG_RAW("// Specialized function\n")
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index f8ae340bc..f7707cc6d 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -347,8 +347,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> bool all(matrix<T,N,M> x)
// Barrier for writes to all memory spaces (HLSL SM 5.0)
__target_intrinsic(glsl, "memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()")
-// TODO(JS): Doesn't seem to be weaker form of sync, so use this?
-__target_intrinsic(cuda, "__syncthreads()")
+__target_intrinsic(cuda, "__threadfence()")
void AllMemoryBarrier();
// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)
@@ -648,6 +647,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int> T determinant(matrix<T,N,
// Barrier for device memory
__target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()")
+__target_intrinsic(cuda, "__threadfence()")
void DeviceMemoryBarrier();
__target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()")
@@ -814,6 +814,7 @@ float2 GetRenderTargetSamplePosition(int Index);
// Group memory barrier
__target_intrinsic(glsl, "groupMemoryBarrier")
+__target_intrinsic(cuda, "__threadfence_block")
void GroupMemoryBarrier();
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index 215d18670..c0b875df5 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -396,8 +396,7 @@ SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> bool all(matri
SLANG_RAW("\n")
SLANG_RAW("// Barrier for writes to all memory spaces (HLSL SM 5.0)\n")
SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n")
-SLANG_RAW("// TODO(JS): Doesn't seem to be weaker form of sync, so use this?\n")
-SLANG_RAW("__target_intrinsic(cuda, \"__syncthreads()\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n")
SLANG_RAW("void AllMemoryBarrier();\n")
SLANG_RAW("\n")
SLANG_RAW("// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)\n")
@@ -724,6 +723,7 @@ SLANG_RAW("__generic<T : __BuiltinFloatingPointType, let N : int> T determinant(
SLANG_RAW("\n")
SLANG_RAW("// Barrier for device memory\n")
SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n")
SLANG_RAW("void DeviceMemoryBarrier();\n")
SLANG_RAW("\n")
SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()\")\n")
@@ -890,6 +890,7 @@ SLANG_RAW("float2 GetRenderTargetSamplePosition(int Index);\n")
SLANG_RAW("\n")
SLANG_RAW("// Group memory barrier\n")
SLANG_RAW("__target_intrinsic(glsl, \"groupMemoryBarrier\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__threadfence_block\")\n")
SLANG_RAW("void GroupMemoryBarrier();\n")
SLANG_RAW("\n")
SLANG_RAW("\n")
@@ -1641,7 +1642,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
sb << "};\n";
}
-SLANG_RAW("#line 1568 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1569 \"hlsl.meta.slang\"")
SLANG_RAW("\n")
SLANG_RAW("\n")
SLANG_RAW("\n")
diff --git a/source/slang/slang-ir-type-set.cpp b/source/slang/slang-ir-type-set.cpp
index a4ebf8242..e5271698c 100644
--- a/source/slang/slang-ir-type-set.cpp
+++ b/source/slang/slang-ir-type-set.cpp
@@ -115,50 +115,74 @@ IRInst* IRTypeSet::cloneInst(IRInst* inst)
clone = m_builder.getStringValue(stringLit->getStringSlice());
break;
}
- default:
+ case kIROp_VectorType:
{
- if (IRBasicType::isaImpl(inst->op))
+ auto vecType = static_cast<IRVectorType*>(inst);
+ const Index elementCount = Index(GetIntVal(vecType->getElementCount()));
+
+ if (elementCount <= 1)
{
- clone = m_builder.getType(inst->op);
+ clone = cloneType(vecType->getElementType());
}
- else
+ break;
+ }
+ case kIROp_MatrixType:
+ {
+ auto matType = static_cast<IRMatrixType*>(inst);
+ const Index columnCount = Index(GetIntVal(matType->getColumnCount()));
+ const Index rowCount = Index(GetIntVal(matType->getRowCount()));
+
+ if (columnCount <= 1 && rowCount <= 1)
{
- IRType* irType = dynamicCast<IRType>(inst);
- if (irType)
- {
- auto clonedType = cloneType(inst->getFullType());
- Index operandCount = Index(inst->getOperandCount());
+ clone = cloneType(matType->getElementType());
+ }
+ break;
+ }
+ default: break;
+ }
+
+ if (!clone)
+ {
+ if (IRBasicType::isaImpl(inst->op))
+ {
+ clone = m_builder.getType(inst->op);
+ }
+ else
+ {
+ IRType* irType = dynamicCast<IRType>(inst);
+ if (irType)
+ {
+ auto clonedType = cloneType(inst->getFullType());
+ Index operandCount = Index(inst->getOperandCount());
- List<IRInst*> cloneOperands;
- cloneOperands.setCount(operandCount);
+ List<IRInst*> cloneOperands;
+ cloneOperands.setCount(operandCount);
- for (Index i = 0; i < operandCount; ++i)
- {
- cloneOperands[i] = cloneInst(inst->getOperand(i));
- }
+ for (Index i = 0; i < operandCount; ++i)
+ {
+ cloneOperands[i] = cloneInst(inst->getOperand(i));
+ }
- //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer());
+ //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer());
- UInt operandCounts[1] = { UInt(operandCount) };
- IRInst*const* listOperands[1] = { cloneOperands.getBuffer() };
+ UInt operandCounts[1] = { UInt(operandCount) };
+ IRInst*const* listOperands[1] = { cloneOperands.getBuffer() };
- clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands);
- }
- else
+ clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands);
+ }
+ else
+ {
+ // This cloning style only works on insts that are not unique
+ auto clonedType = cloneType(inst->getFullType());
+
+ Index operandCount = Index(inst->getOperandCount());
+ clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr);
+ for (Index i = 0; i < operandCount; ++i)
{
- // This cloning style only works on insts that are not unique
- auto clonedType = cloneType(inst->getFullType());
-
- Index operandCount = Index(inst->getOperandCount());
- clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr);
- for (Index i = 0; i < operandCount; ++i)
- {
- auto cloneOperand = cloneInst(inst->getOperand(i));
- clone->getOperands()[i].init(clone, cloneOperand);
- }
+ auto cloneOperand = cloneInst(inst->getOperand(i));
+ clone->getOperands()[i].init(clone, cloneOperand);
}
}
- break;
}
}
@@ -226,6 +250,10 @@ void IRTypeSet::getTypes(Kind kind, List<IRType*>& outTypes) const
IRType* IRTypeSet::addVectorType(IRType* inElementType, int colsCount)
{
IRType* elementType = cloneType(inElementType);
+ if (colsCount == 1)
+ {
+ return elementType;
+ }
return m_builder.getVectorType(elementType, m_builder.getIntValue(m_builder.getIntType(), colsCount));
}
diff --git a/source/slang/slang-ir-type-set.h b/source/slang/slang-ir-type-set.h
index 09abdf2ad..958d71cf1 100644
--- a/source/slang/slang-ir-type-set.h
+++ b/source/slang/slang-ir-type-set.h
@@ -34,7 +34,10 @@ works, but probably needs to be handled in a better way. The better way may invo
enabled in other code generation and making de-duping possible in emit code.
Note that one pro for this approach is that it does not alter the source module. That as it stands it's not necessary
-for the source module to be immutable, because it is created for emitting and then discarded.
+for the source module to be immutable, because it is created for emitting and then discarded.
+
+NOTE! That Vector<X, 1> or Matrix<X, 1, 1> will be turned into the type X.
+
*/
class IRTypeSet
{
diff --git a/tests/compute/texture-simple.slang b/tests/compute/texture-simple.slang
new file mode 100644
index 000000000..040af2784
--- /dev/null
+++ b/tests/compute/texture-simple.slang
@@ -0,0 +1,31 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil
+// TODO(JS): Doesn't work on vk currently
+//DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT: Texture1D(size=4, content = one):name t1D
+Texture1D<float> t1D;
+//TEST_INPUT: Texture2D(size=4, content = one):name t2D
+Texture2D<float> t2D;
+
+//TEST_INPUT: Sampler:name samplerState
+SamplerState samplerState;
+
+//TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+[numthreads(4, 4, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ int idx = dispatchThreadID.x;
+ float u = idx * (1.0f / 4);
+
+ float val = 0.0f;
+ val += t1D.SampleLevel(samplerState, u, 0);
+ val += t2D.SampleLevel(samplerState, float2(u, u), 0);
+
+ outputBuffer[idx] = val;
+}
diff --git a/tests/compute/texture-simple.slang.expected.txt b/tests/compute/texture-simple.slang.expected.txt
new file mode 100644
index 000000000..f5cf6fb10
--- /dev/null
+++ b/tests/compute/texture-simple.slang.expected.txt
@@ -0,0 +1,4 @@
+40000000
+40000000
+40000000
+40000000
diff --git a/tools/render-test/cpu-compute-util.cpp b/tools/render-test/cpu-compute-util.cpp
index 2bb0baf88..d0907482c 100644
--- a/tools/render-test/cpu-compute-util.cpp
+++ b/tools/render-test/cpu-compute-util.cpp
@@ -16,48 +16,106 @@ namespace renderer_test {
using namespace Slang;
template <int COUNT>
-struct OneTexture2D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture2D
+struct ValueTexture2D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture2D
{
- void setOne(void* out)
+ void set(void* out)
{
float* dst = (float*)out;
for (int i = 0; i < COUNT; ++i)
{
- dst[i] = 1.0f;
+ dst[i] = m_value;
}
}
virtual void Load(const CPPPrelude::int3& v, void* out) SLANG_OVERRIDE
{
- setOne(out);
+ set(out);
}
virtual void Sample(CPPPrelude::SamplerState samplerState, const CPPPrelude::float2& loc, void* out) SLANG_OVERRIDE
{
- setOne(out);
+ set(out);
}
virtual void SampleLevel(CPPPrelude::SamplerState samplerState, const CPPPrelude::float2& loc, float level, void* out) SLANG_OVERRIDE
{
- setOne(out);
+ set(out);
}
- OneTexture2D()
+ ValueTexture2D(float value):
+ m_value(value)
{
m_interface = static_cast<CPPPrelude::ITexture2D*>(this);
}
+
+ float m_value;
};
-static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount)
+template <int COUNT>
+struct ValueTexture1D : public CPUComputeUtil::Resource, public CPPPrelude::ITexture1D
{
- switch (elemCount)
+ void set(void* out)
+ {
+ float* dst = (float*)out;
+ for (int i = 0; i < COUNT; ++i)
+ {
+ dst[i] = m_value;
+ }
+ }
+
+ virtual void Load(const CPPPrelude::int2& v, void* out) SLANG_OVERRIDE
{
- case 1: return new OneTexture2D<1>();
- case 2: return new OneTexture2D<2>();
- case 3: return new OneTexture2D<3>();
- case 4: return new OneTexture2D<4>();
- default: return nullptr;
+ set(out);
}
+ virtual void Sample(CPPPrelude::SamplerState samplerState, float loc, void* out) SLANG_OVERRIDE
+ {
+ set(out);
+ }
+ virtual void SampleLevel(CPPPrelude::SamplerState samplerState, float loc, float level, void* out) SLANG_OVERRIDE
+ {
+ set(out);
+ }
+
+ ValueTexture1D(float value) :
+ m_value(value)
+ {
+ m_interface = static_cast<CPPPrelude::ITexture1D*>(this);
+ }
+
+ float m_value;
+};
+
+static CPUComputeUtil::Resource* _newValueTexture(SlangResourceShape baseShape, int elemCount, float value)
+{
+ switch (baseShape)
+ {
+ case SLANG_TEXTURE_1D:
+ {
+ switch (elemCount)
+ {
+ case 1: return new ValueTexture1D<1>(value);
+ case 2: return new ValueTexture1D<2>(value);
+ case 3: return new ValueTexture1D<3>(value);
+ case 4: return new ValueTexture1D<4>(value);
+ default: break;
+ }
+ break;
+ }
+ case SLANG_TEXTURE_2D:
+ {
+ switch (elemCount)
+ {
+ case 1: return new ValueTexture2D<1>(value);
+ case 2: return new ValueTexture2D<2>(value);
+ case 3: return new ValueTexture2D<3>(value);
+ case 4: return new ValueTexture2D<4>(value);
+ default: break;
+ }
+ }
+ default: break;
+ }
+ return nullptr;
}
+
/* static */SlangResult CPUComputeUtil::calcBindings(const ShaderCompilerUtil::OutputAndLayout& compilationAndLayout, Context& outContext)
{
auto request = compilationAndLayout.output.request;
@@ -109,13 +167,16 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount)
//auto access = type->getResourceAccess();
- switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK)
+ auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
+ switch (baseShape)
{
+ case SLANG_TEXTURE_1D:
case SLANG_TEXTURE_2D:
{
SLANG_ASSERT(value->m_userIndex >= 0);
auto& srcEntry = layout.entries[value->m_userIndex];
+
// TODO(JS):
// We should use the srcEntry to determine what data to store in the texture,
// it's dimensions etc. For now we just support it being 1.
@@ -128,12 +189,23 @@ static CPUComputeUtil::Resource* _newOneTexture2D(int elemCount)
count = int(typeReflection->getElementCount());
}
- // TODO(JS): Should use the input setup to work how to create this texture
- // Store the target specific value
- value->m_target = _newOneTexture2D(count);
+ switch (srcEntry.textureDesc.content)
+ {
+ case InputTextureContent::One:
+ {
+ value->m_target = _newValueTexture(baseShape, count, 1.0f);
+ break;
+ }
+ case InputTextureContent::Zero:
+ {
+ value->m_target = _newValueTexture(baseShape, count, 0.0f);
+ break;
+ }
+ default: break;
+ }
break;
}
- case SLANG_TEXTURE_1D:
+
case SLANG_TEXTURE_3D:
case SLANG_TEXTURE_CUBE:
case SLANG_TEXTURE_BUFFER:
diff --git a/tools/render-test/cuda/cuda-compute-util.cpp b/tools/render-test/cuda/cuda-compute-util.cpp
index c6862d2d3..f471c2961 100644
--- a/tools/render-test/cuda/cuda-compute-util.cpp
+++ b/tools/render-test/cuda/cuda-compute-util.cpp
@@ -9,6 +9,7 @@
#include "../bind-location.h"
#include <cuda.h>
+
#include <cuda_runtime_api.h>
namespace renderer_test {
@@ -33,14 +34,11 @@ public:
typedef RefObject Super;
/// Dtor
- CUDAResource(): m_cudaMemory(nullptr) {}
- CUDAResource(void* cudaMemory): m_cudaMemory(cudaMemory) {}
-
~CUDAResource()
{
if (m_cudaMemory)
{
- SLANG_CUDA_ASSERT_ON_FAIL(cudaFree(m_cudaMemory));
+ SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree(m_cudaMemory));
}
}
@@ -49,13 +47,13 @@ public:
return value ? dynamic_cast<CUDAResource*>(value->m_target.Ptr()) : nullptr;
}
/// Helper function to get the cuda memory pointer when given a value
- static void* getCUDAData(BindSet::Value* value)
+ static CUdeviceptr getCUDAData(BindSet::Value* value)
{
auto resource = getCUDAResource(value);
- return resource ? resource->m_cudaMemory : nullptr;
+ return resource ? resource->m_cudaMemory : CUdeviceptr();
}
- void* m_cudaMemory;
+ CUdeviceptr m_cudaMemory = CUdeviceptr();
};
class CUDATextureResource : public RefObject
@@ -63,23 +61,12 @@ class CUDATextureResource : public RefObject
public:
typedef RefObject Super;
- CUDATextureResource() {}
- CUDATextureResource(CUtexObject cudaTexObj, CUdeviceptr cudaMemory, CUarray cudaArray):
- m_cudaTexObj(cudaTexObj),
- m_cudaMemory(cudaMemory),
- m_cudaArray(cudaArray)
- {
- }
~CUDATextureResource()
{
if (m_cudaTexObj)
{
SLANG_CUDA_ASSERT_ON_FAIL(cuTexObjectDestroy(m_cudaTexObj));
}
- if (m_cudaMemory)
- {
- SLANG_CUDA_ASSERT_ON_FAIL(cuMemFree(m_cudaMemory));
- }
if (m_cudaArray)
{
SLANG_CUDA_ASSERT_ON_FAIL(cuArrayDestroy(m_cudaArray));
@@ -98,10 +85,8 @@ public:
return resource ? resource->m_cudaTexObj : CUtexObject(0);
}
-protected:
// This is an opaque type, that's backed by a long long
CUtexObject m_cudaTexObj = CUtexObject();
- CUdeviceptr m_cudaMemory = CUdeviceptr();
CUarray m_cudaArray = CUarray();
};
@@ -140,7 +125,7 @@ public:
{
release();
SLANG_ASSERT(m_stream == nullptr);
- SLANG_CUDA_RETURN_ON_FAIL(cudaStreamCreateWithFlags(&m_stream, flags));
+ SLANG_CUDA_RETURN_ON_FAIL(cuStreamCreate(&m_stream, flags));
return SLANG_OK;
}
@@ -148,7 +133,7 @@ public:
{
if (m_stream)
{
- SLANG_CUDA_RETURN_ON_FAIL(cudaStreamSynchronize(m_stream));
+ SLANG_CUDA_RETURN_ON_FAIL(cuStreamSynchronize(m_stream));
}
else
{
@@ -162,7 +147,7 @@ public:
if (m_stream)
{
sync();
- SLANG_CUDA_ASSERT_ON_FAIL(cudaStreamDestroy(m_stream));
+ SLANG_CUDA_ASSERT_ON_FAIL(cuStreamDestroy(m_stream));
m_stream = nullptr;
}
}
@@ -171,9 +156,9 @@ public:
~ScopeCUDAStream() { release(); }
- operator cudaStream_t () const { return m_stream; }
+ operator CUstream () const { return m_stream; }
- cudaStream_t m_stream;
+ CUstream m_stream;
};
@@ -408,10 +393,9 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
case slang::TypeReflection::Kind::ParameterBlock:
{
// We can construct the buffers. We can't copy into yet, as we need to set all of the bindings first
-
- void* cudaMem = nullptr;
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes));
- value->m_target = new CUDAResource(cudaMem);
+ RefPtr<CUDAResource> resource = new CUDAResource;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
+ value->m_target = resource;
break;
}
case slang::TypeReflection::Kind::Resource:
@@ -419,11 +403,15 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
auto type = typeLayout->getType();
auto shape = type->getResourceShape();
- //auto access = type->getResourceAccess();
+ auto access = type->getResourceAccess();
- switch (shape & SLANG_RESOURCE_BASE_SHAPE_MASK)
+ auto baseShape = shape & SLANG_RESOURCE_BASE_SHAPE_MASK;
+
+ switch (baseShape)
{
+ case SLANG_TEXTURE_1D:
case SLANG_TEXTURE_2D:
+ case SLANG_TEXTURE_3D:
{
SLANG_ASSERT(value->m_userIndex >= 0);
auto& srcEntry = entries[value->m_userIndex];
@@ -439,18 +427,38 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
const auto& textureDesc = srcEntry.textureDesc;
int width = textureDesc.size;
- int height = textureDesc.size;
+ int height = 1;
+ int depth = 1;
+ switch (baseShape)
+ {
+ case SLANG_TEXTURE_1D: break;
+ case SLANG_TEXTURE_2D:
+ {
+ height = textureDesc.size;
+ break;
+ }
+ case SLANG_TEXTURE_3D:
+ {
+ height = textureDesc.size;
+ depth = textureDesc.size;
+ break;
+ }
+ }
+
TextureData texData;
generateTextureData(texData, textureDesc);
+ RefPtr<CUDATextureResource> tex = new CUDATextureResource;
+
size_t elementSize = 0;
- CUarray cudaArray;
{
CUDA_ARRAY_DESCRIPTOR arrayDesc;
arrayDesc.Width = width;
- arrayDesc.Height = height;
+
+ // Width, and Height are the width, and height of the CUDA array (in elements); the CUDA array is one-dimensional if height is 0, two-dimensional otherwise;
+ arrayDesc.Height = (baseShape == SLANG_TEXTURE_1D) ? 0 : height;
switch (textureDesc.format)
{
@@ -476,37 +484,44 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
}
// Allocate the array
- SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&cudaArray, &arrayDesc));
- }
-
- CUdeviceptr cudaMemory = (CUdeviceptr)nullptr;
- {
- const size_t size = width * height * elementSize;
- // allocate device memory for result
- SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&cudaMemory, size));
+ SLANG_CUDA_RETURN_ON_FAIL(cuArrayCreate(&tex->m_cudaArray, &arrayDesc));
}
+ switch (baseShape)
{
- CUDA_MEMCPY2D copyParam;
- memset(&copyParam, 0, sizeof(copyParam));
- copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
- copyParam.dstArray = cudaArray;
- copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
- copyParam.srcHost = texData.dataBuffer[0].getBuffer();
- copyParam.srcPitch = width * elementSize;
- copyParam.WidthInBytes = copyParam.srcPitch;
- copyParam.Height = height;
- SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+ case SLANG_TEXTURE_1D:
+ case SLANG_TEXTURE_2D:
+ {
+ // TODO(JS):
+ // Not clear how the copy should be done for 1D, but seeing as it is copying to an 'array'
+ // doing it with cuMemcpy2D is appropriate.
+ // Not clear if the height should be 0 or 1. The array required it to be 0.
+ CUDA_MEMCPY2D copyParam;
+ memset(&copyParam, 0, sizeof(copyParam));
+ copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+ copyParam.dstArray = tex->m_cudaArray;
+ copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
+ copyParam.srcHost = texData.dataBuffer[0].getBuffer();
+ copyParam.srcPitch = width * elementSize;
+ copyParam.WidthInBytes = copyParam.srcPitch;
+ copyParam.Height = height;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpy2D(&copyParam));
+ break;
+ }
+ case SLANG_TEXTURE_3D:
+ {
+ SLANG_ASSERT(!"Not implemented");
+ break;
+ }
}
// set texture parameters
- CUtexObject cudaTexObj;
{
CUDA_RESOURCE_DESC resDesc;
memset(&resDesc, 0, sizeof(CUDA_RESOURCE_DESC));
resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- resDesc.res.array.hArray = cudaArray;
+ resDesc.res.array.hArray = tex->m_cudaArray;
CUDA_TEXTURE_DESC texDesc;
memset(&texDesc, 0, sizeof(CUDA_TEXTURE_DESC));
@@ -516,14 +531,13 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
texDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
- SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&cudaTexObj, &resDesc, &texDesc, nullptr));
+ SLANG_CUDA_RETURN_ON_FAIL(cuTexObjectCreate(&tex->m_cudaTexObj, &resDesc, &texDesc, nullptr));
}
- value->m_target = new CUDATextureResource(cudaTexObj, cudaMemory, cudaArray);
+ value->m_target = tex;
break;
}
- case SLANG_TEXTURE_1D:
- case SLANG_TEXTURE_3D:
+
case SLANG_TEXTURE_CUBE:
case SLANG_TEXTURE_BUFFER:
{
@@ -536,10 +550,9 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
case SLANG_STRUCTURED_BUFFER:
{
// On CPU we just use the memory in the BindSet buffer, so don't need to create anything
- void* cudaMem = nullptr;
- SLANG_CUDA_RETURN_ON_FAIL(cudaMalloc(&cudaMem, value->m_sizeInBytes));
- value->m_target = new CUDAResource(cudaMem);
-
+ RefPtr<CUDAResource> resource = new CUDAResource;
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemAlloc(&resource->m_cudaMemory, value->m_sizeInBytes));
+ value->m_target = resource;
break;
}
}
@@ -572,7 +585,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
auto elementCount = int(typeLayout->getElementCount());
if (elementCount == 0)
{
- CUDAComputeUtil::Array array = { nullptr, 0 };
+ CUDAComputeUtil::Array array = { CUdeviceptr(), 0 };
auto resource = CUDAResource::getCUDAResource(value);
if (resource)
{
@@ -588,7 +601,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
case slang::TypeReflection::Kind::ParameterBlock:
{
// These map down to just pointers
- *location.getUniform<void*>() = CUDAResource::getCUDAData(value);
+ *location.getUniform<CUdeviceptr>() = CUDAResource::getCUDAData(value);
break;
}
case slang::TypeReflection::Kind::Resource:
@@ -602,7 +615,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
{
case SLANG_STRUCTURED_BUFFER:
{
- CUDAComputeUtil::StructuredBuffer buffer = { nullptr, 0 };
+ CUDAComputeUtil::StructuredBuffer buffer = { CUdeviceptr(), 0 };
auto resource = CUDAResource::getCUDAResource(value);
if (resource)
{
@@ -615,7 +628,7 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
}
case SLANG_BYTE_ADDRESS_BUFFER:
{
- CUDAComputeUtil::ByteAddressBuffer buffer = { nullptr, 0 };
+ CUDAComputeUtil::ByteAddressBuffer buffer = { CUdeviceptr(), 0 };
auto resource = CUDAResource::getCUDAResource(value);
if (resource)
@@ -649,11 +662,11 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
const auto& values = bindSet.getValues();
for (BindSet::Value* value : values)
{
- void* cudaMem = CUDAResource::getCUDAData(value);
+ CUdeviceptr cudaMem = CUDAResource::getCUDAData(value);
if (value && value->m_data && cudaMem)
{
// Okay copy the data over...
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(cudaMem, value->m_data, value->m_sizeInBytes, cudaMemcpyHostToDevice));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyHtoD(cudaMem, value->m_data, value->m_sizeInBytes));
}
}
}
@@ -670,8 +683,8 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
SLANG_CUDA_RETURN_ON_FAIL(cuFuncGetAttribute(&sharedSizeInBytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel));
// Work out the args
- void* uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue());
- void* entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue());
+ CUdeviceptr uniformCUDAData = CUDAResource::getCUDAData(bindRoot.getRootValue());
+ CUdeviceptr entryPointCUDAData = CUDAResource::getCUDAData(bindRoot.getEntryPointValue());
// NOTE! These are pointers to the cuda memory pointers
void* args[] = { &entryPointCUDAData , &uniformCUDAData };
@@ -683,10 +696,10 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
auto cudaLaunchResult = cuLaunchKernel(kernel,
dispatchSize[0], dispatchSize[1], dispatchSize[2],
int(numThreadsPerAxis[0]), int(numThreadsPerAxis[1]), int(numThreadsPerAxis[2]), // Threads per block
- 0, // Shared memory size
- cudaStream, // Stream. 0 is no stream.
- args, // Args
- nullptr); // extra
+ 0, // Shared memory size
+ cudaStream, // Stream. 0 is no stream.
+ args, // Args
+ nullptr); // extra
SLANG_CUDA_RETURN_ON_FAIL(cudaLaunchResult);
@@ -707,11 +720,11 @@ static SlangResult _compute(CUcontext context, CUmodule module, const ShaderComp
if (entry.isOutput)
{
// Copy back to CPU memory
- void* cudaMem = CUDAResource::getCUDAData(value);
+ CUdeviceptr cudaMem = CUDAResource::getCUDAData(value);
if (value && value->m_data && cudaMem)
{
// Okay copy the data back...
- SLANG_CUDA_RETURN_ON_FAIL(cudaMemcpy(value->m_data, cudaMem, value->m_sizeInBytes, cudaMemcpyDeviceToHost));
+ SLANG_CUDA_RETURN_ON_FAIL(cuMemcpyDtoH(value->m_data, cudaMem, value->m_sizeInBytes));
}
}
}
diff --git a/tools/render-test/cuda/cuda-compute-util.h b/tools/render-test/cuda/cuda-compute-util.h
index 8965f5037..f1ca65502 100644
--- a/tools/render-test/cuda/cuda-compute-util.h
+++ b/tools/render-test/cuda/cuda-compute-util.h
@@ -8,22 +8,26 @@
namespace renderer_test {
+
struct CUDAComputeUtil
{
+ // Define here, so we don't need to include the cude header
+ typedef size_t CUdeviceptr;
+
/// NOTE! MUST match up to definitions in the CUDA prelude
struct ByteAddressBuffer
{
- void* data;
+ CUdeviceptr data;
size_t sizeInBytes;
};
struct StructuredBuffer
{
- void* data;
+ CUdeviceptr data;
size_t count;
};
struct Array
{
- void* data;
+ CUdeviceptr data;
size_t count;
};