diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-02-14 15:06:35 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-02-14 15:06:35 -0500 |
| commit | 2c097545eaa324a91a035327abad2e8b4fa60469 (patch) | |
| tree | 95fd3890f2bfb0184ddbc7f1008de30698651473 /source | |
| parent | dfd3d263704445b6dcebea54dc47193897548822 (diff) | |
Feature/cuda coverage (#1223)
* Add cubemap support.
* Add CUDA fence intrinsics.
* Added Gather for CUDA.
* Use the CUDA driver API as much as possible.
* Support 1D texture on CPU
* WIP on 1D texture on CUDA
* Added simplified texture test
* Fix test.
* Improve texture-simple tests.
Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
Diffstat (limited to 'source')
| -rw-r--r-- | source/slang/core.meta.slang | 45 | ||||
| -rw-r--r-- | source/slang/core.meta.slang.h | 47 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 5 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang.h | 7 | ||||
| -rw-r--r-- | source/slang/slang-ir-type-set.cpp | 92 | ||||
| -rw-r--r-- | source/slang/slang-ir-type-set.h | 5 |
6 files changed, 132 insertions, 69 deletions
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang index 6efb383fa..450cc4512 100644 --- a/source/slang/core.meta.slang +++ b/source/slang/core.meta.slang @@ -897,19 +897,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) if( baseShape != TextureFlavor::Shape::ShapeCube ) { sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0"; - if (kBaseTextureTypes[tt].coordCount == 1) - { - sb << ", $2"; - } - else + for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) { - for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) + sb << ", ($2)"; + if (kBaseTextureTypes[tt].coordCount > 1) { - sb << ", ($2)." << char(i + 'x'); + sb << '.' << char(i + 'x'); } } sb << ")\")\n"; } + else + { + sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n"; + } sb << "T Sample(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n"; @@ -1028,7 +1029,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n"; } - + // TODO(JS): Not clear how to map this to CUDA, because in HLSL, the gradient is a vector based on + // the dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy. + // I could just assume 0 for elements not supplied, and ignore z. 
For now will just leave sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n"; sb << "T SampleGrad(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, "; @@ -1053,23 +1056,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) // CUDA if (!isArray) { - sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0"; - for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) + if( baseShape != TextureFlavor::Shape::ShapeCube ) { - sb << ", $2"; - if (kBaseTextureTypes[tt].coordCount > 1) + sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0"; + for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) { - sb << '.' << char(i + 'x'); + sb << ", ($2)"; + if (kBaseTextureTypes[tt].coordCount > 1) + { + sb << '.' << char(i + 'x'); + } } + sb << ", $3)\")\n"; + } + else + { + sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n"; } - sb << ", $3)\")\n"; } sb << "T SampleLevel(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, "; sb << "float level);\n"; - if( baseShape != TextureFlavor::Shape::ShapeCube ) { sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n"; @@ -1145,6 +1154,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) EMIT_LINE_DIRECTIVE(); sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n"; + if (kBaseTextureTypes[tt].coordCount == 2) + { + // Gather only works on 2D in CUDA + // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4." 
+ sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n"; + } sb << outputType << " Gather" << componentName << "(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n"; diff --git a/source/slang/core.meta.slang.h b/source/slang/core.meta.slang.h index 5f185ca8f..cca8f2e51 100644 --- a/source/slang/core.meta.slang.h +++ b/source/slang/core.meta.slang.h @@ -918,19 +918,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) if( baseShape != TextureFlavor::Shape::ShapeCube ) { sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0"; - if (kBaseTextureTypes[tt].coordCount == 1) - { - sb << ", $2"; - } - else + for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) { - for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) + sb << ", ($2)"; + if (kBaseTextureTypes[tt].coordCount > 1) { - sb << ", ($2)." << char(i + 'x'); + sb << '.' << char(i + 'x'); } } sb << ")\")\n"; } + else + { + sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n"; + } sb << "T Sample(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n"; @@ -1049,7 +1050,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n"; } - + // TODO(JS): Not clear how to map this to CUDA, because in HLSL, the gradient is a vector based on + // the dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy. + // I could just assume 0 for elements not supplied, and ignore z. 
For now will just leave sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n"; sb << "T SampleGrad(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, "; @@ -1074,23 +1077,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) // CUDA if (!isArray) { - sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0"; - for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) + if( baseShape != TextureFlavor::Shape::ShapeCube ) { - sb << ", $2"; - if (kBaseTextureTypes[tt].coordCount > 1) + sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0"; + for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i) { - sb << '.' << char(i + 'x'); + sb << ", ($2)"; + if (kBaseTextureTypes[tt].coordCount > 1) + { + sb << '.' << char(i + 'x'); + } } + sb << ", $3)\")\n"; + } + else + { + sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n"; } - sb << ", $3)\")\n"; } sb << "T SampleLevel(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, "; sb << "float level);\n"; - if( baseShape != TextureFlavor::Shape::ShapeCube ) { sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n"; @@ -1166,6 +1175,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt) EMIT_LINE_DIRECTIVE(); sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n"; + if (kBaseTextureTypes[tt].coordCount == 2) + { + // Gather only works on 2D in CUDA + // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4." 
+ sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n"; + } sb << outputType << " Gather" << componentName << "(SamplerState s, "; sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n"; @@ -1299,7 +1314,7 @@ for (auto op : binaryOps) sb << "__intrinsic_op(" << int(op.opCode) << ") matrix<" << resultType << ",N,M> operator" << op.opName << "(" << leftQual << "matrix<" << leftType << ",N,M> left, " << rightType << " right);\n"; } } -SLANG_RAW("#line 1281 \"core.meta.slang\"") +SLANG_RAW("#line 1296 \"core.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("// Specialized function\n") diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index f8ae340bc..f7707cc6d 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -347,8 +347,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> bool all(matrix<T,N,M> x) // Barrier for writes to all memory spaces (HLSL SM 5.0) __target_intrinsic(glsl, "memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()") -// TODO(JS): Doesn't seem to be weaker form of sync, so use this? 
-__target_intrinsic(cuda, "__syncthreads()") +__target_intrinsic(cuda, "__threadfence()") void AllMemoryBarrier(); // Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0) @@ -648,6 +647,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int> T determinant(matrix<T,N, // Barrier for device memory __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()") +__target_intrinsic(cuda, "__threadfence()") void DeviceMemoryBarrier(); __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()") @@ -814,6 +814,7 @@ float2 GetRenderTargetSamplePosition(int Index); // Group memory barrier __target_intrinsic(glsl, "groupMemoryBarrier") +__target_intrinsic(cuda, "__threadfence_block") void GroupMemoryBarrier(); diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h index 215d18670..c0b875df5 100644 --- a/source/slang/hlsl.meta.slang.h +++ b/source/slang/hlsl.meta.slang.h @@ -396,8 +396,7 @@ SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> bool all(matri SLANG_RAW("\n") SLANG_RAW("// Barrier for writes to all memory spaces (HLSL SM 5.0)\n") SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n") -SLANG_RAW("// TODO(JS): Doesn't seem to be weaker form of sync, so use this?\n") -SLANG_RAW("__target_intrinsic(cuda, \"__syncthreads()\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n") SLANG_RAW("void AllMemoryBarrier();\n") SLANG_RAW("\n") SLANG_RAW("// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)\n") @@ -724,6 +723,7 @@ SLANG_RAW("__generic<T : __BuiltinFloatingPointType, let N : int> T determinant( SLANG_RAW("\n") SLANG_RAW("// Barrier for device memory\n") SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n") 
SLANG_RAW("void DeviceMemoryBarrier();\n") SLANG_RAW("\n") SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()\")\n") @@ -890,6 +890,7 @@ SLANG_RAW("float2 GetRenderTargetSamplePosition(int Index);\n") SLANG_RAW("\n") SLANG_RAW("// Group memory barrier\n") SLANG_RAW("__target_intrinsic(glsl, \"groupMemoryBarrier\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"__threadfence_block\")\n") SLANG_RAW("void GroupMemoryBarrier();\n") SLANG_RAW("\n") SLANG_RAW("\n") @@ -1641,7 +1642,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa) sb << "};\n"; } -SLANG_RAW("#line 1568 \"hlsl.meta.slang\"") +SLANG_RAW("#line 1569 \"hlsl.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("\n") diff --git a/source/slang/slang-ir-type-set.cpp b/source/slang/slang-ir-type-set.cpp index a4ebf8242..e5271698c 100644 --- a/source/slang/slang-ir-type-set.cpp +++ b/source/slang/slang-ir-type-set.cpp @@ -115,50 +115,74 @@ IRInst* IRTypeSet::cloneInst(IRInst* inst) clone = m_builder.getStringValue(stringLit->getStringSlice()); break; } - default: + case kIROp_VectorType: { - if (IRBasicType::isaImpl(inst->op)) + auto vecType = static_cast<IRVectorType*>(inst); + const Index elementCount = Index(GetIntVal(vecType->getElementCount())); + + if (elementCount <= 1) { - clone = m_builder.getType(inst->op); + clone = cloneType(vecType->getElementType()); } - else + break; + } + case kIROp_MatrixType: + { + auto matType = static_cast<IRMatrixType*>(inst); + const Index columnCount = Index(GetIntVal(matType->getColumnCount())); + const Index rowCount = Index(GetIntVal(matType->getRowCount())); + + if (columnCount <= 1 && rowCount <= 1) { - IRType* irType = dynamicCast<IRType>(inst); - if (irType) - { - auto clonedType = cloneType(inst->getFullType()); - Index operandCount = Index(inst->getOperandCount()); + clone = cloneType(matType->getElementType()); + } + break; + } + default: break; + } + + if (!clone) + { + if 
(IRBasicType::isaImpl(inst->op)) + { + clone = m_builder.getType(inst->op); + } + else + { + IRType* irType = dynamicCast<IRType>(inst); + if (irType) + { + auto clonedType = cloneType(inst->getFullType()); + Index operandCount = Index(inst->getOperandCount()); - List<IRInst*> cloneOperands; - cloneOperands.setCount(operandCount); + List<IRInst*> cloneOperands; + cloneOperands.setCount(operandCount); - for (Index i = 0; i < operandCount; ++i) - { - cloneOperands[i] = cloneInst(inst->getOperand(i)); - } + for (Index i = 0; i < operandCount; ++i) + { + cloneOperands[i] = cloneInst(inst->getOperand(i)); + } - //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer()); + //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer()); - UInt operandCounts[1] = { UInt(operandCount) }; - IRInst*const* listOperands[1] = { cloneOperands.getBuffer() }; + UInt operandCounts[1] = { UInt(operandCount) }; + IRInst*const* listOperands[1] = { cloneOperands.getBuffer() }; - clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands); - } - else + clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands); + } + else + { + // This cloning style only works on insts that are not unique + auto clonedType = cloneType(inst->getFullType()); + + Index operandCount = Index(inst->getOperandCount()); + clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr); + for (Index i = 0; i < operandCount; ++i) { - // This cloning style only works on insts that are not unique - auto clonedType = cloneType(inst->getFullType()); - - Index operandCount = Index(inst->getOperandCount()); - clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr); - for (Index i = 0; i < operandCount; ++i) - { - auto cloneOperand = cloneInst(inst->getOperand(i)); - clone->getOperands()[i].init(clone, cloneOperand); - } + auto 
cloneOperand = cloneInst(inst->getOperand(i)); + clone->getOperands()[i].init(clone, cloneOperand); } } - break; } } @@ -226,6 +250,10 @@ void IRTypeSet::getTypes(Kind kind, List<IRType*>& outTypes) const IRType* IRTypeSet::addVectorType(IRType* inElementType, int colsCount) { IRType* elementType = cloneType(inElementType); + if (colsCount == 1) + { + return elementType; + } return m_builder.getVectorType(elementType, m_builder.getIntValue(m_builder.getIntType(), colsCount)); } diff --git a/source/slang/slang-ir-type-set.h b/source/slang/slang-ir-type-set.h index 09abdf2ad..958d71cf1 100644 --- a/source/slang/slang-ir-type-set.h +++ b/source/slang/slang-ir-type-set.h @@ -34,7 +34,10 @@ works, but probably needs to be handled in a better way. The better way may invo enabled in other code generation and making de-duping possible in emit code. Note that one pro for this approach is that it does not alter the source module. That as it stands it's not necessary -for the source module to be immutable, because it is created for emitting and then discarded. +for the source module to be immutable, because it is created for emitting and then discarded. + +NOTE! That Vector<X, 1> or Matrix<X, 1, 1> will be turned into the type X. + */ class IRTypeSet { |
