Feature/cuda coverage (#1223)

* Add cubemap support. * Add CUDA fence intrinsics. * Added Gather for CUDA. * Use the CUDA driver API as much as possible. * Support 1D texture on CPU * WIP on 1D texture on CUDA * Added simplified texture test * Fix test. * Improve texture-simple tests. Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
author: jsmall-nvidia <jsmall@nvidia.com> 2020-02-14 15:06:35 -0500
committer: GitHub <noreply@github.com> 2020-02-14 15:06:35 -0500
commit: 2c097545eaa324a91a035327abad2e8b4fa60469 (patch)
tree: 95fd3890f2bfb0184ddbc7f1008de30698651473 /source
parent: dfd3d263704445b6dcebea54dc47193897548822 (diff)
6 files changed, 132 insertions, 69 deletions
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 6efb383fa..450cc4512 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -897,19 +897,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 if( baseShape != TextureFlavor::Shape::ShapeCube )
                 {
                     sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0";
-                    if (kBaseTextureTypes[tt].coordCount == 1)
-                    {
-                        sb << ", $2";
-                    }
-                    else
+                    for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
                     {
-                        for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+                        sb << ", ($2)";
+                        if (kBaseTextureTypes[tt].coordCount > 1)
                         {
-                            sb << ", ($2)." << char(i + 'x');
+                            sb << '.' << char(i + 'x');
                         }
                     }
                     sb << ")\")\n";
                 }
+                else
+                {
+                    sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
+                }
 
                 sb << "T Sample(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n";
@@ -1028,7 +1029,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n";
                 }
 
-
+                // TODO(JS): Not clear how to map this to CUDA, because in HLSL the gradient is a vector sized by
+                // the texture dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy.
+                // We could assume 0 for elements not supplied and ignore z. For now SampleGrad has no CUDA mapping.
                 sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n";
                 sb << "T SampleGrad(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
@@ -1053,23 +1056,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 // CUDA
                 if (!isArray)
                 {
-                    sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
-                    for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+                    if( baseShape != TextureFlavor::Shape::ShapeCube )
                     {
-                        sb << ", $2";
-                        if (kBaseTextureTypes[tt].coordCount > 1)
+                        sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
+                        for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
                         {
-                            sb << '.' << char(i + 'x');
+                            sb << ", ($2)";
+                            if (kBaseTextureTypes[tt].coordCount > 1)
+                            {
+                                sb << '.' << char(i + 'x');
+                            }
                         }
+                        sb << ", $3)\")\n";
+                    }
+                    else // Cube maps: forward the explicit level ($3) through CUDA's LOD-taking fetch, texCubemapLod.
+                    {
+                        sb << "__target_intrinsic(cuda, \"texCubemapLod<$T0>($0, ($2).x, ($2).y, ($2).z, $3)\")\n";
                     }
-                    sb << ", $3)\")\n";
                 }
 
                 sb << "T SampleLevel(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
                 sb << "float level);\n";
 
-                
                 if( baseShape != TextureFlavor::Shape::ShapeCube )
                 {
                     sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n";
@@ -1145,6 +1154,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     EMIT_LINE_DIRECTIVE();
 
                     sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n";
+                    if (kBaseTextureTypes[tt].coordCount == 2)
+                    {
+                        // Gather only works on 2D in CUDA
+                        // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4."
+                        sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n";
+                    }
                     sb << outputType << " Gather" << componentName << "(SamplerState s, ";
                     sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n";
 
diff --git a/source/slang/core.meta.slang.h b/source/slang/core.meta.slang.h
index 5f185ca8f..cca8f2e51 100644
--- a/source/slang/core.meta.slang.h
+++ b/source/slang/core.meta.slang.h
@@ -918,19 +918,20 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 if( baseShape != TextureFlavor::Shape::ShapeCube )
                 {
                     sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "D<$T0>($0";
-                    if (kBaseTextureTypes[tt].coordCount == 1)
-                    {
-                        sb << ", $2";
-                    }
-                    else
+                    for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
                     {
-                        for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+                        sb << ", ($2)";
+                        if (kBaseTextureTypes[tt].coordCount > 1)
                         {
-                            sb << ", ($2)." << char(i + 'x');
+                            sb << '.' << char(i + 'x');
                         }
                     }
                     sb << ")\")\n";
                 }
+                else
+                {
+                    sb << "__target_intrinsic(cuda, \"texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)\")\n";
+                }
 
                 sb << "T Sample(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location);\n";
@@ -1049,7 +1050,9 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     sb << "constexpr int" << kBaseTextureTypes[tt].coordCount << " offset);\n";
                 }
 
-
+                // TODO(JS): Not clear how to map this to CUDA, because in HLSL the gradient is a vector sized by
+                // the texture dimension. On CUDA there is texNDGrad, but it always just takes ddx, ddy.
+                // We could assume 0 for elements not supplied and ignore z. For now SampleGrad has no CUDA mapping.
                 sb << "__target_intrinsic(glsl, \"$ctextureGrad($p, $2, $3, $4)$z\")\n";
                 sb << "T SampleGrad(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
@@ -1074,23 +1077,29 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                 // CUDA
                 if (!isArray)
                 {
-                    sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
-                    for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
+                    if( baseShape != TextureFlavor::Shape::ShapeCube )
                     {
-                        sb << ", $2";
-                        if (kBaseTextureTypes[tt].coordCount > 1)
+                        sb << "__target_intrinsic(cuda, \"tex" << kBaseTextureTypes[tt].coordCount << "DLod<$T0>($0";
+                        for (int i = 0; i < kBaseTextureTypes[tt].coordCount; ++i)
                         {
-                            sb << '.' << char(i + 'x');
+                            sb << ", ($2)";
+                            if (kBaseTextureTypes[tt].coordCount > 1)
+                            {
+                                sb << '.' << char(i + 'x');
+                            }
                         }
+                        sb << ", $3)\")\n";
+                    }
+                    else // Cube maps: forward the explicit level ($3) through CUDA's LOD-taking fetch, texCubemapLod.
+                    {
+                        sb << "__target_intrinsic(cuda, \"texCubemapLod<$T0>($0, ($2).x, ($2).y, ($2).z, $3)\")\n";
                     }
-                    sb << ", $3)\")\n";
                 }
 
                 sb << "T SampleLevel(SamplerState s, ";
                 sb << "float" << kBaseTextureTypes[tt].coordCount + isArray << " location, ";
                 sb << "float level);\n";
 
-                
                 if( baseShape != TextureFlavor::Shape::ShapeCube )
                 {
                     sb << "__target_intrinsic(glsl, \"$ctextureLodOffset($p, $2, $3, $4)$z\")\n";
@@ -1166,6 +1175,12 @@ for (int tt = 0; tt < kBaseTextureTypeCount; ++tt)
                     EMIT_LINE_DIRECTIVE();
 
                     sb << "__target_intrinsic(glsl, \"textureGather($p, $2, " << componentIndex << ")\")\n";
+                    if (kBaseTextureTypes[tt].coordCount == 2)
+                    {
+                        // Gather only works on 2D in CUDA
+                        // "It is based on the base type of DataType except when readMode is equal to cudaReadModeNormalizedFloat (see Texture Reference API), in which case it is always float4."
+                        sb << "__target_intrinsic(cuda, \"tex2Dgather<$T0>($0, ($2).x, ($2).y, " << componentIndex << ")\")\n";
+                    }
                     sb << outputType << " Gather" << componentName << "(SamplerState s, ";
                     sb << "float" << kBaseTextureTypes[tt].coordCount << " location);\n";
 
@@ -1299,7 +1314,7 @@ for (auto op : binaryOps)
         sb << "__intrinsic_op(" << int(op.opCode) << ") matrix<" << resultType << ",N,M> operator" << op.opName << "(" << leftQual << "matrix<" << leftType << ",N,M> left, " << rightType << " right);\n";
     }
 }
-SLANG_RAW("#line 1281 \"core.meta.slang\"")
+SLANG_RAW("#line 1296 \"core.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("// Specialized function\n")
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index f8ae340bc..f7707cc6d 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -347,8 +347,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> bool all(matrix<T,N,M> x)
 
 // Barrier for writes to all memory spaces (HLSL SM 5.0)
 __target_intrinsic(glsl, "memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()")
-// TODO(JS): Doesn't seem to be weaker form of sync, so use this?
-__target_intrinsic(cuda, "__syncthreads()")
+__target_intrinsic(cuda, "__threadfence()")
 void AllMemoryBarrier();
 
 // Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)
@@ -648,6 +647,7 @@ __generic<T : __BuiltinFloatingPointType, let N : int> T determinant(matrix<T,N,
 
 // Barrier for device memory
 __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()")
+__target_intrinsic(cuda, "__threadfence()")
 void DeviceMemoryBarrier();
 
 __target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()")
@@ -814,6 +814,7 @@ float2 GetRenderTargetSamplePosition(int Index);
 
 // Group memory barrier
 __target_intrinsic(glsl, "groupMemoryBarrier")
+__target_intrinsic(cuda, "__threadfence_block")
 void GroupMemoryBarrier();
 
 
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index 215d18670..c0b875df5 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -396,8 +396,7 @@ SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> bool all(matri
 SLANG_RAW("\n")
 SLANG_RAW("// Barrier for writes to all memory spaces (HLSL SM 5.0)\n")
 SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n")
-SLANG_RAW("// TODO(JS): Doesn't seem to be weaker form of sync, so use this?\n")
-SLANG_RAW("__target_intrinsic(cuda, \"__syncthreads()\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n")
 SLANG_RAW("void AllMemoryBarrier();\n")
 SLANG_RAW("\n")
 SLANG_RAW("// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)\n")
@@ -724,6 +723,7 @@ SLANG_RAW("__generic<T : __BuiltinFloatingPointType, let N : int> T determinant(
 SLANG_RAW("\n")
 SLANG_RAW("// Barrier for device memory\n")
 SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__threadfence()\")\n")
 SLANG_RAW("void DeviceMemoryBarrier();\n")
 SLANG_RAW("\n")
 SLANG_RAW("__target_intrinsic(glsl, \"memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()\")\n")
@@ -890,6 +890,7 @@ SLANG_RAW("float2 GetRenderTargetSamplePosition(int Index);\n")
 SLANG_RAW("\n")
 SLANG_RAW("// Group memory barrier\n")
 SLANG_RAW("__target_intrinsic(glsl, \"groupMemoryBarrier\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__threadfence_block\")\n")
 SLANG_RAW("void GroupMemoryBarrier();\n")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
@@ -1641,7 +1642,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
 
     sb << "};\n";
 }
-SLANG_RAW("#line 1568 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1569 \"hlsl.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
diff --git a/source/slang/slang-ir-type-set.cpp b/source/slang/slang-ir-type-set.cpp
index a4ebf8242..e5271698c 100644
--- a/source/slang/slang-ir-type-set.cpp
+++ b/source/slang/slang-ir-type-set.cpp
@@ -115,50 +115,74 @@ IRInst* IRTypeSet::cloneInst(IRInst* inst)
             clone = m_builder.getStringValue(stringLit->getStringSlice());
             break;
         }
-        default:
+        case kIROp_VectorType:
         {
-            if (IRBasicType::isaImpl(inst->op))
+            auto vecType = static_cast<IRVectorType*>(inst);
+            const Index elementCount = Index(GetIntVal(vecType->getElementCount()));
+
+            if (elementCount <= 1)
             {
-                clone = m_builder.getType(inst->op);
+                clone = cloneType(vecType->getElementType());
             }
-            else
+            break;
+        }
+        case kIROp_MatrixType:
+        {
+            auto matType = static_cast<IRMatrixType*>(inst);
+            const Index columnCount = Index(GetIntVal(matType->getColumnCount()));
+            const Index rowCount = Index(GetIntVal(matType->getRowCount()));
+
+            if (columnCount <= 1 && rowCount <= 1)
             {
-                IRType* irType = dynamicCast<IRType>(inst);
-                if (irType)
-                {
-                    auto clonedType = cloneType(inst->getFullType());
-                    Index operandCount = Index(inst->getOperandCount());
+                clone = cloneType(matType->getElementType());
+            }
+            break;
+        }
+        default: break;
+    }
+
+    if (!clone)
+    {
+        if (IRBasicType::isaImpl(inst->op))
+        {
+            clone = m_builder.getType(inst->op);
+        }
+        else
+        {
+            IRType* irType = dynamicCast<IRType>(inst);
+            if (irType)
+            {
+                auto clonedType = cloneType(inst->getFullType());
+                Index operandCount = Index(inst->getOperandCount());
 
-                    List<IRInst*> cloneOperands;
-                    cloneOperands.setCount(operandCount);
+                List<IRInst*> cloneOperands;
+                cloneOperands.setCount(operandCount);
 
-                    for (Index i = 0; i < operandCount; ++i)
-                    {
-                        cloneOperands[i] = cloneInst(inst->getOperand(i));
-                    }
+                for (Index i = 0; i < operandCount; ++i)
+                {
+                    cloneOperands[i] = cloneInst(inst->getOperand(i));
+                }
 
-                    //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer());
+                //clone = m_irBuilder.findOrEmitHoistableInst(cloneType, inst->op, operandCount, cloneOperands.getBuffer());
 
-                    UInt operandCounts[1] = { UInt(operandCount) };
-                    IRInst*const* listOperands[1] = { cloneOperands.getBuffer() };
+                UInt operandCounts[1] = { UInt(operandCount) };
+                IRInst*const* listOperands[1] = { cloneOperands.getBuffer() };
 
-                    clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands);
-                }
-                else
+                clone = m_builder.findOrAddInst(clonedType, inst->op, 1, operandCounts, listOperands);
+            }
+            else
+            {
+                // This cloning style only works on insts that are not unique
+                auto clonedType = cloneType(inst->getFullType());
+
+                Index operandCount = Index(inst->getOperandCount());
+                clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr);
+                for (Index i = 0; i < operandCount; ++i)
                 {
-                    // This cloning style only works on insts that are not unique
-                    auto clonedType = cloneType(inst->getFullType());
-
-                    Index operandCount = Index(inst->getOperandCount());
-                    clone = m_builder.emitIntrinsicInst(clonedType, inst->op, operandCount, nullptr);
-                    for (Index i = 0; i < operandCount; ++i)
-                    {
-                        auto cloneOperand = cloneInst(inst->getOperand(i));
-                        clone->getOperands()[i].init(clone, cloneOperand);
-                    }
+                    auto cloneOperand = cloneInst(inst->getOperand(i));
+                    clone->getOperands()[i].init(clone, cloneOperand);
                 }
             }
-            break;
         }
     }
 
@@ -226,6 +250,10 @@ void IRTypeSet::getTypes(Kind kind, List<IRType*>& outTypes) const
 IRType* IRTypeSet::addVectorType(IRType* inElementType, int colsCount)
 {
     IRType* elementType = cloneType(inElementType);
+    if (colsCount == 1) // A one-element vector is represented by its (cloned) element type rather than a vector type.
+    {
+        return elementType;
+    }
     return m_builder.getVectorType(elementType, m_builder.getIntValue(m_builder.getIntType(), colsCount));
 }
 
diff --git a/source/slang/slang-ir-type-set.h b/source/slang/slang-ir-type-set.h
index 09abdf2ad..958d71cf1 100644
--- a/source/slang/slang-ir-type-set.h
+++ b/source/slang/slang-ir-type-set.h
@@ -34,7 +34,10 @@ works, but probably needs to be handled in a better way. The better way may invo
 enabled in other code generation and making de-duping possible in emit code.
 
 Note that one pro for this approach is that it does not alter the source module. That as it stands it's not necessary
-for the source module to be immutable, because it is created for emitting and then discarded. 
+for the source module to be immutable, because it is created for emitting and then discarded.
+
+NOTE: Vector<X, 1> and Matrix<X, 1, 1> are turned into the element type X.
+
  */
 class IRTypeSet
 {
author	jsmall-nvidia <jsmall@nvidia.com>	2020-02-14 15:06:35 -0500
committer	GitHub <noreply@github.com>	2020-02-14 15:06:35 -0500
commit	2c097545eaa324a91a035327abad2e8b4fa60469 (patch)
tree	95fd3890f2bfb0184ddbc7f1008de30698651473 /source
parent	dfd3d263704445b6dcebea54dc47193897548822 (diff)