diff options
| author | Yong He <yonghe@outlook.com> | 2023-03-30 12:50:02 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-03-30 12:50:02 -0700 |
| commit | 917416f6db7056cddff9d2a0e4e9b4117359157d (patch) | |
| tree | 9bd6aa89f235e4692cff83cdbe1ce4aae7ea861f /source | |
| parent | e3b701c9f56f4a2fb8c56a65b5c75b49ee72ca73 (diff) | |
More builtin library support in torch backend. (#2760)
Co-authored-by: Yong He <yhe@nvidia.com>
Diffstat (limited to 'source')
| -rw-r--r-- | source/slang/diff.meta.slang | 130 | ||||
| -rw-r--r-- | source/slang/slang-emit-c-like.cpp | 5 | ||||
| -rw-r--r-- | source/slang/slang-emit-c-like.h | 1 | ||||
| -rw-r--r-- | source/slang/slang-emit-torch.cpp | 95 | ||||
| -rw-r--r-- | source/slang/slang-emit-torch.h | 2 | ||||
| -rw-r--r-- | source/slang/slang-intrinsic-expand.cpp | 27 | ||||
| -rw-r--r-- | source/slang/slang-ir-check-differentiability.cpp | 2 | ||||
| -rw-r--r-- | source/slang/slang-ir-inst-defs.h | 3 | ||||
| -rw-r--r-- | source/slang/slang-ir-insts.h | 2 | ||||
| -rw-r--r-- | source/slang/slang-ir-pytorch-cpp-binding.cpp | 4 | ||||
| -rw-r--r-- | source/slang/slang-ir.cpp | 5 | ||||
| -rw-r--r-- | source/slang/slang-stdlib.cpp | 3 | ||||
| -rw-r--r-- | source/slang/slang.cpp | 1 |
13 files changed, 215 insertions, 65 deletions
diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang index 51cf1cdb7..252b6f5e9 100644 --- a/source/slang/diff.meta.slang +++ b/source/slang/diff.meta.slang @@ -34,6 +34,15 @@ struct TensorView [__readNone] Ptr<T> data_ptr(); + __target_intrinsic(cuda, "$0.data_ptr_at<$G0>($1)") + [__readNone] + Ptr<T> data_ptr_at(uint index); + + __generic<let N: int> + __target_intrinsic(cuda, "$0.data_ptr_at<$G0>($1)") + [__readNone] + Ptr<T> data_ptr_at(vector<uint, N> index); + __implicit_conversion($(kConversionCost_ImplicitDereference)) __intrinsic_op($(kIROp_TorchTensorGetView)) __init(TorchTensor<T> t); @@ -65,6 +74,13 @@ struct TensorView __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4, $5, $6)") void store(uint i0, uint i1, uint i2, uint i3, uint i4, T val); + __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)") + T InterlockedAdd(uint index, T val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)") + T InterlockedAdd(vector<uint, N> index, T val); + __target_intrinsic(cuda, "$0.dimensionCount") [__readNone] uint dims(); @@ -81,44 +97,139 @@ struct TensorView { [ForceInline] [__readNone] get { return load(index); } [ForceInline] set { store(index, newValue); } + + __target_intrinsic(cuda, "$0.load<$G0>($1)") + ref; } __subscript(uint i1, uint i2) -> T { [ForceInline] [__readNone] get { return load(i1, i2); } [ForceInline] set { store(i1, i2, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1, $2)") + ref; } __subscript(uint2 i) -> T { [ForceInline] [__readNone] get { return load(i.x, i.y); } [ForceInline] set { store(i.x, i.y, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1.x, $1.y)") + ref; } __subscript(uint i1, uint i2, uint i3) -> T { [ForceInline] [__readNone] get { return load(i1, i2, i3); } [ForceInline] set { store(i1, i2, i3, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3)") + ref; } __subscript(uint3 i) -> T { [ForceInline] [__readNone] get { return load(i.x, i.y, i.z); } [ForceInline] set { store(i.x, i.y, i.z, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1.x, $1.y, $1.z)") + ref; } __subscript(uint i1, uint i2, uint i3, uint i4) -> T { [ForceInline] [__readNone] get { return load(i1, i2, i3, i4); } [ForceInline] set { store(i1, i2, i3, i4, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3, $4)") + ref; } __subscript(uint4 i) -> T { [__readNone][ForceInline] get { return load(i.x, i.y, i.z, i.w); } [ForceInline] set { store(i.x, i.y, i.z, i.w, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1.x, $1.y, $1.z, $1.w)") + ref; } __subscript(uint i1, uint i2, uint i3, uint i4, uint i5) -> T { [ForceInline] [__readNone] get { return load(i1, i2, i3, i4, i5); } [ForceInline] set { store(i1, i2, i3, i4, i5, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3, $4, $5)") + ref; } } +${{{{ +for (auto atomicIntegerTypeName : kCudaAtomicIntegerTypes) +{ +}}}} +extension TensorView<$(atomicIntegerTypeName)> +{ + typealias __Element = $(atomicIntegerTypeName); + __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedIncrement(uint index, __Element val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedIncrement(vector<uint, N> index, __Element val); + + __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedMin(uint index, __Element val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedMin(vector<uint, N> index, __Element val); + + __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedMax<T>(uint index, __Element val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedMax(vector<uint, N> index, __Element val); + + __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedAnd<T>(uint index, __Element val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedAnd(vector<uint, N> index, __Element val); + + __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedOr<T>(uint index, __Element val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedOr(vector<uint, N> index, __Element val); + + __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedXor<T>(uint index, __Element val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedXor(vector<uint, N> index, __Element val); + + __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedExchange(uint index, __Element val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedExchange(vector<uint, N> index, __Element val); + + __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)") + __Element InterlockedCompareExchange(uint index, __Element compare, __Element val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)") + __Element InterlockedCompareExchange(vector<uint, N> index, __Element compare, __Element val); +} + +${{{{ +} // end for atomicIntegerTypeName +}}}} + +extension TensorView<float> +{ + __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)") + float InterlockedExchange(uint index, float val); + + __generic<let N:int> + __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)") + float InterlockedExchange(vector<uint, N> index, float val); +} + __generic<T> __intrinsic_type($(kIROp_TorchTensorType)) struct TorchTensor @@ -162,9 +273,26 @@ struct TorchTensor static TorchTensor<T> alloc(uint i0, uint i1, uint i2, uint i3, uint i4); __intrinsic_op($(kIROp_AllocateTorchTensor)) - static TorchTensor<T> zerosLike(TorchTensor<T> other); + static TorchTensor<T> emptyLike(TorchTensor<T> other); + + __target_intrinsic(cpp, "$0.zero_()") + void fillZero(); + + __target_intrinsic(cpp, "$0.fill_($1)") + void fillValue(T val); + + static TorchTensor<T> zerosLike(TorchTensor<T> other) + { + var result = emptyLike(other); + result.fillZero(); + return result; + } + } +__target_intrinsic(cpp, "AT_CUDA_CHECK(cudaStreamSynchronize(at::cuda::getCurrentCUDAStream()))") +void syncTorchCudaStream(); + __generic<T: IDifferentiable> __intrinsic_op($(kIROp_MakeDifferentialPairUserCode)) DifferentialPair<T> diffPair(T primal, T.Differential diff); diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp index 08ba050db..166c131d5 100644 --- a/source/slang/slang-emit-c-like.cpp +++ b/source/slang/slang-emit-c-like.cpp @@ -2405,6 +2405,11 @@ void CLikeSourceEmitter::_emitInst(IRInst* inst) m_writer->emit(";\n"); break; + // Insts that needs to be emitted as code blocks. + case kIROp_CudaKernelLaunch: + emitInstStmtImpl(inst); + break; + case kIROp_LiveRangeStart: case kIROp_LiveRangeEnd: emitLiveness(inst); diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h index 8046fa633..bfe3dcc4e 100644 --- a/source/slang/slang-emit-c-like.h +++ b/source/slang/slang-emit-c-like.h @@ -530,6 +530,7 @@ public: virtual bool tryEmitGlobalParamImpl(IRGlobalParam* varDecl, IRType* varType) { SLANG_UNUSED(varDecl); SLANG_UNUSED(varType); return false; } virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) { SLANG_UNUSED(inst); SLANG_UNUSED(inOuterPrec); return false; } + virtual void emitInstStmtImpl(IRInst* inst) { SLANG_UNUSED(inst); } virtual void emitPostKeywordTypeAttributesImpl(IRInst* inst) { SLANG_UNUSED(inst); } diff --git a/source/slang/slang-emit-torch.cpp b/source/slang/slang-emit-torch.cpp index 276164ed5..c198b011d 100644 --- a/source/slang/slang-emit-torch.cpp +++ b/source/slang/slang-emit-torch.cpp @@ -65,6 +65,49 @@ void emitTorchScalarTypeName(SourceWriter* m_writer, IRInst* type) } } +void TorchCppSourceEmitter::emitInstStmtImpl(IRInst* inst) +{ + switch (inst->getOp()) + { + default: + return; + case kIROp_CudaKernelLaunch: + { + m_writer->emit("AT_CUDA_CHECK(cudaLaunchKernel("); + // func + m_writer->emit("(const void*)("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + m_writer->emit("), "); + + // gridDim + m_writer->emit("slang_bit_cast<dim3>("); + emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); + m_writer->emit("), "); + + // blockDim + m_writer->emit("slang_bit_cast<dim3>("); + emitOperand(inst->getOperand(2), getInfo(EmitOp::General)); + m_writer->emit("), "); + + // args + emitOperand(inst->getOperand(3), getInfo(EmitOp::General)); + m_writer->emit(", "); + + // shared mem + m_writer->emit("slangGetCudaKernelSharedMemSize((const void*)("); + emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); + m_writer->emit(")), "); + + // stream + m_writer->emit("((cudaStream_t)"); + emitOperand(inst->getOperand(4), getInfo(EmitOp::General)); + m_writer->emit(")));\n"); + + break; + } + } +} + bool TorchCppSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) { switch (inst->getOp()) @@ -78,47 +121,12 @@ bool TorchCppSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& m_writer->emit("make_tensor_view("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); m_writer->emit(", "); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit(", "); - emitStringLiteral(getUnmangledName(inst->getOperand(1))); + emitStringLiteral(getUnmangledName(inst->getOperand(0))); m_writer->emit(", "); - emitTorchScalarTypeName(m_writer, inst->getOperand(1)->getDataType()); + emitTorchScalarTypeName(m_writer, inst->getOperand(0)->getDataType()); m_writer->emit(")"); return true; } - case kIROp_CudaKernelLaunch: - { - m_writer->emit("cudaLaunchKernel("); - // func - m_writer->emit("(const void*)("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit("), "); - - // gridDim - m_writer->emit("slang_bit_cast<dim3>("); - emitOperand(inst->getOperand(1), getInfo(EmitOp::General)); - m_writer->emit("), "); - - // blockDim - m_writer->emit("slang_bit_cast<dim3>("); - emitOperand(inst->getOperand(2), getInfo(EmitOp::General)); - m_writer->emit("), "); - - // args - emitOperand(inst->getOperand(3), getInfo(EmitOp::General)); - m_writer->emit(", "); - - // shared mem - m_writer->emit("slangGetCudaKernelSharedMemSize((const void*)("); - emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(")), "); - - // stream - m_writer->emit("((cudaStream_t)"); - emitOperand(inst->getOperand(4), getInfo(EmitOp::General)); - m_writer->emit("))"); - return true; - } case kIROp_TorchGetCudaStream: { m_writer->emit("at::cuda::getCurrentCUDAStream()"); @@ -131,12 +139,14 @@ bool TorchCppSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& /* Emit something like: ``` - torch::Tensor out = torch::zeros_like(other); + torch::Tensor out = torch::empty_like(other); ``` */ - m_writer->emit("torch::zeros_like("); + m_writer->emit("torch::empty_like("); emitOperand(inst->getOperand(0), getInfo(EmitOp::General)); - m_writer->emit(")"); + m_writer->emit(", torch::TensorOptions().device(torch::kCUDA).dtype("); + emitTorchScalarTypeName(m_writer, inst->getDataType()); + m_writer->emit("))"); } else { @@ -180,11 +190,6 @@ SlangResult TorchCppSourceEmitter::calcTypeName(IRType* type, CodeGenTarget targ out << "torch::Tensor"; return SLANG_OK; } - case kIROp_TorchKernelMemoryAllocatorType: - { - out << "CudaTaskMemoryAllocator"; - return SLANG_OK; - } } } diff --git a/source/slang/slang-emit-torch.h b/source/slang/slang-emit-torch.h index 84ce42331..aeb9058a4 100644 --- a/source/slang/slang-emit-torch.h +++ b/source/slang/slang-emit-torch.h @@ -19,6 +19,8 @@ public: protected: // CPPSourceEmitter overrides + virtual void emitInstStmtImpl(IRInst* inst) override; + virtual bool tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOuterPrec) override; virtual SlangResult calcTypeName(IRType* type, CodeGenTarget target, StringBuilder& out) override; virtual void emitModuleImpl(IRModule* module, DiagnosticSink* sink) override; diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp index de3396efb..7a4744d59 100644 --- a/source/slang/slang-intrinsic-expand.cpp +++ b/source/slang/slang-intrinsic-expand.cpp @@ -268,17 +268,28 @@ const char* IntrinsicExpandContext::_emitSpecial(const char* cursor) break; case 'T': - // Get the the 'element' type for the type of the param at the index + // Get the 'element' or `return` type for the type of the param at the index { - SLANG_RELEASE_ASSERT(*cursor >= '0' && *cursor <= '9'); - Index argIndex = (*cursor++) - '0' + m_argIndexOffset; - SLANG_RELEASE_ASSERT(m_argCount > argIndex); - - IRType* type = m_args[argIndex].get()->getDataType(); - if (auto baseTextureType = as<IRTextureType>(type)) + IRType* type = nullptr; + if (*cursor == 'R') { - type = baseTextureType->getElementType(); + // Get the return type of the call + cursor++; + type = m_callInst->getDataType(); + } + else + { + SLANG_RELEASE_ASSERT(*cursor >= '0' && *cursor <= '9'); + Index argIndex = (*cursor++) - '0' + m_argIndexOffset; + SLANG_RELEASE_ASSERT(m_argCount > argIndex); + + type = m_args[argIndex].get()->getDataType(); + if (auto baseTextureType = as<IRTextureType>(type)) + { + type = baseTextureType->getElementType(); + } } + SLANG_RELEASE_ASSERT(type); m_emitter->emitType(type); } break; diff --git a/source/slang/slang-ir-check-differentiability.cpp b/source/slang/slang-ir-check-differentiability.cpp index c4b09d9e8..355381559 100644 --- a/source/slang/slang-ir-check-differentiability.cpp +++ b/source/slang/slang-ir-check-differentiability.cpp @@ -332,8 +332,6 @@ public: { if (expectDiffInstWorkListSet.Add(inst)) { - if (inst->getFullType() && inst->getFullType()->getOp() == kIROp_IntType) - printf("break"); expectDiffInstWorkList.add(inst); } } diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h index e58094b15..8d06c6970 100644 --- a/source/slang/slang-ir-inst-defs.h +++ b/source/slang/slang-ir-inst-defs.h @@ -69,7 +69,6 @@ INST(Nop, nop, 0, 0) INST(TensorViewType, TensorView, 1, HOISTABLE) INST(TorchTensorType, TorchTensor, 0, HOISTABLE) - INST(TorchKernelMemoryAllocatorType, TorchMemAllocatorType, 0, HOISTABLE) INST(ArrayListType, ArrayListVector, 1, HOISTABLE) /* BindExistentialsTypeBase */ @@ -614,7 +613,7 @@ INST(GetOptiXSbtDataPtr, getOptiXSbtDataPointer, 0, 0) INST(MakeArrayList, makeArrayList, 0, 0) INST(MakeTensorView, makeTensorView, 0, 0) -INST(AllocateTorchTensor, allocTorchTensor , 0, 0) +INST(AllocateTorchTensor, allocTorchTensor, 0, 0) INST(TorchGetCudaStream, TorchGetCudaStream, 0, 0) INST(TorchTensorGetView, TorchTensorGetView, 0, 0) diff --git a/source/slang/slang-ir-insts.h b/source/slang/slang-ir-insts.h index f5b03eb45..26d00d4df 100644 --- a/source/slang/slang-ir-insts.h +++ b/source/slang/slang-ir-insts.h @@ -3129,7 +3129,7 @@ public: return emitMakeStruct(type, args.getCount(), args.getBuffer()); } - IRInst* emitMakeTensorView(IRType* type, IRInst* allocator, IRInst* val); + IRInst* emitMakeTensorView(IRType* type, IRInst* val); IRInst* emitMakeExistential( IRType* type, diff --git a/source/slang/slang-ir-pytorch-cpp-binding.cpp b/source/slang/slang-ir-pytorch-cpp-binding.cpp index 971e87a6f..d59d57474 100644 --- a/source/slang/slang-ir-pytorch-cpp-binding.cpp +++ b/source/slang/slang-ir-pytorch-cpp-binding.cpp @@ -263,8 +263,6 @@ static void generateCppBindingForFunc(IRFunc* func, DiagnosticSink* sink) oldParam->removeAndDeallocate(); } - auto allocator = builder.emitVar(builder.getType(kIROp_TorchKernelMemoryAllocatorType)); - for (auto block : func->getBlocks()) { for (auto inst : block->getChildren()) @@ -297,7 +295,7 @@ static void generateCppBindingForFunc(IRFunc* func, DiagnosticSink* sink) else if (auto getView = as<IRTorchTensorGetView>(inst)) { builder.setInsertBefore(getView); - auto makeView = builder.emitMakeTensorView(getView->getFullType(), allocator, inst->getOperand(0)); + auto makeView = builder.emitMakeTensorView(getView->getFullType(), inst->getOperand(0)); getView->replaceUsesWith(makeView); instsToRemove.add(getView); } diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp index d03096483..6ca05d2d6 100644 --- a/source/slang/slang-ir.cpp +++ b/source/slang/slang-ir.cpp @@ -3951,10 +3951,9 @@ namespace Slang return emitIntrinsicInst(type, kIROp_MakeStruct, argCount, args); } - IRInst* IRBuilder::emitMakeTensorView(IRType* type, IRInst* allocator, IRInst* val) + IRInst* IRBuilder::emitMakeTensorView(IRType* type, IRInst* val) { - IRInst* args[2] = { allocator, val }; - return emitIntrinsicInst(type, kIROp_MakeTensorView, 2, args); + return emitIntrinsicInst(type, kIROp_MakeTensorView, 1, &val); } IRInst* IRBuilder::emitMakeExistential( diff --git a/source/slang/slang-stdlib.cpp b/source/slang/slang-stdlib.cpp index 17fb3d28b..9d4a079c8 100644 --- a/source/slang/slang-stdlib.cpp +++ b/source/slang/slang-stdlib.cpp @@ -244,6 +244,9 @@ namespace Slang {kIROp_Leq, "leq", "<=", "__BuiltinArithmeticType", ARITHMETIC_MASK | BOOL_RESULT}, }; + // Integer types that can be used in atomic operations in CUDA. + static const char* kCudaAtomicIntegerTypes[] = { "int", "uint", "uint64_t", "int64_t" }; + // Both the following functions use these macros. // NOTE! They require a variable named path to emit the #line correctly if in source file. #define SLANG_RAW(TEXT) sb << TEXT; diff --git a/source/slang/slang.cpp b/source/slang/slang.cpp index 79f9e56ba..b831859a5 100644 --- a/source/slang/slang.cpp +++ b/source/slang/slang.cpp @@ -1493,6 +1493,7 @@ CapabilitySet TargetRequest::getTargetCaps() break; case CodeGenTarget::CPPSource: + case CodeGenTarget::PyTorchCppBinding: case CodeGenTarget::HostExecutable: case CodeGenTarget::ShaderSharedLibrary: case CodeGenTarget::HostHostCallable: |
