From 917416f6db7056cddff9d2a0e4e9b4117359157d Mon Sep 17 00:00:00 2001 From: Yong He Date: Thu, 30 Mar 2023 12:50:02 -0700 Subject: More builtin library support in torch backend. (#2760) Co-authored-by: Yong He --- source/slang/diff.meta.slang | 130 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 129 insertions(+), 1 deletion(-) (limited to 'source/slang/diff.meta.slang') diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang index 51cf1cdb7..252b6f5e9 100644 --- a/source/slang/diff.meta.slang +++ b/source/slang/diff.meta.slang @@ -34,6 +34,15 @@ struct TensorView [__readNone] Ptr data_ptr(); + __target_intrinsic(cuda, "$0.data_ptr_at<$G0>($1)") + [__readNone] + Ptr data_ptr_at(uint index); + + __generic + __target_intrinsic(cuda, "$0.data_ptr_at<$G0>($1)") + [__readNone] + Ptr data_ptr_at(vector index); + __implicit_conversion($(kConversionCost_ImplicitDereference)) __intrinsic_op($(kIROp_TorchTensorGetView)) __init(TorchTensor t); @@ -65,6 +74,13 @@ struct TensorView __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4, $5, $6)") void store(uint i0, uint i1, uint i2, uint i3, uint i4, T val); + __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)") + T InterlockedAdd(uint index, T val); + + __generic + __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)") + T InterlockedAdd(vector index, T val); + __target_intrinsic(cuda, "$0.dimensionCount") [__readNone] uint dims(); @@ -81,44 +97,139 @@ struct TensorView { [ForceInline] [__readNone] get { return load(index); } [ForceInline] set { store(index, newValue); } + + __target_intrinsic(cuda, "$0.load<$G0>($1)") + ref; } __subscript(uint i1, uint i2) -> T { [ForceInline] [__readNone] get { return load(i1, i2); } [ForceInline] set { store(i1, i2, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1, $2)") + ref; } __subscript(uint2 i) -> T { [ForceInline] [__readNone] get { return load(i.x, i.y); } [ForceInline] set { store(i.x, i.y, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1.x, $1.y)") + ref; } __subscript(uint i1, uint i2, uint i3) -> T { [ForceInline] [__readNone] get { return load(i1, i2, i3); } [ForceInline] set { store(i1, i2, i3, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3)") + ref; } __subscript(uint3 i) -> T { [ForceInline] [__readNone] get { return load(i.x, i.y, i.z); } [ForceInline] set { store(i.x, i.y, i.z, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1.x, $1.y, $1.z)") + ref; } __subscript(uint i1, uint i2, uint i3, uint i4) -> T { [ForceInline] [__readNone] get { return load(i1, i2, i3, i4); } [ForceInline] set { store(i1, i2, i3, i4, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3, $4)") + ref; } __subscript(uint4 i) -> T { [__readNone][ForceInline] get { return load(i.x, i.y, i.z, i.w); } [ForceInline] set { store(i.x, i.y, i.z, i.w, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1.x, $1.y, $1.z, $1.w)") + ref; } __subscript(uint i1, uint i2, uint i3, uint i4, uint i5) -> T { [ForceInline] [__readNone] get { return load(i1, i2, i3, i4, i5); } [ForceInline] set { store(i1, i2, i3, i4, i5, newValue); } + __target_intrinsic(cuda, "$0.load<$G0>($1, $2, $3, $4, $5)") + ref; } } +${{{{ +for (auto atomicIntegerTypeName : kCudaAtomicIntegerTypes) +{ +}}}} +extension TensorView<$(atomicIntegerTypeName)> +{ + typealias __Element = $(atomicIntegerTypeName); + __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedIncrement(uint index, __Element val); + + __generic + __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedIncrement(vector index, __Element val); + + __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedMin(uint index, __Element val); + + __generic + __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedMin(vector index, __Element val); + + __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedMax(uint index, __Element val); + + __generic + __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedMax(vector index, __Element val); + + __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedAnd(uint index, __Element val); + + __generic + __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedAnd(vector index, __Element val); + + __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedOr(uint index, __Element val); + + __generic + __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedOr(vector index, __Element val); + + __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedXor(uint index, __Element val); + + __generic + __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedXor(vector index, __Element val); + + __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedExchange(uint index, __Element val); + + __generic + __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)") + __Element InterlockedExchange(vector index, __Element val); + + __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)") + __Element InterlockedCompareExchange(uint index, __Element compare, __Element val); + + __generic + __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)") + __Element InterlockedCompareExchange(vector index, __Element compare, __Element val); +} + +${{{{ +} // end for atomicIntegerTypeName +}}}} + +extension TensorView +{ + __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)") + float InterlockedExchange(uint index, float val); + + __generic + __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)") + float InterlockedExchange(vector index, float val); +} + __generic __intrinsic_type($(kIROp_TorchTensorType)) struct TorchTensor @@ -162,9 +273,26 @@ struct TorchTensor static TorchTensor alloc(uint i0, uint i1, uint i2, uint i3, uint i4); __intrinsic_op($(kIROp_AllocateTorchTensor)) - static TorchTensor zerosLike(TorchTensor other); + static TorchTensor emptyLike(TorchTensor other); + + __target_intrinsic(cpp, "$0.zero_()") + void fillZero(); + + __target_intrinsic(cpp, "$0.fill_($1)") + void fillValue(T val); + + static TorchTensor zerosLike(TorchTensor other) + { + var result = emptyLike(other); + result.fillZero(); + return result; + } + } +__target_intrinsic(cpp, "AT_CUDA_CHECK(cudaStreamSynchronize(at::cuda::getCurrentCUDAStream()))") +void syncTorchCudaStream(); + __generic __intrinsic_op($(kIROp_MakeDifferentialPairUserCode)) DifferentialPair diffPair(T primal, T.Differential diff); -- cgit v1.2.3