From efeda20ec280771348887ae4eb498a8b158c9c0c Mon Sep 17 00:00:00 2001
From: Yong He
Date: Thu, 30 Mar 2023 14:34:54 -0700
Subject: Fix stdlib definitions for tensor interlocked methods. (#2761)

Co-authored-by: Yong He
---
Review notes (this region is ignored when the patch is applied):
  * Every converted interlocked method now returns void and hands the previous
    value back through a trailing `out` parameter; the CUDA intrinsic string
    assigns the atomic's result to that parameter via `*($3) = ...`.
  * NOTE(review): the InterlockedCompareExchange overloads take no `out`
    parameter and their intrinsic string does not store the result of
    atomicCAS, so the previous value is dropped -- confirm this is intended.
  * NOTE(review): the float InterlockedExchange overloads in the last hunk
    still declare a `float` return type while using the `*($3) = ...` form;
    every other converted method returns void -- confirm which is intended.
  * NOTE(review): this copy appears to have lost angle-bracketed text (mail
    addresses, the argument lists after `__generic`, `vector`,
    `extension TensorView`, `data_ptr_at` and `slang_bit_cast`); verify
    against the repository before applying. Indentation inside the hunks was
    reconstructed and may differ from the original file.
  * The parameter names of the added InterlockedExchange(uint ...) overload
    were misspelled (`va`, `oldVall`); they are corrected to `val`, `oldVal`
    to match the sibling declarations.

 source/slang/diff.meta.slang | 105 ++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 52 deletions(-)

(limited to 'source')

diff --git a/source/slang/diff.meta.slang b/source/slang/diff.meta.slang
index 252b6f5e9..2bdaccee3 100644
--- a/source/slang/diff.meta.slang
+++ b/source/slang/diff.meta.slang
@@ -74,12 +74,12 @@ struct TensorView
     __target_intrinsic(cuda, "$0.store<$G0>($1, $2, $3, $4, $5, $6)")
     void store(uint i0, uint i1, uint i2, uint i3, uint i4, T val);
 
-    __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)")
-    T InterlockedAdd(uint index, T val);
+    __target_intrinsic(cuda, "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedAdd(uint index, T val, out T oldVal);
 
     __generic
-    __target_intrinsic(cuda, "atomicAdd($0.data_ptr_at<$TR>($1), $2)")
-    T InterlockedAdd(vector index, T val);
+    __target_intrinsic(cuda, "*($3) = atomicAdd($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedAdd(vector index, T val, out T oldVal);
 
     __target_intrinsic(cuda, "$0.dimensionCount")
     [__readNone]
@@ -159,61 +159,55 @@ for (auto atomicIntegerTypeName : kCudaAtomicIntegerTypes)
 extension TensorView<$(atomicIntegerTypeName)>
 {
     typealias __Element = $(atomicIntegerTypeName);
-    __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedIncrement(uint index, __Element val);
 
-    __generic
-    __target_intrinsic(cuda, "atomicInc($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedIncrement(vector index, __Element val);
+    __target_intrinsic(cuda, "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedMin(uint index, __Element val, out __Element oldVal);
 
-    __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedMin(uint index, __Element val);
-
-    __generic
-    __target_intrinsic(cuda, "atomicMin($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedMin(vector index, __Element val);
+    __generic
+    __target_intrinsic(cuda, "*($3) = atomicMin($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedMin(vector index, __Element val, out __Element oldVal);
 
-    __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedMax(uint index, __Element val);
-
-    __generic
-    __target_intrinsic(cuda, "atomicMax($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedMax(vector index, __Element val);
+    __target_intrinsic(cuda, "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedMax(uint index, __Element val, out __Element oldVal);
 
-    __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedAnd(uint index, __Element val);
-
-    __generic
-    __target_intrinsic(cuda, "atomicAnd($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedAnd(vector index, __Element val);
+    __generic
+    __target_intrinsic(cuda, "*($3) = atomicMax($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedMax(vector index, __Element val, out __Element oldVal);
 
-    __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedOr(uint index, __Element val);
-
-    __generic
-    __target_intrinsic(cuda, "atomicOr($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedOr(vector index, __Element val);
+    __target_intrinsic(cuda, "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedAnd(uint index, __Element val, out __Element oldVal);
 
-    __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedXor(uint index, __Element val);
-
-    __generic
-    __target_intrinsic(cuda, "atomicXor($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedXor(vector index, __Element val);
-
-    __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedExchange(uint index, __Element val);
+    __generic
+    __target_intrinsic(cuda, "*($3) = atomicAnd($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedAnd(vector index, __Element val, out __Element oldVal);
+
+    __target_intrinsic(cuda, "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedOr(uint index, __Element val, out __Element oldVal);
+
+    __generic
+    __target_intrinsic(cuda, "*($3) = atomicOr($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedOr(vector index, __Element val, out __Element oldVal);
+
+    __target_intrinsic(cuda, "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedXor(uint index, __Element val, out __Element oldVal);
+
+    __generic
+    __target_intrinsic(cuda, "*($3) = atomicXor($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedXor(vector index, __Element val, out __Element oldVal);
+
+    __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedExchange(uint index, __Element val, out __Element oldVal);
 
     __generic
-    __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$TR>($1), $2)")
-    __Element InterlockedExchange(vector index, __Element val);
+    __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at<$T2>($1), $2)")
+    void InterlockedExchange(vector index, __Element val, out __Element oldVal);
 
-    __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)")
-    __Element InterlockedCompareExchange(uint index, __Element compare, __Element val);
+    __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)")
+    void InterlockedCompareExchange(uint index, __Element compare, __Element val);
 
     __generic
-    __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$TR>($1), $2, $3)")
-    __Element InterlockedCompareExchange(vector index, __Element compare, __Element val);
+    __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at<$T2>($1), $2, $3)")
+    void InterlockedCompareExchange(vector index, __Element compare, __Element val);
 }
 
 ${{{{
@@ -222,12 +216,19 @@ ${{{{
 
 extension TensorView
 {
-    __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)")
-    float InterlockedExchange(uint index, float val);
+    __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at($1), $2)")
+    float InterlockedExchange(uint index, float val, out float oldVal);
 
     __generic
-    __target_intrinsic(cuda, "atomicExch($0.data_ptr_at<$G0>($1), $2)")
-    float InterlockedExchange(vector index, float val);
+    __target_intrinsic(cuda, "*($3) = atomicExch($0.data_ptr_at($1), $2)")
+    float InterlockedExchange(vector index, float val, out float oldVal);
+
+    __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at($1), slang_bit_cast($2), slang_bit_cast($3))")
+    void InterlockedCompareExchange(uint index, float compare, float val);
+
+    __generic
+    __target_intrinsic(cuda, "atomicCAS($0.data_ptr_at($1), slang_bit_cast($2), slang_bit_cast($3))")
+    void InterlockedCompareExchange(vector index, float compare, float val);
 }
 
 __generic
-- 
cgit v1.2.3