Added more Atomic support for int64 types on RWByteAddressBuffer (#1515)

* Support for more 64-bit atomics on ByteAddressBuffer.
* Add min/max 64-bit test.
* Disable the CUDA version of the min/max 64-bit test, as it produces the wrong output.
* Update target-compatibility.md with the added 64-bit atomics.

Co-authored-by: Yong He <yonghe@outlook.com>
author: jsmall-nvidia <jsmall@nvidia.com> 2020-08-26 14:38:24 -0400
committer: GitHub <noreply@github.com> 2020-08-26 11:38:24 -0700
commit: 2dc1f89fb069decb93dbe950fed9665453303550 (patch)
tree: e80b20d002952b4531a563c4790a1aa506f5df16 /source
parent: b8702dfb6d0e41515fa0f9f899d86b7935dfc3fd (diff)
1 files changed, 131 insertions, 4 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 36fe9f216..3a574bc42 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -80,7 +80,7 @@ __glsl_version(430)
 __glsl_extension(GL_EXT_shader_atomic_int64)
 int64_t __atomicAdd(__ref int64_t value, int64_t amount);
 
-// CAS - Compare and swap
+// Cas - Compare and swap
 
 // Helper for HLSL, using NVAPI
 
@@ -92,6 +92,56 @@ __glsl_version(430)
 __glsl_extension(GL_EXT_shader_atomic_int64)
 uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue);
 
+// Max
+
+__target_intrinsic(hlsl, "NvInterlockedMaxUint64($0, $1, $2)")
+uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value);
+
+__target_intrinsic(glsl, "atomicMax($0, $1)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64)
+uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value);
+
+// Min
+
+__target_intrinsic(hlsl, "NvInterlockedMinUint64($0, $1, $2)")
+uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value);
+
+__target_intrinsic(glsl, "atomicMin($0, $1)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64)
+uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value);
+
+// And
+
+__target_intrinsic(hlsl, "NvInterlockedAndUint64($0, $1, $2)")
+uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value);
+
+__target_intrinsic(glsl, "atomicAnd($0, $1)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64)
+uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value);
+
+// Or
+
+__target_intrinsic(hlsl, "NvInterlockedOrUint64($0, $1, $2)")
+uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value);
+
+__target_intrinsic(glsl, "atomicOr($0, $1)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64)
+uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value);
+
+// Xor
+
+__target_intrinsic(hlsl, "NvInterlockedXorUint64($0, $1, $2)")
+uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, uint2 value);
+
+__target_intrinsic(glsl, "atomicXor($0, $1)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64)
+uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value);
+
 // Conversion between uint64_t and uint2
 
 uint2 __asuint2(uint64_t i)
@@ -219,7 +269,7 @@ struct $(item.name)
     }
 ${{{{
     if (item.op == kIROp_HLSLRWByteAddressBufferType)
-    {
+    {  
 }}}}
 
     // float32 and int64 atomic support. This is a Slang specific extension, it uses
@@ -230,7 +280,7 @@ ${{{{
     // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float
     // https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html
 
-    // Fp32 Add
+    // F32 Add
 
     __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
     __cuda_sm_version(2.0)
@@ -294,7 +344,7 @@ ${{{{
         __atomicAdd(buf[byteAddress / 8], valueToAdd);
     }
 
-    // CAS UInt64
+    // Cas uint64_t
 
     __target_intrinsic(cuda, "(*$4 = atomicCAS((uint64_t*)$0._getPtrAt($1), $2, $3))")
     void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue);
@@ -312,6 +362,83 @@ ${{{{
         outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
     }
 
+    // Max
+
+    __cuda_sm_version(3.5)
+    __target_intrinsic(cuda, "atomicMax((uint64_t*)$0._getPtrAt($1), $2)")
+    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value);
+
+    __specialized_for_target(hlsl)
+    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMax(this, byteAddress, __asuint2(value))); }
+
+    __specialized_for_target(glsl)
+    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value)
+    {
+        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+        return __atomicMax(buf[byteAddress / 8], value);
+    }
+
+    // Min
+    
+    __cuda_sm_version(3.5)
+    __target_intrinsic(cuda, "atomicMin((uint64_t*)$0._getPtrAt($1), $2)")
+    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value);
+
+    __specialized_for_target(hlsl)
+    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMin(this, byteAddress, __asuint2(value))); }
+
+    __specialized_for_target(glsl)
+    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value)
+    {
+        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+        return __atomicMin(buf[byteAddress / 8], value);
+    }
+
+    // And
+
+    __target_intrinsic(cuda, "atomicAnd((uint64_t*)$0._getPtrAt($1), $2)")
+    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value);
+
+    __specialized_for_target(hlsl)
+    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value))); }
+
+    __specialized_for_target(glsl)
+    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value)
+    {
+        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+        return __atomicAnd(buf[byteAddress / 8], value);
+    }
+
+    // Or
+
+    __target_intrinsic(cuda, "atomicOr((uint64_t*)$0._getPtrAt($1), $2)")
+    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value);
+
+    __specialized_for_target(hlsl)
+    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicOr(this, byteAddress, __asuint2(value))); }
+
+    __specialized_for_target(glsl)
+    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value)
+    {
+        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+        return __atomicOr(buf[byteAddress / 8], value);
+    }
+
+    // Xor
+
+    __target_intrinsic(cuda, "atomicXor((uint64_t*)$0._getPtrAt($1), $2)")
+    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value);
+
+    __specialized_for_target(hlsl)
+    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicXor(this, byteAddress, __asuint2(value))); }
+
+    __specialized_for_target(glsl)
+    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value)
+    {
+        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+        return __atomicXor(buf[byteAddress / 8], value);
+    }
+
 ${{{{
     }
 }}}}
author	jsmall-nvidia <jsmall@nvidia.com>	2020-08-26 14:38:24 -0400
committer	GitHub <noreply@github.com>	2020-08-26 11:38:24 -0700
commit	2dc1f89fb069decb93dbe950fed9665453303550 (patch)
tree	e80b20d002952b4531a563c4790a1aa506f5df16 /source
parent	b8702dfb6d0e41515fa0f9f899d86b7935dfc3fd (diff)