diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-08-26 14:38:24 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-08-26 11:38:24 -0700 |
| commit | 2dc1f89fb069decb93dbe950fed9665453303550 (patch) | |
| tree | e80b20d002952b4531a563c4790a1aa506f5df16 /source | |
| parent | b8702dfb6d0e41515fa0f9f899d86b7935dfc3fd (diff) | |
Added more Atomic support for int64 types on RWByteAddressBuffer (#1515)
* Support for more 64-bit atomics on RWByteAddressBuffer.
* Min/max 64-bit test.
* Disable the CUDA version of the min/max 64-bit test, because it produces the wrong output.
* Update target-compatibility.md with added 64 bit atomics.
Co-authored-by: Yong He <yonghe@outlook.com>
Diffstat (limited to 'source')
| -rw-r--r-- | source/slang/hlsl.meta.slang | 135 |
1 file changed, 131 insertions, 4 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 36fe9f216..3a574bc42 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -80,7 +80,7 @@ __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) int64_t __atomicAdd(__ref int64_t value, int64_t amount); -// CAS - Compare and swap +// Cas - Compare and swap // Helper for HLSL, using NVAPI @@ -92,6 +92,56 @@ __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue); +// Max + +__target_intrinsic(hlsl, "NvInterlockedMaxUint64($0, $1, $2)") +uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value); + +__target_intrinsic(glsl, "atomicMax($0, $1)") +__glsl_version(430) +__glsl_extension(GL_EXT_shader_atomic_int64) +uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value); + +// Min + +__target_intrinsic(hlsl, "NvInterlockedMinUint64($0, $1, $2)") +uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value); + +__target_intrinsic(glsl, "atomicMin($0, $1)") +__glsl_version(430) +__glsl_extension(GL_EXT_shader_atomic_int64) +uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value); + +// And + +__target_intrinsic(hlsl, "NvInterlockedAndUint64($0, $1, $2)") +uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value); + +__target_intrinsic(glsl, "atomicAnd($0, $1)") +__glsl_version(430) +__glsl_extension(GL_EXT_shader_atomic_int64) +uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value); + +// Or + +__target_intrinsic(hlsl, "NvInterlockedOrUint64($0, $1, $2)") +uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value); + +__target_intrinsic(glsl, "atomicOr($0, $1)") +__glsl_version(430) +__glsl_extension(GL_EXT_shader_atomic_int64) +uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value); + +// Xor + +__target_intrinsic(hlsl, "NvInterlockedXorUint64($0, $1, $2)") +uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, 
uint2 value); + +__target_intrinsic(glsl, "atomicXor($0, $1)") +__glsl_version(430) +__glsl_extension(GL_EXT_shader_atomic_int64) +uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value); + // Conversion between uint64_t and uint2 uint2 __asuint2(uint64_t i) @@ -219,7 +269,7 @@ struct $(item.name) } ${{{{ if (item.op == kIROp_HLSLRWByteAddressBufferType) - { + { }}}} // float32 and int64 atomic support. This is a Slang specific extension, it uses @@ -230,7 +280,7 @@ ${{{{ // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float // https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html - // Fp32 Add + // F32 Add __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))") __cuda_sm_version(2.0) @@ -294,7 +344,7 @@ ${{{{ __atomicAdd(buf[byteAddress / 8], valueToAdd); } - // CAS UInt64 + // Cas uint64_t __target_intrinsic(cuda, "(*$4 = atomicCAS((uint64_t*)$0._getPtrAt($1), $2, $3))") void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue); @@ -312,6 +362,83 @@ ${{{{ outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); } + // Max + + __cuda_sm_version(3.5) + __target_intrinsic(cuda, "atomicMax((uint64_t*)$0._getPtrAt($1), $2)") + uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value); + + __specialized_for_target(hlsl) + uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMax(this, byteAddress, __asuint2(value))); } + + __specialized_for_target(glsl) + uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) + { + RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + return __atomicMax(buf[byteAddress / 8], value); + } + + // Min + + __cuda_sm_version(3.5) + __target_intrinsic(cuda, "atomicMin((uint64_t*)$0._getPtrAt($1), $2)") + uint64_t InterlockedMinU64(uint 
byteAddress, uint64_t value); + + __specialized_for_target(hlsl) + uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMin(this, byteAddress, __asuint2(value))); } + + __specialized_for_target(glsl) + uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) + { + RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + return __atomicMin(buf[byteAddress / 8], value); + } + + // And + + __target_intrinsic(cuda, "atomicAnd((uint64_t*)$0._getPtrAt($1), $2)") + uint64_t InterlockedAndU64(uint byteAddress, uint64_t value); + + __specialized_for_target(hlsl) + uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value))); } + + __specialized_for_target(glsl) + uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) + { + RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + return __atomicAnd(buf[byteAddress / 8], value); + } + + // Or + + __target_intrinsic(cuda, "atomicOr((uint64_t*)$0._getPtrAt($1), $2)") + uint64_t InterlockedOrU64(uint byteAddress, uint64_t value); + + __specialized_for_target(hlsl) + uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicOr(this, byteAddress, __asuint2(value))); } + + __specialized_for_target(glsl) + uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) + { + RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + return __atomicOr(buf[byteAddress / 8], value); + } + + // Xor + + __target_intrinsic(cuda, "atomicXor((uint64_t*)$0._getPtrAt($1), $2)") + uint64_t InterlockedXorU64(uint byteAddress, uint64_t value); + + __specialized_for_target(hlsl) + uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicXor(this, byteAddress, __asuint2(value))); } + + __specialized_for_target(glsl) + uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) + { + 
RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + return __atomicXor(buf[byteAddress / 8], value); + } + ${{{{ } }}}} |
