diff options
| author | jsmall-nvidia <jsmall@nvidia.com> | 2020-08-24 15:23:40 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-08-24 15:23:40 -0400 |
| commit | 4804753d4a2ec389cc6ecd759f7ea712848fddf0 (patch) | |
| tree | 0ac88b3da2aae7842cb8f71f55b79412716e8a60 /source/slang | |
| parent | 67ca54997d445e15891965b8d77561b9d10bb18c (diff) | |
RWByteAddressBuffer::InterlockedCompareExchangeU64 (#1513)
* First pass at incorporating nvapi into test harness.
* D3d12 Atomic Float Add via NVAPI working
* Dx12 atomic float appears to work.
* Atomic float add on Dx12.
* Added atomic64 feature addition to vk.
Fix correct output for atomic-float-byte-address.slang
* Disable atomic float failing tests.
* Upgraded VK headers.
* Detect atomic float availability on VK.
* Try to get test working for int64 atomic.
* Made HLSL prelude controlled via the render-test requirements.
* Added -enable-nvapi to premake.
* Fix D3D12Renderer when NVAPI is not available.
* Small improvements to VKRenderer.
* Improve atomic documentation in target-compatibility.md.
* Fixed NVAPI working on D3D12.
* Test for specific NVAPI features.
* Remove requiredFeatures from Renderer::Desc as it was ignored. Tried to document more around nvapiExtnSlot.
* Re-added requiredFeatures to Renderer::Desc
* Improve comments in the tests.
* Rename Fp32 -> F32
Added cas-int64-byte-address-buffer.slang test
Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
Diffstat (limited to 'source/slang')
| -rw-r--r-- | source/slang/hlsl.meta.slang | 77 |
1 files changed, 57 insertions, 20 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 8ec5c2c67..36fe9f216 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -48,6 +48,7 @@ struct ByteAddressBuffer } }; +// AtomicAdd // Make the GLSL atomicAdd available. // We have separate int/float implementations, as the float version requires some specific extensions @@ -58,10 +59,11 @@ __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_float) float __atomicAdd(__ref float value, float amount); -// Helper for hlsl, using nvAPI +// Helper for hlsl, using NVAPI __target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)") uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2); + // Int versions require glsl 4.30 // https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml @@ -78,6 +80,32 @@ __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) int64_t __atomicAdd(__ref int64_t value, int64_t amount); +// CAS - Compare and swap + +// Helper for HLSL, using NVAPI + +__target_intrinsic(hlsl, "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)") +uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value); + +__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)") +__glsl_version(430) +__glsl_extension(GL_EXT_shader_atomic_int64) +uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue); + +// Conversion between uint64_t and uint2 + +uint2 __asuint2(uint64_t i) +{ + return uint2(uint(i), uint(uint64_t(i) >> 32)); +} + +uint64_t __asuint64(uint2 i) +{ + return (uint64_t(i.y) << 32) | i.x; +} + +// + __intrinsic_op($(kIROp_ByteAddressBufferLoad)) T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset); @@ -202,16 +230,15 @@ ${{{{ // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float // 
https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html - // Fp32 + // Fp32 Add __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))") __cuda_sm_version(2.0) __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))") - void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue); + void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue); - __specialized_for_target(glsl) - void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue) + void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue) { RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this); originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd); @@ -222,29 +249,24 @@ ${{{{ __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))") __cuda_sm_version(2.0) __target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)") - void InterlockedAddFp32(uint byteAddress, float valueToAdd); + void InterlockedAddF32(uint byteAddress, float valueToAdd); __specialized_for_target(glsl) - void InterlockedAddFp32(uint byteAddress, float valueToAdd) + void InterlockedAddF32(uint byteAddress, float valueToAdd) { RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this); __atomicAdd(buf[byteAddress / 4], valueToAdd); } - // Int64 + // Int64 Add __cuda_sm_version(6.0) __target_intrinsic(cuda, "(*$3 = atomicAdd((uint64_t*)$0._getPtrAt($1), $2))") void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue); __specialized_for_target(hlsl) - void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd, out int64_t outOriginalValue) + void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue) { - uint2 valueToAdd; - valueToAdd.x = uint(inValueToAdd); - valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); - - 
const uint2 originalValue = __atomicAdd(this, byteAddress, valueToAdd); - outOriginalValue = (int64_t(originalValue.y) << 32) | originalValue.x; + outOriginalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd))); } __specialized_for_target(glsl) @@ -260,12 +282,9 @@ ${{{{ void InterlockedAddI64(uint byteAddress, int64_t valueToAdd); __specialized_for_target(hlsl) - void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd) + void InterlockedAddI64(uint byteAddress, int64_t valueToAdd) { - uint2 valueToAdd; - valueToAdd.x = uint(inValueToAdd); - valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); - __atomicAdd(this, byteAddress, valueToAdd); + __atomicAdd(this, byteAddress, __asuint2(valueToAdd)); } __specialized_for_target(glsl) @@ -275,6 +294,24 @@ ${{{{ __atomicAdd(buf[byteAddress / 8], valueToAdd); } + // CAS UInt64 + + __target_intrinsic(cuda, "(*$4 = atomicCAS((uint64_t*)$0._getPtrAt($1), $2, $3))") + void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue); + + __specialized_for_target(hlsl) + void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) + { + outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value))); + } + + __specialized_for_target(glsl) + void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) + { + RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); + outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); + } + ${{{{ } }}}} |
