From 4804753d4a2ec389cc6ecd759f7ea712848fddf0 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Mon, 24 Aug 2020 15:23:40 -0400 Subject: RWByteAddressBuffer::InterlockedCompareExchangeU64 (#1513) * First pass at incorporating nvapi into test harness. * D3d12 Atomic Float Add via NVAPI working * Dx12 atomic float appears to work. * Atomic float add on Dx12. * Added atomic64 feature addition to vk. Fix correct output for atomic-float-byte-address.slang * Disable atomic float failing tests. * Upgraded VK headers. * Detect atomic float availability on VK. * Try to get test working for in64 atomic. * Made HLSL prelude controlled via the render-test requirements. * Added -enable-nvapi to premake. * Fix D3D12Renderer when NVAPI is not available. * Small improvements to VKRenderer. * Improve atomic documentation in target-compatibility.md. * Fixed NVAPI working on D3D12. * Test for specific NVAPI features. * Remove requiredFeatures from Renderer::Desc as was ignored. Tried to document more around nvapiExtnSlot. * Readded requiredFeatures to Renderer::Desc * Improve comments in the tests. * Rename Fp32 -> F32 Added cas-int64-byte-address-buffer.slang test Co-authored-by: Tim Foley --- source/slang/hlsl.meta.slang | 77 ++++++++++++++++++++++++++++++++------------ 1 file changed, 57 insertions(+), 20 deletions(-) (limited to 'source') diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 8ec5c2c67..36fe9f216 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -48,6 +48,7 @@ struct ByteAddressBuffer } }; +// AtomicAdd // Make the GLSL atomicAdd available. // We have separate int/float implementations, as the float version requires some specific extensions @@ -58,10 +59,11 @@ __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_float) float __atomicAdd(__ref float value, float amount); -// Helper for hlsl, using nvAPI +// Helper for hlsl, using NVAPI __target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)") uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2); + // Int versions require glsl 4.30 // https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml @@ -78,6 +80,32 @@ __glsl_version(430) __glsl_extension(GL_EXT_shader_atomic_int64) int64_t __atomicAdd(__ref int64_t value, int64_t amount); +// CAS - Compare and swap + +// Helper for HLSL, using NVAPI + +__target_intrinsic(hlsl, "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)") +uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value); + +__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)") +__glsl_version(430) +__glsl_extension(GL_EXT_shader_atomic_int64) +uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue); + +// Conversion between uint64_t and uint2 + +uint2 __asuint2(uint64_t i) +{ + return uint2(uint(i), uint(uint64_t(i) >> 32)); +} + +uint64_t __asuint64(uint2 i) +{ + return (uint64_t(i.y) << 32) | i.x; +} + +// + __intrinsic_op($(kIROp_ByteAddressBufferLoad)) T __byteAddressBufferLoad(ByteAddressBuffer buffer, int offset); @@ -202,16 +230,15 @@ ${{{{ // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float // https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html - // Fp32 + // Fp32 Add __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))") __cuda_sm_version(2.0) __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))") - void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue); + void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue); - __specialized_for_target(glsl) - void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue) + void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue) { RWStructuredBuffer buf = __getEquivalentStructuredBuffer(this); originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd); @@ -222,29 +249,24 @@ ${{{{ __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))") __cuda_sm_version(2.0) __target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)") - void InterlockedAddFp32(uint byteAddress, float valueToAdd); + void InterlockedAddF32(uint byteAddress, float valueToAdd); __specialized_for_target(glsl) - void InterlockedAddFp32(uint byteAddress, float valueToAdd) + void InterlockedAddF32(uint byteAddress, float valueToAdd) { RWStructuredBuffer buf = __getEquivalentStructuredBuffer(this); __atomicAdd(buf[byteAddress / 4], valueToAdd); } - // Int64 + // Int64 Add __cuda_sm_version(6.0) __target_intrinsic(cuda, "(*$3 = atomicAdd((uint64_t*)$0._getPtrAt($1), $2))") void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue); __specialized_for_target(hlsl) - void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd, out int64_t outOriginalValue) + void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue) { - uint2 valueToAdd; - valueToAdd.x = uint(inValueToAdd); - valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); - - const uint2 originalValue = __atomicAdd(this, byteAddress, valueToAdd); - outOriginalValue = (int64_t(originalValue.y) << 32) | originalValue.x; + outOriginalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd))); } __specialized_for_target(glsl) @@ -260,12 +282,9 @@ ${{{{ void InterlockedAddI64(uint byteAddress, int64_t valueToAdd); __specialized_for_target(hlsl) - void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd) + void InterlockedAddI64(uint byteAddress, int64_t valueToAdd) { - uint2 valueToAdd; - valueToAdd.x = uint(inValueToAdd); - valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); - __atomicAdd(this, byteAddress, valueToAdd); + __atomicAdd(this, byteAddress, __asuint2(valueToAdd)); } __specialized_for_target(glsl) @@ -275,6 +294,24 @@ ${{{{ __atomicAdd(buf[byteAddress / 8], valueToAdd); } + // CAS UInt64 + + __target_intrinsic(cuda, "(*$4 = atomicCAS((uint64_t*)$0._getPtrAt($1), $2, $3))") + void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue); + + __specialized_for_target(hlsl) + void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) + { + outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value))); + } + + __specialized_for_target(glsl) + void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue) + { + RWStructuredBuffer buf = __getEquivalentStructuredBuffer(this); + outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); + } + ${{{{ } }}}} -- cgit v1.2.3