From 4804753d4a2ec389cc6ecd759f7ea712848fddf0 Mon Sep 17 00:00:00 2001
From: jsmall-nvidia <jsmall@nvidia.com>
Date: Mon, 24 Aug 2020 15:23:40 -0400
Subject: RWByteAddressBuffer::InterlockedCompareExchangeU64 (#1513)

* First pass at incorporating nvapi into test harness.

* D3d12 Atomic Float Add via NVAPI working

* Dx12 atomic float appears to work.

* Atomic float add on Dx12.

* Added atomic64 feature addition to vk.
Fix the expected output for atomic-float-byte-address.slang

* Disable atomic float failing tests.

* Upgraded VK headers.

* Detect atomic float availability on VK.

* Try to get test working for int64 atomic.

* Made HLSL prelude controlled via the render-test requirements.

* Added -enable-nvapi to premake.

* Fix D3D12Renderer when NVAPI is not available.

* Small improvements to VKRenderer.

* Improve atomic documentation in target-compatibility.md.

* Fixed NVAPI working on D3D12.

* Test for specific NVAPI features.

* Remove requiredFeatures from Renderer::Desc as it was ignored. Tried to document more around nvapiExtnSlot.

* Re-added requiredFeatures to Renderer::Desc

* Improve comments in the tests.

* Rename Fp32 -> F32
Added cas-int64-byte-address-buffer.slang test

Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
---
 source/slang/hlsl.meta.slang | 77 ++++++++++++++++++++++++++++++++------------
 1 file changed, 57 insertions(+), 20 deletions(-)

(limited to 'source')
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 8ec5c2c67..36fe9f216 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -48,6 +48,7 @@ struct ByteAddressBuffer
     }
 };
 
+// AtomicAdd
 
 // Make the GLSL atomicAdd available.
 // We have separate int/float implementations, as the float version requires some specific extensions
@@ -58,10 +59,11 @@ __glsl_version(430)
 __glsl_extension(GL_EXT_shader_atomic_float)
 float __atomicAdd(__ref float value, float amount);
 
-// Helper for hlsl, using nvAPI
+// Helper for hlsl, using NVAPI
 __target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)")
 uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2);
 
+
 // Int versions require glsl 4.30
 // https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml
 
@@ -78,6 +80,32 @@ __glsl_version(430)
 __glsl_extension(GL_EXT_shader_atomic_int64)
 int64_t __atomicAdd(__ref int64_t value, int64_t amount);
 
+// CAS - Compare and swap
+
+// Helper for HLSL, using NVAPI
+// (64-bit operands and the returned original value are carried as uint2)
+__target_intrinsic(hlsl, "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)")
+uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value);
+
+__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64)
+uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue); // GLSL path: emitted as atomicCompSwap
+
+// Conversion between uint64_t and uint2
+
+uint2 __asuint2(uint64_t i)
+{
+    return uint2(uint(i), uint(uint64_t(i) >> 32)); // x = low 32 bits of i, y = high 32 bits of i
+}
+
+uint64_t __asuint64(uint2 i)
+{
+    return (uint64_t(i.y) << 32) | i.x; // i.y supplies the high 32 bits, i.x the low 32 bits
+}
+
+// Byte-address buffer intrinsics
+
 __intrinsic_op($(kIROp_ByteAddressBufferLoad))
 T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset);
 
@@ -202,16 +230,15 @@ ${{{{
     // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float
     // https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html
 
-    // Fp32 
+    // F32 Add
 
     __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
     __cuda_sm_version(2.0)
     __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))")
-    void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue);
+    void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue);
 
-    
     __specialized_for_target(glsl)
-    void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue)
+    void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue)
     {
         RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
         originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
@@ -222,29 +249,24 @@ ${{{{
     __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))")
     __cuda_sm_version(2.0)
     __target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)")
-    void InterlockedAddFp32(uint byteAddress, float valueToAdd);
+    void InterlockedAddF32(uint byteAddress, float valueToAdd);
 
     __specialized_for_target(glsl)
-    void InterlockedAddFp32(uint byteAddress, float valueToAdd)
+    void InterlockedAddF32(uint byteAddress, float valueToAdd)
     {
         RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
         __atomicAdd(buf[byteAddress / 4], valueToAdd);
     }
 
-    // Int64
+    // Int64 Add
     __cuda_sm_version(6.0)
     __target_intrinsic(cuda, "(*$3 = atomicAdd((uint64_t*)$0._getPtrAt($1), $2))")
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue);
 
     __specialized_for_target(hlsl)
-    void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd, out int64_t outOriginalValue)
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue)
     {
-        uint2 valueToAdd;
-        valueToAdd.x = uint(inValueToAdd);
-        valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); 
-
-        const uint2 originalValue = __atomicAdd(this, byteAddress, valueToAdd);
-        outOriginalValue = (int64_t(originalValue.y) << 32) | originalValue.x;
+        outOriginalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd)));
     }
 
     __specialized_for_target(glsl)
@@ -260,12 +282,9 @@ ${{{{
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd);
 
     __specialized_for_target(hlsl)
-    void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd)
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
     {
-        uint2 valueToAdd;
-        valueToAdd.x = uint(inValueToAdd);
-        valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); 
-        __atomicAdd(this, byteAddress, valueToAdd);
+        __atomicAdd(this, byteAddress, __asuint2(valueToAdd));
     }
 
     __specialized_for_target(glsl)
@@ -275,6 +294,24 @@ ${{{{
         __atomicAdd(buf[byteAddress / 8], valueToAdd);
     }
 
+    // CAS UInt64
+    // Compare-and-swap on the 64-bit value at byteAddress; the value read back is written to outOriginalValue.
+    __target_intrinsic(cuda, "(*$4 = atomicCAS((uint64_t*)$0._getPtrAt($1), $2, $3))")
+    void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue);
+
+    __specialized_for_target(hlsl) // routes through the NVAPI-based __cas helper, converting to/from uint2
+    void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
+    {
+        outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value)));
+    }
+
+    __specialized_for_target(glsl)
+    void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
+    {
+        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this); // same buffer, indexed as uint64_t elements
+        outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value); // byte offset -> 8-byte element index; assumes byteAddress is 8-byte aligned - TODO confirm
+    }
+
 ${{{{
     }
 }}}}
-- 
cgit v1.2.3