4 files changed, 75 insertions, 3 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index a4f551644..9893effea 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -149,6 +149,17 @@ __glsl_version(430)
 __glsl_extension(GL_EXT_shader_atomic_int64)
 uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value);
 
+// Exchange
+
+__target_intrinsic(hlsl, "NvInterlockedExchangeUint64($0, $1, $2)")
+[__requiresNVAPI]
+uint2 __atomicExchange(RWByteAddressBuffer buf, uint offset, uint2 value);
+
+__target_intrinsic(glsl, "atomicExchange($0, $1)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64)
+uint64_t __atomicExchange(__ref uint64_t ioValue, uint64_t value);
+
 // Conversion between uint64_t and uint2
 
 uint2 __asuint2(uint64_t i)
@@ -280,9 +291,15 @@ ${{{{
 }}}}
 
     // float32 and int64 atomic support. This is a Slang specific extension, it uses
-    // GL_EXT_shader_atomic_float on vk
+    // GL_EXT_shader_atomic_float on Vulkan
     // NvAPI support on DX
-    // NOTE! To use this feature on HLSL, the shader needs to include 'nvHLSLExtns.h' from the NvAPI SDK
+    // NOTE! To use this feature on HLSL based targets the path to 'nvHLSLExtns.h' from the NvAPI SDK must
+    // be set. That this include will be added to the *output* that is passed to a downstram compiler.
+    // Also note that you *can* include NVAPI headers in your Slang source, and directly use NVAPI functions
+    // Directly using NVAPI functions does *not* add the #include on the output
+    // Finally note you can *mix* NVAPI direct calls, and use of NVAPI intrinsics below. This doesn't cause
+    // any clashes, as Slang will emit any NVAPI function it parsed (say via a include in Slang source) with
+    // unique functions.
     // 
     // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float
     // https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html
@@ -448,6 +465,21 @@ ${{{{
         return __atomicXor(buf[byteAddress / 8], value);
     }
 
+    // Exchange
+
+    __target_intrinsic(cuda, "atomicExch((uint64_t*)$0._getPtrAt($1), $2)")
+    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value);
+
+    __specialized_for_target(hlsl)
+    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicExchange(this, byteAddress, __asuint2(value))); }
+
+    __specialized_for_target(glsl)
+    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value)
+    {
+        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
+        return __atomicExchange(buf[byteAddress / 8], value);
+    }
+
 ${{{{
     }
 }}}}
diff --git a/tests/slang-extension/cas-int64-byte-address-buffer.slang b/tests/slang-extension/cas-int64-byte-address-buffer.slang
index b75a9fa04..451d97e36 100644
--- a/tests/slang-extension/cas-int64-byte-address-buffer.slang
+++ b/tests/slang-extension/cas-int64-byte-address-buffer.slang
@@ -24,7 +24,7 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     int idx = (tid & 3) ^ (tid >> 2); 
 
     // Try directly reading
-    uint2 currentValue2 = outputBuffer.Load2(idx << 8);
+    uint2 currentValue2 = outputBuffer.Load2(idx << 3);
     uint64_t currentValue = uint64_t(currentValue2.y) | currentValue2.x;    
     
     while (true)
diff --git a/tests/slang-extension/exchange-int64-byte-address-buffer.slang b/tests/slang-extension/exchange-int64-byte-address-buffer.slang
new file mode 100644
index 000000000..0145d3838
--- /dev/null
+++ b/tests/slang-extension/exchange-int64-byte-address-buffer.slang
@@ -0,0 +1,32 @@
+// No atomic support on CPU
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+// No support for int64_t on DX11
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+// No support for int64_t on fxc - we need SM6.0 and dxil
+// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -nvapi-slot u0
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -nvapi-slot u0 -compile-arg -O2
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -render-features atomic-int64
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+// The test doesn't directly use this, but having this defined makes the 0 slot available if NVAPI is going to be used
+// Only strictly necessary on the D3D12 path
+//TEST_INPUT:ubuffer(data=[0 0 0 0 ], stride=4):name=nvapiBuffer
+RWStructuredBuffer<int> nvapiBuffer;
+
+//TEST_INPUT:ubuffer(data=[0 1 2 3 4 5 6 7]):out,name=outputBuffer
+RWByteAddressBuffer outputBuffer;
+
+// With only 4 threads there is no contention - which makes for a simple test
+// but doesn't actually test for the exchange atomicity
+[numthreads(4, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{    
+    int idx = dispatchThreadID.x;
+    
+    // Try directly reading
+    uint2 currentValue2 = outputBuffer.Load2(idx << 3);
+    uint64_t currentValue = uint64_t(currentValue2.y) | currentValue2.x;    
+    
+    uint64_t readValue = outputBuffer.InterlockedExchangeU64(idx << 3, currentValue + 1);
+}
+\ No newline at end of file
diff --git a/tests/slang-extension/exchange-int64-byte-address-buffer.slang.expected.txt b/tests/slang-extension/exchange-int64-byte-address-buffer.slang.expected.txt
new file mode 100644
index 000000000..4346d293e
--- /dev/null
+++ b/tests/slang-extension/exchange-int64-byte-address-buffer.slang.expected.txt
@@ -0,0 +1,8 @@
+2
+0
+4
+0
+6
+0
+8
+0