5 files changed, 99 insertions, 5 deletions
diff --git a/docs/target-compatibility.md b/docs/target-compatibility.md
index 31ae06055..beba1062c 100644
--- a/docs/target-compatibility.md
+++ b/docs/target-compatibility.md
@@ -187,13 +187,15 @@ Currently feature allows atomic float additions on RWByteAddressBuffer. A future
 ```
 void RWByteAddressBuffer::InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue);
 void RWByteAddressBuffer::InterlockedAddFp32(uint byteAddress, float valueToAdd);
+
+void RWByteAddressBuffer::InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue);
+void RWByteAddressBuffer::InterlockedAddI64(uint byteAddress, int64_t valueToAdd);
 ```
 
 On HLSL based targets this functionality is achieved using [nvAPI](https://developer.nvidia.com/nvapi) based functionality. Therefore for the feature to work you must have nvAPI installed on your system. Then the 'prelude' functionality allows via the API for an include (or the text) of the relevent files. To see how to do this in practice look at the function `setSessionDefaultPrelude`. This makes the prelude for HLSL hold an include to the *absolute* path to the required include file `nvHLSLExtns.h`. As an absolute path is used, it means other includes that includes, look in the correct place without having to set up special include paths. 
 
 To use nvAPI it is nessary to specify a unordered access views (UAV) based 'u' register that will be used to communicate with nvAPI. Note! Slang does not do any special handling around this, it will be necessary for application code to ensure the UAV is either guarenteed to not collide with what Slang assigns, or it's specified (but not used) in the Slang source. The u register number has to be specified also to the nvAPI runtime library. 
 
-On Vulkan, the [`GL_EXT_shader_atomic_float`](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VK_EXT_shader_atomic_float.html) extension is required.
-
-
+On Vulkan, float atomics require the [`GL_EXT_shader_atomic_float`](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/man/html/VK_EXT_shader_atomic_float.html) extension, and int64 atomics require the [`GL_EXT_shader_atomic_int64`](https://raw.githubusercontent.com/KhronosGroup/GLSL/master/extensions/ext/GL_EXT_shader_atomic_int64.txt) extension.
 
+On CUDA, the int64 atomic add (`InterlockedAddI64`) requires SM 6.0 or higher.
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 329a73a33..46851269f 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -56,9 +56,12 @@ struct ByteAddressBuffer
 __target_intrinsic(glsl, "atomicAdd($0, $1)")
 __glsl_version(430)
 __glsl_extension(GL_EXT_shader_atomic_float)
-//__glsl_extension(GL_EXT_gpu_shader5)
 float __atomicAdd(__ref float value, float amount);
 
+// HLSL-only helper: expands to NvAPI's NvInterlockedAddUint64(buf, offset, value); the shader must include 'nvHLSLExtns.h'.
+__target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)")
+uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2); // 64-bit operand/result carried as uint2; callers use x = low word, y = high word
+
 // Int versions require glsl 4.30
 // https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml
 
@@ -70,6 +73,10 @@ __target_intrinsic(glsl, "atomicAdd($0, $1)")
 __glsl_version(430)
 uint __atomicAdd(__ref uint value, uint amount);
 
+__target_intrinsic(glsl, "atomicAdd($0, $1)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64) // extension enabled for the 64-bit integer overload
+int64_t __atomicAdd(__ref int64_t value, int64_t amount); // int64_t overload of the GLSL atomicAdd wrapper
 
 __intrinsic_op($(kIROp_ByteAddressBufferLoad))
 T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset);
@@ -192,6 +199,9 @@ ${{{{
     // NvAPI support on DX
     // NOTE! To use this feature on HLSL, the shader needs to include 'nvHLSLExtns.h' from the NvAPI SDK
     //
+
+    // Fp32 
+
     __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
     __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))")
     void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue);
@@ -203,6 +213,8 @@ ${{{{
         originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
     }
 
+    // Without returning original value
+
     __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))")
     __target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)")
     void InterlockedAddFp32(uint byteAddress, float valueToAdd);
@@ -214,6 +226,50 @@ ${{{{
         __atomicAdd(buf[byteAddress / 4], valueToAdd);
     }
 
+    // Int64 atomic add; originalValue receives the value returned by the target's atomic add (CUDA mapping below)
+    __cuda_sm_version(6.0)
+    __target_intrinsic(cuda, "(*$3 = atomicAdd((uint64_t*)$0._getPtrAt($1), $2))")
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue);
+
+    __specialized_for_target(hlsl)
+    void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd, out int64_t outOriginalValue)
+    {
+        uint2 valueToAdd; // 64-bit operand split into two 32-bit words for the uint2-based helper
+        valueToAdd.x = uint(inValueToAdd); // low 32 bits
+        valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); // high 32 bits, via an unsigned shift
+
+        const uint2 originalValue = __atomicAdd(this, byteAddress, valueToAdd);
+        outOriginalValue = (int64_t(originalValue.y) << 32) | originalValue.x; // reassemble: y is the high word, x is the low word
+    }
+
+    __specialized_for_target(glsl)
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
+    {
+        RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this); // view this buffer as int64_t elements
+        originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd); // byte offset -> index of the 8-byte element
+    }
+
+    // Int64 atomic add overload that does not return the original value
+    __cuda_sm_version(6.0)
+    __target_intrinsic(cuda, "atomicAdd((uint64_t*)$0._getPtrAt($1), $2)")
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd);
+
+    __specialized_for_target(hlsl)
+    void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd)
+    {
+        uint2 valueToAdd; // 64-bit operand split into two 32-bit words for the uint2-based helper
+        valueToAdd.x = uint(inValueToAdd); // low 32 bits
+        valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); // high 32 bits, via an unsigned shift
+        __atomicAdd(this, byteAddress, valueToAdd); // value returned by the helper is discarded
+    }
+
+    __specialized_for_target(glsl)
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
+    {
+        RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this); // view this buffer as int64_t elements
+        __atomicAdd(buf[byteAddress / 8], valueToAdd); // byte offset -> index of the 8-byte element; returned value is discarded
+    }
+
 ${{{{
     }
 }}}}
diff --git a/tests/slang-extension/atomic-float-byte-address-buffer.slang b/tests/slang-extension/atomic-float-byte-address-buffer.slang
index e6e2268ff..d89ee8bd1 100644
--- a/tests/slang-extension/atomic-float-byte-address-buffer.slang
+++ b/tests/slang-extension/atomic-float-byte-address-buffer.slang
@@ -7,7 +7,7 @@
 // Disabled because requires nvapi to work
 // Note for this feature we require dxc and we can force that with -use-dxil
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-d3d12 -compute -use-dxil
-//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -use-dxil
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
 
 //TEST_INPUT:ubuffer(data=[0.1 0.2 0.3 0.4]):out,name=outputBuffer
 RWByteAddressBuffer outputBuffer;
diff --git a/tests/slang-extension/atomic-int64-byte-address-buffer.slang b/tests/slang-extension/atomic-int64-byte-address-buffer.slang
new file mode 100644
index 000000000..22f21408c
--- /dev/null
+++ b/tests/slang-extension/atomic-int64-byte-address-buffer.slang
@@ -0,0 +1,28 @@
+// No atomic support on CPU
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute 
+// No support for int64_t on DX11
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
+// No support for int64_t on fxc - we need SM6.0 and dxil
+// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12
+// Disabled for now: this can only be tested when NVAPI is available, and it is not available by default.
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil 
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+
+//TEST_INPUT:ubuffer(data=[0 1 2 3 4 5 6 7]):out,name=outputBuffer
+RWByteAddressBuffer outputBuffer;
+
+[numthreads(16, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{    
+    uint tid = dispatchThreadID.x;
+    int idx = (tid & 3) ^ (tid >> 2); // stays within 0..3 while tid < 16
+
+    int64_t previousValue = 0;
+    outputBuffer.InterlockedAddI64((idx << 3), 1, previousValue); // add 1 to the 8-byte slot at byte offset idx * 8; previousValue is not read afterwards
+    
+    int anotherIdx = tid >> 2;
+    outputBuffer.InterlockedAddI64(anotherIdx << 3, 3); // add 3 via the overload without an original-value output
+}
+
diff --git a/tests/slang-extension/atomic-int64-byte-address-buffer.slang.expected.txt b/tests/slang-extension/atomic-int64-byte-address-buffer.slang.expected.txt
new file mode 100644
index 000000000..811dc1584
--- /dev/null
+++ b/tests/slang-extension/atomic-int64-byte-address-buffer.slang.expected.txt
@@ -0,0 +1,8 @@
+10
+1
+12
+3
+14
+5
+16
+7