From 2fffbc5ff0727482c6ab7d66f6d852701adb277b Mon Sep 17 00:00:00 2001
From: jsmall-nvidia <jsmall@nvidia.com>
Date: Wed, 19 Aug 2020 14:15:36 -0400
Subject: Int64 atomic add RWByteAddressBuffer support (#1504)

* Fix premake5.lua so it uses the new path needed for OpenCLDebugInfo100.h

* Keep including the includes directory.

* Added the spirv-tools-generated files.

* We don't need to include the spirv/unified1 path because the files needed are actually in the spirv-tools-generated folder.

* Put the build_info.h glslang generated files in external/glslang-generated. Alter premake5.lua to pick up that header.

* First pass at documenting how to build glslang and spirv-tools.

* Improved glsl/spir-v tools README.md

* Added revision.h

* Change how gResources is calculated.
Update about revision.h

* Update docs a little.

* Split out spirv-tools into a separate project for building glslang. This was not necessary on linux, but *is* necessary on windows, because there is a file disassemble.cpp in spirv-tools and in glslang, and this leads to VS choosing only one. With the separate library, the problem is resolved.

* Fix direct-spirv-emit output.

* Update to latest version of spirv headers and spirv-tools.

* Upgrade submodule version of glslang in external.

* Add fPIC to build options of slang-spirv-tools

* WIP adding support for InterlockedAddFp32

* Upgrade slang-binaries to have new glslang.

* Fix issues with Windows slang-glslang binaries, via update of slang-binaries used.

* WIP - atomicAdd. This solution can't work as we can't do (float*) in glsl.

* WIP on atomic float ops.

* Added checking for multiple decls that takes into account __target_intrinsic and __specialized_for_target.
First pass impl of atomic add on float for glsl.

* Split __atomicAdd so extensions are applied appropriately.

* Made Dxc/Fxc support includes.
Use HLSL prelude to pass the path to nvapi
Added -nv-api-path

* Refactor around IncludeHandler and impl of IncludeSystem

* slang-include-handler -> slang-include-system
Have IncludeHandler/Impl defined in slang-preprocessor

* Small comment improvements.

* Document atomic float add addition in target-compatibility.md.

* CUDA float atomic support on RWByteAddressBuffer.

* Add atomic-float-byte-address-buffer-cross.slang

* Removed inappropriate-once.slang - the test is no longer valid when a file is loaded and has a unique identity by default. A test could be made, but would require an API call to create the file (so no unique id).
Improved handling of loadFile - uses uniqueId if has one.

* Work around for testing target overlaps - to avoid exceptions on adding targets.
Simplify PathInfo setup.
Modify single-target-intrinsic.slang - it no longer failed because there were no longer multiple definitions for the same target.

* Int64 atomic add RWByteAddressBuffer support.

* Fix typo in stdlib for int atomic ByteAddressBuffer.

* Small fixes to int64 atomic test.

Co-authored-by: Tim Foley <tfoleyNV@users.noreply.github.com>
---
 source/slang/hlsl.meta.slang | 58 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

(limited to 'source/slang')

diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 329a73a33..46851269f 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -56,9 +56,12 @@ struct ByteAddressBuffer
 __target_intrinsic(glsl, "atomicAdd($0, $1)")
 __glsl_version(430)
 __glsl_extension(GL_EXT_shader_atomic_float)
-//__glsl_extension(GL_EXT_gpu_shader5)
 float __atomicAdd(__ref float value, float amount);
 
+// HLSL-only helper that maps to NvInterlockedAddUint64 from NvAPI; the 64-bit operand and result are passed as uint2.
+__target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)")
+uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2);
+
 // Int versions require glsl 4.30
 // https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml
 
@@ -70,6 +73,10 @@ __target_intrinsic(glsl, "atomicAdd($0, $1)")
 __glsl_version(430)
 uint __atomicAdd(__ref uint value, uint amount);
 
+__target_intrinsic(glsl, "atomicAdd($0, $1)")
+__glsl_version(430)
+__glsl_extension(GL_EXT_shader_atomic_int64) // extension requested for the 64-bit integer overload
+int64_t __atomicAdd(__ref int64_t value, int64_t amount); // int64_t overload; only a GLSL intrinsic is given here
 
 __intrinsic_op($(kIROp_ByteAddressBufferLoad))
 T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset);
@@ -192,6 +199,9 @@ ${{{{
     // NvAPI support on DX
     // NOTE! To use this feature on HLSL, the shader needs to include 'nvHLSLExtns.h' from the NvAPI SDK
     //
+
+    // Fp32 atomic add
+
     __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
     __target_intrinsic(cuda, "(*$3 = atomicAdd((float*)$0._getPtrAt($1), $2))")
     void InterlockedAddFp32(uint byteAddress, float valueToAdd, out float originalValue);
@@ -203,6 +213,8 @@ ${{{{
         originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
     }
 
+    // Fp32 atomic add without returning the original value
+
     __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))")
     __target_intrinsic(cuda, "atomicAdd((float*)$0._getPtrAt($1), $2)")
     void InterlockedAddFp32(uint byteAddress, float valueToAdd);
@@ -214,6 +226,50 @@ ${{{{
         __atomicAdd(buf[byteAddress / 4], valueToAdd);
     }
 
+    // Int64 atomic add; the atomicAdd result is stored in originalValue. NOTE(review): CUDA atomicAdd has an unsigned long long overload - confirm uint64_t is that type in the CUDA prelude.
+    __cuda_sm_version(6.0)
+    __target_intrinsic(cuda, "(*$3 = atomicAdd((uint64_t*)$0._getPtrAt($1), $2))")
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue);
+
+    __specialized_for_target(hlsl)
+    void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd, out int64_t outOriginalValue)
+    {
+        uint2 valueToAdd;
+        valueToAdd.x = uint(inValueToAdd); // low 32 bits
+        valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); // high 32 bits; cast to uint64_t before shifting
+        // The uint2 helper carries the 64-bit value as two words: x = low, y = high.
+        const uint2 originalValue = __atomicAdd(this, byteAddress, valueToAdd);
+        outOriginalValue = (int64_t(originalValue.y) << 32) | originalValue.x; // reassemble the two words into one int64_t
+    }
+
+    __specialized_for_target(glsl)
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
+    {
+        RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this); // presumably an int64_t view of this buffer - TODO confirm
+        originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd); // byte offset / 8 = int64_t element index (rounds down if not a multiple of 8)
+    }
+
+    // Int64 atomic add overload without an output parameter; the atomicAdd result is not stored.
+    __cuda_sm_version(6.0)
+    __target_intrinsic(cuda, "atomicAdd((uint64_t*)$0._getPtrAt($1), $2)")
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd);
+
+    __specialized_for_target(hlsl)
+    void InterlockedAddI64(uint byteAddress, int64_t inValueToAdd)
+    {
+        uint2 valueToAdd;
+        valueToAdd.x = uint(inValueToAdd); // low 32 bits
+        valueToAdd.y = uint(uint64_t(inValueToAdd) >> 32); // high 32 bits; cast to uint64_t before shifting
+        __atomicAdd(this, byteAddress, valueToAdd); // the returned uint2 is not used by this overload
+    }
+
+    __specialized_for_target(glsl)
+    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
+    {
+        RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this); // presumably an int64_t view of this buffer - TODO confirm
+        __atomicAdd(buf[byteAddress / 8], valueToAdd); // byte offset / 8 = int64_t element index; the result is not used
+    }
+
 ${{{{
     }
 }}}}
-- 
cgit v1.2.3