Force Inline all the InterlockedAdd functions in stdlib (#3965)

This change forcibly inlines the InterlockedAdd functions when using a byte-address buffer.

The IR generated when using nonUniformResourceInst on a RWByteAddressBuffer, for example:

    buffer[NonUniformResourceIndex(uint(0))].InterlockedAdd(0, 1);

follows the sequence of a call into an index lookup that is wrapped by a nonUniformResourceIndex:

    %ld = nonUniformResourceIndex(0)
    Call RWStructBufferInterlockedAdd(%ld, 0, 1)

This prevents NonUniformResource decoration of the buffer, because the buffer is wrapped by the function call to InterlockedAdd, which further expands to:

    %gep = getElement(%buffer, 0)
    SpirvAsmInst(..., rwStructuredBufferGEP(%gep, 0), ...)

By force-inlining the atomic functions, the buffer / resource is made visible to the nonUniformResourceIndex inst, allowing the decoration.

Identified while debugging tests/spirv/coherent-2.slang.
author: sriramm-nv <85252063+sriramm-nv@users.noreply.github.com> 2024-04-16 23:59:41 -0700
committer: GitHub <noreply@github.com> 2024-04-16 23:59:41 -0700
commit: 4b3f554a58e4224806c31d66874fbe60f1f09332 (patch)
tree: ac19836249c875f2b8e3cdd74894a17d963ef1fd
parent: 67313584d6879d68db53ced3108c2370bed5e8c1 (diff)
2 files changed, 24 insertions, 1 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 3d712559d..9683da4a3 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2551,6 +2551,7 @@ ${{{{
 
     __cuda_sm_version(2.0)
     [__requiresNVAPI]
+    [ForceInline]
     void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue)
     {
         __target_switch
@@ -2569,6 +2570,7 @@ ${{{{
 
     // FP16x2
     [__requiresNVAPI]
+    [ForceInline]
     uint _NvInterlockedAddFp16x2(uint byteAddress, uint fp16x2Value)
     {
         __target_switch
@@ -2617,6 +2619,7 @@ ${{{{
     // Without returning original value
 
     [__requiresNVAPI]
+    [ForceInline]
     __cuda_sm_version(2.0)
     void InterlockedAddF32(uint byteAddress, float valueToAdd)
     {
@@ -2635,6 +2638,7 @@ ${{{{
     }
 
     // Int64 Add
+    [ForceInline]
     __cuda_sm_version(6.0)
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
     {
@@ -2657,12 +2661,14 @@ ${{{{
     __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<uint64_t>($1), $2)")
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd);
 
+    [ForceInline]
     __specialized_for_target(hlsl)
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
     {
         __atomicAdd(this, byteAddress, __asuint2(valueToAdd));
     }
 
+    [ForceInline]
     __specialized_for_target(glsl)
     __specialized_for_target(spirv)
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
@@ -3100,6 +3106,7 @@ ${{{{
 }}}}
 
     // Added operations:
+    [ForceInline]
     void InterlockedAdd(
         UINT dest,
         UINT value,
@@ -3116,6 +3123,7 @@ ${{{{
         }
     }
 
+    [ForceInline]
     void InterlockedAdd(
         UINT dest,
         UINT value)
@@ -5386,6 +5394,7 @@ void GroupMemoryBarrierWithGroupSync()
 
 // Atomics
 
+[ForceInline]
 __glsl_version(430)
 void InterlockedAdd(__ref  int dest,  int value)
 {
@@ -5402,6 +5411,7 @@ void InterlockedAdd(__ref  int dest,  int value)
     }
 }
 
+[ForceInline]
 __glsl_version(430)
 void InterlockedAdd(__ref uint dest, uint value)
 {
@@ -5424,6 +5434,7 @@ void InterlockedAdd(__ref uint dest, int value)
     InterlockedAdd(dest, (uint)value);
 }
 
+[ForceInline]
 __glsl_version(430)
 void InterlockedAdd(__ref  int dest,  int value, out  int original_value)
 {
@@ -5441,6 +5452,7 @@ void InterlockedAdd(__ref  int dest,  int value, out  int original_value)
     }
 }
 
+[ForceInline]
 __glsl_version(430)
 void InterlockedAdd(__ref uint dest, uint value, out uint original_value)
 {
@@ -14572,6 +14584,7 @@ __generic<Shape:__ITextureShape1D2D3D, let format : int>
 extension __TextureImpl<float, Shape, 0, 0, 0, $(kStdlibResourceAccessReadWrite), 0, 0, format>
 {
     [__requiresNVAPI]
+    [ForceInline]
     __glsl_extension(GL_EXT_shader_atomic_float)
     void InterlockedAddF32(vector<uint, Shape.dimensions> coord, float value, out float originalValue)
     {
diff --git a/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang b/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang
index ffa6d5b94..523c58984 100644
--- a/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang
+++ b/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang
@@ -1,6 +1,6 @@
 // atomic-float-byte-address-buffer-cross.slang
 
-//TEST:CROSS_COMPILE: -profile cs_6_5 -entry computeMain -target spirv-assembly
+//TEST:SIMPLE(filecheck=CHECK): -profile cs_6_5 -entry computeMain -target spirv-assembly
 // We can't do this test, because it relies on nvAPI
 //DISABLE_TEST:CROSS_COMPILE: -profile cs_6_5 -entry computeMain -target dxil
 
@@ -13,6 +13,16 @@ RWStructuredBuffer<float> anotherBuffer;
 [numthreads(16, 1, 1)]
 void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
 {
+    // CHECK-DAG: OpDecorate %[[V1:[a-zA-Z0-9_]+]] Binding 1
+    // CHECK-DAG: OpDecorate %[[V2:[a-zA-Z0-9_]+]] Binding 0
+    // CHECK-DAG: %[[P1:[a-zA-Z0-9_]+]] = OpTypePointer Uniform %float
+    // CHECK-DAG: %[[P2:[a-zA-Z0-9_]+]] = OpTypePointer Input %uint
+    // CHECK: OpAccessChain %[[P2]]
+    // CHECK: OpAccessChain %[[P1]] %[[V1]]
+    // CHECK: OpAccessChain %[[P1]] %[[V2]]
+    // CHECK: OpAtomicFAddEXT
+    // CHECK: OpAccessChain %[[P1]] %[[V2]]
+    // CHECK: OpAtomicFAddEXT
     uint tid = dispatchThreadID.x;
     int idx = int((tid & 3) ^ (tid >> 2));
author	sriramm-nv <85252063+sriramm-nv@users.noreply.github.com>	2024-04-16 23:59:41 -0700
committer	GitHub <noreply@github.com>	2024-04-16 23:59:41 -0700
commit	4b3f554a58e4224806c31d66874fbe60f1f09332 (patch)
tree	ac19836249c875f2b8e3cdd74894a17d963ef1fd
parent	67313584d6879d68db53ced3108c2370bed5e8c1 (diff)