diff options
| author | sriramm-nv <85252063+sriramm-nv@users.noreply.github.com> | 2024-04-16 23:59:41 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-04-16 23:59:41 -0700 |
| commit | 4b3f554a58e4224806c31d66874fbe60f1f09332 (patch) | |
| tree | ac19836249c875f2b8e3cdd74894a17d963ef1fd | |
| parent | 67313584d6879d68db53ced3108c2370bed5e8c1 (diff) | |
Force Inline all the InterlockedAdd functions in stdlib (#3965)
This change forcibly inlines the InterlockedAdd functions when using a byte-address buffer.
The IR generated when using a nonUniformResourceIndex inst on a RWByteAddressBuffer:
buffer[NonUniformResourceIndex(uint(0))].InterlockedAdd(0, 1);
follows the sequence of a call into an index lookup that is wrapped by a nonUniformResourceIndex:
%ld = nonUniformResourceIndex(0)
Call RWStructBufferInterlockedAdd(%ld, 0, 1)
This prevents NonUniformResource decoration of the buffer because it is wrapped by the function call to
InterlockedAdd, which further expands to:
%gep = getElement(%buffer, 0)
SpirvAsmInst(..., rwStructuredBufferGEP(%gep, 0), ...)
Force-inlining the atomic functions makes the buffer / resource visible to the nonUniformResourceIndex inst,
allowing the decoration.
Identified while debugging tests/spirv/coherent-2.slang
| -rw-r--r-- | source/slang/hlsl.meta.slang | 13 | ||||
| -rw-r--r-- | tests/slang-extension/atomic-float-byte-address-buffer-cross.slang | 12 |
2 files changed, 24 insertions, 1 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 3d712559d..9683da4a3 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2551,6 +2551,7 @@ ${{{{ __cuda_sm_version(2.0) [__requiresNVAPI] + [ForceInline] void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue) { __target_switch @@ -2569,6 +2570,7 @@ ${{{{ // FP16x2 [__requiresNVAPI] + [ForceInline] uint _NvInterlockedAddFp16x2(uint byteAddress, uint fp16x2Value) { __target_switch @@ -2617,6 +2619,7 @@ ${{{{ // Without returning original value [__requiresNVAPI] + [ForceInline] __cuda_sm_version(2.0) void InterlockedAddF32(uint byteAddress, float valueToAdd) { @@ -2635,6 +2638,7 @@ ${{{{ } // Int64 Add + [ForceInline] __cuda_sm_version(6.0) void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue) { @@ -2657,12 +2661,14 @@ ${{{{ __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<uint64_t>($1), $2)") void InterlockedAddI64(uint byteAddress, int64_t valueToAdd); + [ForceInline] __specialized_for_target(hlsl) void InterlockedAddI64(uint byteAddress, int64_t valueToAdd) { __atomicAdd(this, byteAddress, __asuint2(valueToAdd)); } + [ForceInline] __specialized_for_target(glsl) __specialized_for_target(spirv) void InterlockedAddI64(uint byteAddress, int64_t valueToAdd) @@ -3100,6 +3106,7 @@ ${{{{ }}}} // Added operations: + [ForceInline] void InterlockedAdd( UINT dest, UINT value, @@ -3116,6 +3123,7 @@ ${{{{ } } + [ForceInline] void InterlockedAdd( UINT dest, UINT value) @@ -5386,6 +5394,7 @@ void GroupMemoryBarrierWithGroupSync() // Atomics +[ForceInline] __glsl_version(430) void InterlockedAdd(__ref int dest, int value) { @@ -5402,6 +5411,7 @@ void InterlockedAdd(__ref int dest, int value) } } +[ForceInline] __glsl_version(430) void InterlockedAdd(__ref uint dest, uint value) { @@ -5424,6 +5434,7 @@ void InterlockedAdd(__ref uint dest, int value) InterlockedAdd(dest, (uint)value); } +[ForceInline] 
__glsl_version(430) void InterlockedAdd(__ref int dest, int value, out int original_value) { @@ -5441,6 +5452,7 @@ void InterlockedAdd(__ref int dest, int value, out int original_value) } } +[ForceInline] __glsl_version(430) void InterlockedAdd(__ref uint dest, uint value, out uint original_value) { @@ -14572,6 +14584,7 @@ __generic<Shape:__ITextureShape1D2D3D, let format : int> extension __TextureImpl<float, Shape, 0, 0, 0, $(kStdlibResourceAccessReadWrite), 0, 0, format> { [__requiresNVAPI] + [ForceInline] __glsl_extension(GL_EXT_shader_atomic_float) void InterlockedAddF32(vector<uint, Shape.dimensions> coord, float value, out float originalValue) { diff --git a/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang b/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang index ffa6d5b94..523c58984 100644 --- a/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang +++ b/tests/slang-extension/atomic-float-byte-address-buffer-cross.slang @@ -1,6 +1,6 @@ // atomic-float-byte-address-buffer-cross.slang -//TEST:CROSS_COMPILE: -profile cs_6_5 -entry computeMain -target spirv-assembly +//TEST:SIMPLE(filecheck=CHECK): -profile cs_6_5 -entry computeMain -target spirv-assembly // We can't do this test, because it relies on nvAPI //DISABLE_TEST:CROSS_COMPILE: -profile cs_6_5 -entry computeMain -target dxil @@ -13,6 +13,16 @@ RWStructuredBuffer<float> anotherBuffer; [numthreads(16, 1, 1)] void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) { + // CHECK-DAG: OpDecorate %[[V1:[a-zA-Z0-9_]+]] Binding 1 + // CHECK-DAG: OpDecorate %[[V2:[a-zA-Z0-9_]+]] Binding 0 + // CHECK-DAG: %[[P1:[a-zA-Z0-9_]+]] = OpTypePointer Uniform %float + // CHECK-DAG: %[[P2:[a-zA-Z0-9_]+]] = OpTypePointer Input %uint + // CHECK: OpAccessChain %[[P2]] + // CHECK: OpAccessChain %[[P1]] %[[V1]] + // CHECK: OpAccessChain %[[P1]] %[[V2]] + // CHECK: OpAtomicFAddEXT + // CHECK: OpAccessChain %[[P1]] %[[V2]] + // CHECK: OpAtomicFAddEXT uint tid = 
dispatchThreadID.x; int idx = int((tid & 3) ^ (tid >> 2)); |
