diff options
| author | ArielG-NV <159081215+ArielG-NV@users.noreply.github.com> | 2024-07-19 02:05:33 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-07-18 23:05:33 -0700 |
| commit | a00d603519d395d41b2f68c5874e8a708335a31a (patch) | |
| tree | 114e3da71d3d95034e944edb0ffd1510f192418d /source | |
| parent | 59dd133f1c52fb0a7a388f4a8f42234f4556a28a (diff) | |
Metal: `Interlocked` (atomic) member function support for buffers (#4655)
* Metal: `Interlocked` (atomic) member function support for buffers
fixes: #4654
fixes: #4481
1. Add `Interlocked` (atomic) member function support for buffers to Metal
2. Fix `__getEquivalentStructuredBuffer` so it works with CPP/Metal targets
* add `CompareStore` support
* legalize RWByteAddressBuffer to fully replace StructuredBuffer
* destroy replaced byte-addr buffer
* clean up as per review and add a comment explaining why certain code exists
* fix flow of byte-address-buffer replacement
* toggle on option to translate byteAddrBuffer to StructuredBuffer
* cleanup unused buffers
* add `treatGetEquivalentStructuredBufferAsGetThis` flag so that `getEquivalentStructuredBuffer` returns the byte-address buffer itself instead of a structured buffer
* comment to explain `treatGetEquivalentStructuredBufferAsGetThis`
---------
Co-authored-by: Yong He <yonghe@outlook.com>
Diffstat (limited to 'source')
| -rw-r--r-- | source/slang/hlsl.meta.slang | 155 | ||||
| -rw-r--r-- | source/slang/slang-capabilities.capdef | 4 | ||||
| -rw-r--r-- | source/slang/slang-emit.cpp | 3 | ||||
| -rw-r--r-- | source/slang/slang-ir-byte-address-legalize.cpp | 7 | ||||
| -rw-r--r-- | source/slang/slang-ir-byte-address-legalize.h | 5 |
5 files changed, 161 insertions, 13 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 2639c1e88..9760f974a 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -4193,13 +4193,19 @@ ${{{{ __cuda_sm_version(2.0) [__requiresNVAPI] [ForceInline] - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda_float1)] + [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_nvapi_cuda_metal_float1)] void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue) { __target_switch { case hlsl: __intrinsic_asm "($3 = NvInterlockedAddFp32($0, $1, $2))"; case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))"; + case metal: + { + let buf = __getEquivalentStructuredBuffer<float>(this); + __metalInterlocked_add(__getMetalAtomicRef(buf[byteAddress / 4]), valueToAdd, originalValue); + return; + } case glsl: case spirv: { @@ -4264,13 +4270,19 @@ ${{{{ [__requiresNVAPI] [ForceInline] __cuda_sm_version(2.0) - [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda_float1)] + [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_nvapi_cuda_metal_float1)] void InterlockedAddF32(uint byteAddress, float valueToAdd) { __target_switch { case hlsl: __intrinsic_asm "(NvInterlockedAddFp32($0, $1, $2))"; case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<float>($1), $2)"; + case metal: + { + let buf = __getEquivalentStructuredBuffer<float>(this); + __metalInterlocked_add(__getMetalAtomicRef(buf[byteAddress / 4]), valueToAdd); + return; + } case glsl: case spirv: { @@ -4763,6 +4775,11 @@ ${{{{ case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedAdd"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_add(__getMetalAtomicRef(buf[dest / 4]), value, original_value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAdd(buf[dest / 4], value, original_value); @@ -4781,6 +4798,11 @@ 
${{{{ case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedAdd"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_add(__getMetalAtomicRef(buf[dest / 4]), value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAdd(buf[dest / 4], value); @@ -4800,6 +4822,11 @@ ${{{{ case cuda: __intrinsic_asm "(*$3 = atomicAnd($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedAnd"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_and(__getMetalAtomicRef(buf[dest / 4]), value, original_value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAnd(buf[dest / 4], value, original_value); @@ -4818,6 +4845,11 @@ ${{{{ case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedAnd"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_and(__getMetalAtomicRef(buf[dest / 4]), value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAnd(buf[dest / 4], value); @@ -4838,6 +4870,11 @@ ${{{{ case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3))"; case hlsl: __intrinsic_asm ".InterlockedCompareExchange"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_compare_exchange(__getMetalAtomicRef(buf[dest / 4]), compare_value, value, original_value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value); @@ -4845,7 +4882,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedCompareStore( UINT dest, UINT compare_value, @@ 
-4856,6 +4893,12 @@ ${{{{ case glsl: __intrinsic_asm "atomicCompSwap($0._data[$1/4], $2, $3)"; case cuda: __intrinsic_asm "atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3)"; case hlsl: __intrinsic_asm ".InterlockedCompareStore"; + case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_compare_exchange(__getMetalAtomicRef(buf[dest / 4]), compare_value, value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedCompareStore(buf[dest / 4], compare_value, value); @@ -4875,6 +4918,11 @@ ${{{{ case cuda: __intrinsic_asm "(*$3 = atomicExch($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedExchange"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_exchange(__getMetalAtomicRef(buf[dest / 4]), value, original_value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedExchange(buf[dest / 4], value, original_value); @@ -4894,6 +4942,11 @@ ${{{{ case cuda: __intrinsic_asm "(*$3 = atomicMax($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedMax"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_max(__getMetalAtomicRef(buf[dest / 4]), value, original_value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMax(buf[dest / 4], value, original_value); @@ -4912,6 +4965,11 @@ ${{{{ case cuda: __intrinsic_asm "atomicMax($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedMax"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_max(__getMetalAtomicRef(buf[dest / 4]), value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMax(buf[dest / 4], value); @@ -4931,6 +4989,11 @@ ${{{{ case cuda: __intrinsic_asm "(*$3 = atomicMin($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: 
__intrinsic_asm ".InterlockedMin"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_min(__getMetalAtomicRef(buf[dest / 4]), value, original_value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMin(buf[dest / 4], value, original_value); @@ -4949,6 +5012,11 @@ ${{{{ case cuda: __intrinsic_asm "atomicMin($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedMin"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_min(__getMetalAtomicRef(buf[dest / 4]), value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMin(buf[dest / 4], value); @@ -4968,6 +5036,11 @@ ${{{{ case cuda: __intrinsic_asm "(*$3 = atomicOr($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedOr"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_or(__getMetalAtomicRef(buf[dest / 4]), value, original_value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedOr(buf[dest / 4], value, original_value); @@ -4986,6 +5059,11 @@ ${{{{ case cuda: __intrinsic_asm "atomicOr($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedOr"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_or(__getMetalAtomicRef(buf[dest / 4]), value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedOr(buf[dest / 4], value); @@ -5005,6 +5083,11 @@ ${{{{ case cuda: __intrinsic_asm "(*$3 = atomicXor($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedXor"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_xor(__getMetalAtomicRef(buf[dest / 4]), value, original_value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); 
::InterlockedXor(buf[dest / 4], value, original_value); @@ -5023,6 +5106,11 @@ ${{{{ case cuda: __intrinsic_asm "atomicXor($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedXor"; case metal: + { + let buf = __getEquivalentStructuredBuffer<uint>(this); + __metalInterlocked_xor(__getMetalAtomicRef(buf[dest / 4]), value); + return; + } case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedXor(buf[dest / 4], value); @@ -9254,7 +9342,7 @@ for (SlangAtomicOperationInfo atomicOp : slangAtomicOperationInfo) [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] void Interlocked$(atomicOp.slangCallSuffix)(__ref $(T) dest, $(T) value) { static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); @@ -9288,7 +9376,7 @@ void Interlocked$(atomicOp.slangCallSuffix)(__ref $(T) dest, $(T) value) [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] void Interlocked$(atomicOp.slangCallSuffix)(__ref $(T) dest, $(T) value, out $(T) original_value) { static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to a scalar texture or non-texture"); @@ -9335,7 +9423,7 @@ for(const char* T : {"int64_t", "uint64_t"}) { }}}} [ForceInline] -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda_metal)] void InterlockedAdd(__ref $(T) dest, $(T) value) { __target_switch @@ -9515,7 +9603,7 @@ ${{{{ [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value) { 
static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); @@ -9550,7 +9638,7 @@ void InterlockedCompareExchange(__ref int dest, int compare_value, int value, ou [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value) { static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture"); @@ -9613,7 +9701,7 @@ void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_val [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] void InterlockedCompareStore(__ref int dest, int compare_value, int value) { __target_switch @@ -9622,16 +9710,41 @@ void InterlockedCompareStore(__ref int dest, int compare_value, int value) case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)"; case cuda: __intrinsic_asm "atomicCAS($0, $1, $2)"; case spirv: + { spirv_asm { result:$$int = OpAtomicCompareExchange &dest Device None None $value $compare_value; }; + return; + } + case metal: + { + if (__isTextureAccess(dest)) + { + vector<int, 4> vec_compare_value = vector<int, 4>(compare_value); + if(__isTextureArrayAccess(dest)) + { + __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), + __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector<int, 4>(value)); + } + else + { + __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), + __extractCoordFromTextureAccess(dest), vec_compare_value, vector<int, 4>(value)); + } + } + else + { + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value); + } 
+ return; + } } } [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)] void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value) { __target_switch @@ -9644,6 +9757,26 @@ void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value) { result:$$uint = OpAtomicCompareExchange &dest Device None None $value $compare_value; }; + case metal: + if (__isTextureAccess(dest)) + { + vector<uint, 4> vec_compare_value = vector<uint, 4>(compare_value); + if(__isTextureArrayAccess(dest)) + { + __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), + __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector<uint, 4>(value)); + } + else + { + __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), + __extractCoordFromTextureAccess(dest), vec_compare_value, vector<uint, 4>(value)); + } + } + else + { + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value); + } + return; } } @@ -20006,7 +20139,7 @@ extension __TextureImpl<float, Shape, 0, 0, 0, $(kStdlibResourceAccessReadWrite) [__requiresNVAPI] [ForceInline] __glsl_extension(GL_EXT_shader_atomic_float) - [require(glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda_float1)] + [require(glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda_metal_float1)] void InterlockedAddF32(vector<uint, Shape.dimensions> coord, float value, out float originalValue) { __target_switch diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef index a27146bf9..b13571d18 100644 --- a/source/slang/slang-capabilities.capdef +++ b/source/slang/slang-capabilities.capdef @@ -820,12 +820,12 @@ alias subgroup_clustered = GL_KHR_shader_subgroup_clustered | _sm_6_0 | _cuda_sm alias subgroup_quad = GL_KHR_shader_subgroup_quad | _sm_6_0 | _cuda_sm_7_0; alias 
subgroup_partitioned = GL_NV_shader_subgroup_partitioned + subgroup_ballot_activemask | _sm_6_5 | _cuda_sm_7_0; -alias atomic_glsl_hlsl_nvapi_cuda_float1 = atomic_glsl_float1 | hlsl_nvapi + _sm_4_0 | _cuda_sm_2_0; +alias atomic_glsl_hlsl_nvapi_cuda_metal_float1 = atomic_glsl_float1 | hlsl_nvapi + _sm_4_0 | _cuda_sm_2_0 | metal; alias atomic_glsl_hlsl_nvapi_cuda5_int64 = atomic_glsl_int64 | hlsl_nvapi + _sm_4_0 | _cuda_sm_6_0; alias atomic_glsl_hlsl_nvapi_cuda6_int64 = atomic_glsl_int64 | hlsl_nvapi + _sm_4_0 | _cuda_sm_6_0; alias atomic_glsl_hlsl_nvapi_cuda9_int64 = atomic_glsl_int64 | hlsl_nvapi + _sm_4_0 | _cuda_sm_9_0; -alias atomic_glsl_hlsl_cuda = atomic_glsl | _sm_5_0 | _cuda_sm_2_0 | metal; +alias atomic_glsl_hlsl_cuda_metal = atomic_glsl | _sm_5_0 | _cuda_sm_2_0 | metal; alias atomic_glsl_hlsl_cuda9_int64 = atomic_glsl_int64 | _sm_6_6 | _cuda_sm_9_0 | metal; alias helper_lane = _sm_6_0 + fragment diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp index 678b4137a..b690b7c38 100644 --- a/source/slang/slang-emit.cpp +++ b/source/slang/slang-emit.cpp @@ -1039,6 +1039,7 @@ Result linkAndOptimizeIR( case CodeGenTarget::MetalLib: case CodeGenTarget::MetalLibAssembly: byteAddressBufferOptions.scalarizeVectorLoadStore = true; + byteAddressBufferOptions.treatGetEquivalentStructuredBufferAsGetThis = true; byteAddressBufferOptions.translateToStructuredBufferOps = false; byteAddressBufferOptions.lowerBasicTypeOps = true; break; @@ -1135,6 +1136,8 @@ Result linkAndOptimizeIR( } break; case CodeGenTarget::Metal: + case CodeGenTarget::MetalLib: + case CodeGenTarget::MetalLibAssembly: { legalizeIRForMetal(irModule, sink); } diff --git a/source/slang/slang-ir-byte-address-legalize.cpp b/source/slang/slang-ir-byte-address-legalize.cpp index dba3ab5f5..d5685bad6 100644 --- a/source/slang/slang-ir-byte-address-legalize.cpp +++ b/source/slang/slang-ir-byte-address-legalize.cpp @@ -38,6 +38,8 @@ struct ByteAddressBufferLegalizationContext IRModule* m_module; 
IRBuilder m_builder; + Dictionary<IRInst*, IRType*> byteAddrBufferToReplace; + // Everything starts with a request to process a module, // which delegates to the central recrusive walk of the IR. // @@ -787,10 +789,15 @@ struct ByteAddressBufferLegalizationContext IRInst* getEquivalentStructuredBuffer(IRType* elementType, IRInst* byteAddressBuffer) { + if (this->m_options.treatGetEquivalentStructuredBufferAsGetThis) + return byteAddressBuffer; + if (!elementType) { return nullptr; } + if (as<IRHLSLStructuredBufferTypeBase>(byteAddressBuffer->getDataType())) + return byteAddressBuffer; // The simple case for replacement is when the byte-address buffer to // be replaced is a global shader parameter. That path will get its // own routine. diff --git a/source/slang/slang-ir-byte-address-legalize.h b/source/slang/slang-ir-byte-address-legalize.h index 1ae69070e..8a92bcf33 100644 --- a/source/slang/slang-ir-byte-address-legalize.h +++ b/source/slang/slang-ir-byte-address-legalize.h @@ -14,6 +14,11 @@ struct ByteAddressBufferLegalizationOptions bool useBitCastFromUInt = false; bool translateToStructuredBufferOps = false; bool lowerBasicTypeOps = false; + + /// Causes all calls to `getEquivlentStructuredBuffer` to return a `ByteAddressBuffer` (this) instead of a `StructuredBuffer`. + /// This option is used for targets that do not distinctly define `ByteAddressBuffer`/`StructuredBuffer` and introduce + /// operations which prevent DCE from destroying old definitions of `ByteAddressBuffer` after variable replacement. + bool treatGetEquivalentStructuredBufferAsGetThis = false; }; /// Legalize byte-address buffer `Load()` and `Store()` operations. |
