diff options
| author | Jay Kwak <82421531+jkwak-work@users.noreply.github.com> | 2024-06-25 22:07:41 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-06-25 22:07:41 -0700 |
| commit | 969dd4cc7246bfe89103efcb00f399606e804e98 (patch) | |
| tree | 6b44527d72a08f4b39848bf5cc2efe03ed5e6c90 | |
| parent | 63e0064bd3a2007adf17a35d3c58894d90ddc04a (diff) | |
Support atomic intrinsics for Metal (#4473)
* Support atomic intrinsics for Metal
This commit adds support for the atomic intrinsics in Metal.
The atomic member functions for buffers are not implemented yet.
Metal requires the first argument of the atomic functions to be an
atomic data type. This implementation relies on the fact that we can do a
C-style type cast from a regular data type to an atomic data type.
| -rw-r--r-- | source/slang/hlsl.meta.slang | 297 | ||||
| -rw-r--r-- | source/slang/slang-emit-c-like.cpp | 1 | ||||
| -rw-r--r-- | source/slang/slang-emit-metal.cpp | 18 | ||||
| -rw-r--r-- | source/slang/slang-ir-inst-defs.h | 1 | ||||
| -rw-r--r-- | tests/bugs/atomic-coerce.slang | 2 | ||||
| -rw-r--r-- | tests/compute/atomics-groupshared.slang | 2 | ||||
| -rw-r--r-- | tests/compute/atomics.slang | 2 | ||||
| -rw-r--r-- | tests/metal/atomic-intrinsics.slang | 352 |
8 files changed, 598 insertions, 77 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 82ef5837e..597e4dc06 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -4633,7 +4633,7 @@ ${{{{ // Added operations: [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedAdd( UINT dest, UINT value, @@ -4644,6 +4644,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicAdd($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedAdd"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAdd(buf[dest / 4], value, original_value); @@ -4651,7 +4652,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedAdd( UINT dest, UINT value) @@ -4661,6 +4662,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicAdd($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedAdd"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAdd(buf[dest / 4], value); @@ -4668,7 +4670,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedAnd( UINT dest, UINT value, @@ -4679,6 +4681,7 @@ ${{{{ case glsl: __intrinsic_asm "$3 = atomicAnd($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "(*$3 = atomicAnd($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedAnd"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAnd(buf[dest / 4], value, original_value); @@ -4686,7 +4689,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, 
byteaddressbuffer_rw)] void InterlockedAnd( UINT dest, UINT value) @@ -4696,6 +4699,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicAnd($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedAnd"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAnd(buf[dest / 4], value); @@ -4703,7 +4707,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedCompareExchange( UINT dest, UINT compare_value, @@ -4715,6 +4719,7 @@ ${{{{ case glsl: __intrinsic_asm "($4 = atomicCompSwap($0._data[$1/4], $2, $3))"; case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3))"; case hlsl: __intrinsic_asm ".InterlockedCompareExchange"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value); @@ -4740,7 +4745,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedExchange( UINT dest, UINT value, @@ -4751,6 +4756,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicExchange($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicExch($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedExchange"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedExchange(buf[dest / 4], value, original_value); @@ -4758,7 +4764,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedMax( UINT dest, UINT value, @@ -4769,6 +4775,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicMax($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicMax($0._getPtrAt<uint32_t>($1), $2))"; 
case hlsl: __intrinsic_asm ".InterlockedMax"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMax(buf[dest / 4], value, original_value); @@ -4776,7 +4783,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedMax( UINT dest, UINT value) @@ -4786,6 +4793,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicMax($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicMax($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedMax"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMax(buf[dest / 4], value); @@ -4793,7 +4801,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedMin( UINT dest, UINT value, @@ -4804,6 +4812,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicMin($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicMin($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedMin"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMin(buf[dest / 4], value, original_value); @@ -4811,7 +4820,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedMin( UINT dest, UINT value) @@ -4821,6 +4830,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicMin($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicMin($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedMin"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMin(buf[dest / 4], value); @@ -4828,7 +4838,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedOr( UINT 
dest, UINT value, @@ -4839,6 +4849,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicOr($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicOr($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedOr"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedOr(buf[dest / 4], value, original_value); @@ -4846,7 +4857,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedOr( UINT dest, UINT value) @@ -4856,6 +4867,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicOr($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicOr($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedOr"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedOr(buf[dest / 4], value); @@ -4863,7 +4875,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedXor( UINT dest, UINT value, @@ -4874,6 +4886,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicXor($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicXor($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedXor"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedXor(buf[dest / 4], value, original_value); @@ -4881,7 +4894,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedXor( UINT dest, UINT value) @@ -4891,6 +4904,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicXor($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicXor($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedXor"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedXor(buf[dest / 4], value); @@ 
-8596,9 +8610,62 @@ void GroupMemoryBarrierWithGroupSync() // Atomics +__generic<T> +__intrinsic_op($(kIROp_MetalAtomicCast)) +[require(metal)] +T* __getMetalAtomicRef(__ref T x); + +${{{{ +for (const char* fetchAndModify : {"add", "and", "max", "min", "or", "sub", "xor"}) +{ +}}}} + __generic<AtomicType, T> + [ForceInline] + [require(metal)] + void __metalInterlocked_$(fetchAndModify)(AtomicType dest, T value) + { + __intrinsic_asm "atomic_fetch_$(fetchAndModify)_explicit($0, $1, memory_order_relaxed)"; + } + + __generic<AtomicType, T> + [ForceInline] + [require(metal)] + void __metalInterlocked_$(fetchAndModify)(AtomicType dest, T value, out T original_value) + { + __intrinsic_asm "((*($2)) = (($[0])(atomic_fetch_$(fetchAndModify)_explicit($0, $1, memory_order_relaxed))))", T; + } +${{{{ +} // fetchAndModify +}}}} + +__generic<AtomicType, T> [ForceInline] +[require(metal)] +void __metalInterlocked_exchange(AtomicType dest, T value, out T original_value) +{ + __intrinsic_asm "((*($2)) = (($[0])(atomic_exchange_explicit($0, $1, memory_order_relaxed))))", T; +} + +__generic<AtomicType, T> +[ForceInline] +[require(metal)] +void __metalInterlocked_compare_exchange(AtomicType dest, __ref T compare_value, T value) +{ + __intrinsic_asm "atomic_compare_exchange_weak_explicit($0, $1, $2, memory_order_relaxed, memory_order_relaxed)"; +} + +__generic<AtomicType, T> +[ForceInline] +[require(metal)] +void __metalInterlocked_compare_exchange(AtomicType dest, T compare_value, T value, out T original_value) +{ + __metalInterlocked_compare_exchange(dest, compare_value, value); + original_value = compare_value; +} + __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[ForceInline] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAdd(__ref int dest, int value) { __target_switch @@ -8606,6 +8673,9 @@ void InterlockedAdd(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedAdd"; case cuda: __intrinsic_asm 
"atomicAdd($0, $1)"; case glsl: __intrinsic_asm "$atomicAdd($A, $1)"; + case metal: + __metalInterlocked_add(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -8616,7 +8686,7 @@ void InterlockedAdd(__ref int dest, int value) [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAdd(__ref uint dest, uint value) { __target_switch @@ -8624,6 +8694,9 @@ void InterlockedAdd(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedAdd"; case cuda: __intrinsic_asm "atomicAdd((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicAdd($A, $1)"; + case metal: + __metalInterlocked_add(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -8640,14 +8713,17 @@ void InterlockedAdd(__ref uint dest, int value) [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] -void InterlockedAdd(__ref int dest, int value, out int original_value) +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +void InterlockedAdd(__ref int dest, int value, out int original_value) { __target_switch { case hlsl: __intrinsic_asm "InterlockedAdd"; case cuda: __intrinsic_asm "(*$2 = atomicAdd($0, $1))"; case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))"; + case metal: + __metalInterlocked_add(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -8659,7 +8735,7 @@ void InterlockedAdd(__ref int dest, int value, out int original_value) [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAdd(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -8667,6 +8743,9 @@ void InterlockedAdd(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedAdd"; case cuda: __intrinsic_asm "(*$2 = 
(uint)atomicAdd((int*)$0, $1))"; case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))"; + case metal: + __metalInterlocked_add(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -8758,14 +8837,17 @@ void InterlockedAdd(__ref uint64_t dest, uint64_t value, out uint64_t original_v __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] -void InterlockedAnd(__ref int dest, int value) +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +void InterlockedAnd(__ref int dest, int value) { __target_switch { case hlsl: __intrinsic_asm "InterlockedAnd"; case cuda: __intrinsic_asm "atomicAnd($0, $1)"; case glsl: __intrinsic_asm "$atomicAnd($A, $1)"; + case metal: + __metalInterlocked_and(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -8775,7 +8857,7 @@ void InterlockedAnd(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAnd(__ref uint dest, uint value) { __target_switch @@ -8783,6 +8865,9 @@ void InterlockedAnd(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedAnd"; case cuda: __intrinsic_asm "atomicAnd((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicAnd($A, $1)"; + case metal: + __metalInterlocked_and(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -8792,14 +8877,17 @@ void InterlockedAnd(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] -void InterlockedAnd(__ref int dest, int value, out int original_value) +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +void InterlockedAnd(__ref int dest, int value, out int original_value) { __target_switch { case hlsl: __intrinsic_asm "InterlockedAnd"; case cuda: __intrinsic_asm "(*$2 = atomicAnd($0, $1))"; case glsl: __intrinsic_asm "($2 = $atomicAnd($A, $1))"; + case metal: + 
__metalInterlocked_and(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -8810,7 +8898,7 @@ void InterlockedAnd(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAnd(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -8818,6 +8906,9 @@ void InterlockedAnd(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedAnd"; case glsl: __intrinsic_asm "($2 = atomicAnd($0, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicAnd((int*)$0, $1))"; + case metal: + __metalInterlocked_and(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -8846,7 +8937,7 @@ void InterlockedAnd(__ref uint64_t dest, uint64_t value, out uint64_t origina } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value) { __target_switch @@ -8854,6 +8945,9 @@ void InterlockedCompareExchange(__ref int dest, int compare_value, int value, case hlsl: __intrinsic_asm "InterlockedCompareExchange"; case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))"; case cuda: __intrinsic_asm "(*$3 = atomicCAS($0, $1, $2))"; + case metal: + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value); + return; case spirv: spirv_asm { @@ -8864,7 +8958,7 @@ void InterlockedCompareExchange(__ref int dest, int compare_value, int value, } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value) { __target_switch @@ -8872,6 +8966,9 @@ void 
InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, case hlsl: __intrinsic_asm "InterlockedCompareExchange"; case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))"; case cuda: __intrinsic_asm "(*$3 = (uint)atomicCAS((int*)$0, $1, $2))"; + case metal: + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value); + return; case spirv: spirv_asm { @@ -8887,6 +8984,9 @@ void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_val __target_switch { case hlsl: __intrinsic_asm "InterlockedCompareExchangeFloatBitwise"; + case metal: + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value); + return; } } @@ -8896,44 +8996,36 @@ void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_val __target_switch { case hlsl: __intrinsic_asm "InterlockedCompareExchangeFloatBitwise"; + case metal: + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value); + return; } } -[ForceInline] -void InterlockedCompareExchange(__ref int64_t dest, int64_t compare_value, int64_t value) -{ - __target_switch - { - case hlsl: __intrinsic_asm "InterlockedCompareExchange"; - } -} - -[ForceInline] -void InterlockedCompareExchange(__ref int64_t dest, int64_t compare_value, int64_t value, out int64_t original_value) -{ - __target_switch - { - case hlsl: __intrinsic_asm "InterlockedCompareExchange"; - } -} - -[ForceInline] -void InterlockedCompareExchange(__ref uint64_t dest, uint64_t compare_value, uint64_t value) +${{{{ +for (const char* T : {"int64_t", "uint64_t"}) { - __target_switch +}}}} + [ForceInline] + void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value) { - case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + } } -} -[ForceInline] -void InterlockedCompareExchange(__ref uint64_t dest, uint64_t 
compare_value, uint64_t value, out uint64_t original_value) -{ - __target_switch + [ForceInline] + void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value, out $(T) original_value) { - case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + } } -} +${{{{ +} // T +}}}} __glsl_version(430) [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] @@ -8997,7 +9089,7 @@ void InterlockedCompareStore(__ref uint64_t dest, uint64_t compare_value, uint64 } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedExchange(__ref int dest, int value, out int original_value) { __target_switch @@ -9005,6 +9097,9 @@ void InterlockedExchange(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedExchange"; case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicExch($0, $1))"; + case metal: + __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9015,7 +9110,7 @@ void InterlockedExchange(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedExchange(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9023,6 +9118,9 @@ void InterlockedExchange(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedExchange"; case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))"; case cuda: __intrinsic_asm "(*$2 = (uint)atomicExch((int*)$0, $1))"; + case metal: + __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9047,6 +9145,9 @@ void InterlockedExchange(__ref float dest, float 
value, out float original_va __target_switch { case hlsl: __intrinsic_asm "InterlockedExchange"; + case metal: + __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value); + return; } } @@ -9087,7 +9188,7 @@ void InterlockedExchange(__ref uint64_t dest, uint64_t value, out uint64_t or } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMax(__ref int dest, int value) { __target_switch @@ -9095,6 +9196,9 @@ void InterlockedMax(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedMax"; case glsl: __intrinsic_asm "$atomicMax($A, $1)"; case cuda: __intrinsic_asm "atomicMax($0, $1)"; + case metal: + __metalInterlocked_max(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9104,7 +9208,7 @@ void InterlockedMax(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMax(__ref uint dest, uint value) { __target_switch @@ -9112,6 +9216,9 @@ void InterlockedMax(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedMax"; case glsl: __intrinsic_asm "$atomicMax($A, $1)"; case cuda: __intrinsic_asm "atomicMax((int*)$0, $1)"; + case metal: + __metalInterlocked_max(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9121,7 +9228,7 @@ void InterlockedMax(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMax(__ref int dest, int value, out int original_value) { __target_switch @@ -9129,6 +9236,9 @@ void InterlockedMax(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedMax"; case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicMax($0, $1))"; + case metal: + 
__metalInterlocked_max(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9139,7 +9249,7 @@ void InterlockedMax(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMax(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9147,6 +9257,9 @@ void InterlockedMax(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedMax"; case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))"; case cuda: __intrinsic_asm "(*$2 = (uint)atomicMax((int*)$0, $1))"; + case metal: + __metalInterlocked_max(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9193,7 +9306,7 @@ void InterlockedMax(__ref uint64_t dest, uint64_t value, out uint64_t origina } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMin(__ref int dest, int value) { __target_switch @@ -9201,6 +9314,9 @@ void InterlockedMin(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedMin"; case glsl: __intrinsic_asm "$atomicMin($A, $1)"; case cuda: __intrinsic_asm "atomicMin($0, $1)"; + case metal: + __metalInterlocked_min(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9210,7 +9326,7 @@ void InterlockedMin(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMin(__ref uint dest, uint value) { __target_switch @@ -9218,6 +9334,9 @@ void InterlockedMin(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedMin"; case glsl: __intrinsic_asm "$atomicMin($A, $1)"; case cuda: __intrinsic_asm "atomicMin((int*)$0, $1)"; + case metal: + 
__metalInterlocked_min(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9227,7 +9346,7 @@ void InterlockedMin(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMin(__ref int dest, int value, out int original_value) { __target_switch @@ -9235,6 +9354,9 @@ void InterlockedMin(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedMin"; case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicMin($0, $1))"; + case metal: + __metalInterlocked_min(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9245,7 +9367,7 @@ void InterlockedMin(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMin(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9253,6 +9375,9 @@ void InterlockedMin(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedMin"; case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))"; case cuda: __intrinsic_asm "(*$2 = (uint)atomicMin((int*)$0, $1))"; + case metal: + __metalInterlocked_min(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9299,7 +9424,7 @@ void InterlockedMin(__ref uint64_t dest, uint64_t value, out uint64_t origina } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedOr(__ref int dest, int value) { __target_switch @@ -9307,6 +9432,9 @@ void InterlockedOr(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedOr"; case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicOr($A, $1)"; + 
case metal: + __metalInterlocked_or(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9316,7 +9444,7 @@ void InterlockedOr(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedOr(__ref uint dest, uint value) { __target_switch @@ -9324,6 +9452,9 @@ void InterlockedOr(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedOr"; case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicOr($A, $1)"; + case metal: + __metalInterlocked_or(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9333,7 +9464,7 @@ void InterlockedOr(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedOr(__ref int dest, int value, out int original_value) { __target_switch @@ -9341,6 +9472,9 @@ void InterlockedOr(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedOr"; case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicOr($0, $1))"; + case metal: + __metalInterlocked_or(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9351,7 +9485,7 @@ void InterlockedOr(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedOr(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9359,6 +9493,9 @@ void InterlockedOr(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedOr"; case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicOr((int*)$0, $1))"; + case metal: + 
__metalInterlocked_or(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9387,7 +9524,7 @@ void InterlockedOr(__ref uint64_t dest, uint64_t value, out uint64_t original } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedXor(__ref int dest, int value) { __target_switch @@ -9395,6 +9532,9 @@ void InterlockedXor(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedXor"; case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicXor($A, $1)"; + case metal: + __metalInterlocked_xor(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9404,7 +9544,7 @@ void InterlockedXor(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedXor(__ref uint dest, uint value) { __target_switch @@ -9412,6 +9552,9 @@ void InterlockedXor(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedXor"; case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicXor($A, $1)"; + case metal: + __metalInterlocked_xor(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9421,7 +9564,7 @@ void InterlockedXor(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedXor(__ref int dest, int value, out int original_value) { __target_switch @@ -9429,6 +9572,9 @@ void InterlockedXor(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedXor"; case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicXor($0, $1))"; + case metal: + __metalInterlocked_xor(__getMetalAtomicRef(dest), value, original_value); + return; case 
spirv: spirv_asm { @@ -9439,7 +9585,7 @@ void InterlockedXor(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedXor(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9447,6 +9593,9 @@ void InterlockedXor(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedXor"; case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))"; case cuda: __intrinsic_asm "(*$2 = (uint)atomicXor((int*)$0, $1))"; + case metal: + __metalInterlocked_xor(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp index e9ab58bca..6062875b3 100644 --- a/source/slang/slang-emit-c-like.cpp +++ b/source/slang/slang-emit-c-like.cpp @@ -2889,6 +2889,7 @@ void CLikeSourceEmitter::_emitInst(IRInst* inst) case kIROp_AtomicCounterIncrement: case kIROp_AtomicCounterDecrement: case kIROp_StructuredBufferGetDimensions: + case kIROp_MetalAtomicCast: emitInstStmt(inst); break; diff --git a/source/slang/slang-emit-metal.cpp b/source/slang/slang-emit-metal.cpp index 0a5506776..d38c3de9b 100644 --- a/source/slang/slang-emit-metal.cpp +++ b/source/slang/slang-emit-metal.cpp @@ -253,6 +253,24 @@ bool MetalSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) case kIROp_discard: m_writer->emit("discard_fragment();\n"); return true; + case kIROp_MetalAtomicCast: + { + auto oldValName = getName(inst); + auto op0 = inst->getOperand(0); + + m_writer->emit("atomic_"); + emitType(op0->getDataType()); + m_writer->emit(" "); + m_writer->emit(oldValName); + m_writer->emit(" = "); + + m_writer->emit("((atomic_"); + emitType(op0->getDataType()); + m_writer->emit(")("); + emitOperand(op0, getInfo(EmitOp::General)); + m_writer->emit("));\n"); + return true; + } } return false; } diff --git 
a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h index f639d3343..19117c00e 100644 --- a/source/slang/slang-ir-inst-defs.h +++ b/source/slang/slang-ir-inst-defs.h @@ -685,6 +685,7 @@ INST(GetLegalizedSPIRVGlobalParamAddr, GetLegalizedSPIRVGlobalParamAddr, 1, 0) INST(GetPerVertexInputArray, GetPerVertexInputArray, 1, 0) INST(ForceVarIntoStructTemporarily, ForceVarIntoStructTemporarily, 1, 0) +INST(MetalAtomicCast, MetalAtomicCast, 1, 0) INST(MakeArrayList, makeArrayList, 0, 0) INST(MakeTensorView, makeTensorView, 0, 0) diff --git a/tests/bugs/atomic-coerce.slang b/tests/bugs/atomic-coerce.slang index 2fe927355..bfb0eeb63 100644 --- a/tests/bugs/atomic-coerce.slang +++ b/tests/bugs/atomic-coerce.slang @@ -1,6 +1,6 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj //TEST(compute,vulkan):COMPARE_COMPUTE_EX:-vk -slang -compute -shaderobj -//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl +//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl //TEST_INPUT:ubuffer(data=[0 0 0 0 ], stride=4):out,name outputBuffer RWStructuredBuffer<int> outputBuffer; diff --git a/tests/compute/atomics-groupshared.slang b/tests/compute/atomics-groupshared.slang index fcfc9c8d7..a01f7bf6a 100644 --- a/tests/compute/atomics-groupshared.slang +++ b/tests/compute/atomics-groupshared.slang @@ -4,7 +4,7 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -vk -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -shaderobj -//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl +//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl //TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer diff --git a/tests/compute/atomics.slang b/tests/compute/atomics.slang index b00f437f5..ee02c623f 100644 --- a/tests/compute/atomics.slang +++ b/tests/compute/atomics.slang @@ -4,7 +4,7 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -shaderobj 
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -vk -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -shaderobj -//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl +//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl //TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out, name outputBuffer diff --git a/tests/metal/atomic-intrinsics.slang b/tests/metal/atomic-intrinsics.slang new file mode 100644 index 000000000..3533ea2aa --- /dev/null +++ b/tests/metal/atomic-intrinsics.slang @@ -0,0 +1,352 @@ +//TEST:SIMPLE(filecheck=MTL):-target metal -entry computeMain -stage compute -DMETAL +//TEST:SIMPLE(filecheck=LIB):-target metallib -entry computeMain -stage compute -DMETAL +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type +//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -emit-spirv-directly -compute -shaderobj -output-using-type + +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj -output-using-type +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj -output-using-type + + +//TEST_INPUT:ubuffer(data=[0 1 2 3], stride=4):name=uintBuffer +RWStructuredBuffer<uint> uintBuffer; +//TEST_INPUT:ubuffer(data=[0 1 2 3], stride=4):name=intBuffer +RWStructuredBuffer<int> intBuffer; + +groupshared uint shareMemUI[4]; +groupshared int shareMemI[4]; + +//TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<float> outputBuffer; + +[numthreads(4, 1, 1)] +void computeMain(uint groupIndex : SV_GroupIndex) +{ + if (groupIndex == 0) + { + for (int i = 0; i < 4; ++i) + { + shareMemUI[i] = 0U; + shareMemI[i] = 0; + } + } + AllMemoryBarrierWithGroupSync(); + + int idx = groupIndex; + float val = 0.0f; + + // InterlockedAdd + //MTL: atomic_uint threadgroup* {{.*}}shareMemUI + //LIB: call {{.*}}.atomic.local.add.u.i32 + InterlockedAdd(shareMemUI[idx], uint(1)); + val += shareMemUI[idx]; + + 
//MTL: atomic_int threadgroup* {{.*}}shareMemI + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], 2); + val += shareMemI[idx]; + + //MTL: atomic_uint device* {{.*}}uintBuffer + //LIB: call {{.*}}.atomic.global.add.u.i32 + InterlockedAdd(uintBuffer[idx], 1); + val += uintBuffer[idx]; + + //MTL: atomic_int device* {{.*}}intBuffer + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], 2); + val += intBuffer[idx]; + + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], -1); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], -1); + val += intBuffer[idx]; + + // InterlockedAdd - original_value + uint origui = 0; + //LIB: call {{.*}}.atomic.local.add.u.i32 + InterlockedAdd(shareMemUI[idx], 1, origui); + val += shareMemUI[idx]; + val += origui; + + int origi = 0; + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], 2, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.add.u.i32 + InterlockedAdd(uintBuffer[idx], 1, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], 2, origi); + val += intBuffer[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], -1, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], -1, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedAnd + //LIB: call {{.*}}.atomic.local.and.u.i32 + InterlockedAnd(shareMemUI[idx], 255); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.and.s.i32 + InterlockedAnd(shareMemI[idx], 255); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.and.u.i32 + InterlockedAnd(uintBuffer[idx], 255); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.and.s.i32 + InterlockedAnd(intBuffer[idx], 255); + 
val += intBuffer[idx]; + + // InterlockedAnd - original_value + //LIB: call {{.*}}.atomic.local.and.u.i32 + InterlockedAnd(shareMemUI[idx], 255, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.and.s.i32 + InterlockedAnd(shareMemI[idx], 255, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.and.u.i32 + InterlockedAnd(uintBuffer[idx], 255, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.and.s.i32 + InterlockedAnd(intBuffer[idx], 255, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedCompareExchange + //LIB: call {{.*}}.atomic.local.cmpxchg.weak.i32 + InterlockedCompareExchange(shareMemUI[idx], 1, 0, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.cmpxchg.weak.i32 + InterlockedCompareExchange(shareMemI[idx], 1, 0, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.cmpxchg.weak.i32 + InterlockedCompareExchange(uintBuffer[idx], 1, 0, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.cmpxchg.weak.i32 + InterlockedCompareExchange(intBuffer[idx], 1, 0, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedCompareStore is not supported by Metal +#if !defined(METAL) + InterlockedCompareStore(shareMemUI[idx], 255, 0); + val += shareMemUI[idx]; + + InterlockedCompareStore(shareMemI[idx], 255, 0); + val += shareMemI[idx]; + + InterlockedCompareStore(uintBuffer[idx], 255, 0); + val += uintBuffer[idx]; + + InterlockedCompareStore(intBuffer[idx], 255, 0); + val += intBuffer[idx]; +#endif + + // InterlockedExchange + //LIB: call {{.*}}.atomic.local.xchg.i32 + InterlockedExchange(shareMemUI[idx], 1, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.xchg.i32 + InterlockedExchange(shareMemI[idx], 1, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call 
{{.*}}.atomic.global.xchg.i32 + InterlockedExchange(uintBuffer[idx], 1, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.xchg.i32 + InterlockedExchange(intBuffer[idx], 1, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedMax + //LIB: call {{.*}}.atomic.local.max.u.i32 + InterlockedMax(shareMemUI[idx], 0); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.max.s.i32 + InterlockedMax(shareMemI[idx], 0); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.max.u.i32 + InterlockedMax(uintBuffer[idx], 0); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.max.s.i32 + InterlockedMax(intBuffer[idx], 0); + val += intBuffer[idx]; + + // InterlockedMax - original_value + //LIB: call {{.*}}.atomic.local.max.u.i32 + InterlockedMax(shareMemUI[idx], 0, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.max.s.i32 + InterlockedMax(shareMemI[idx], 0, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.max.u.i32 + InterlockedMax(uintBuffer[idx], 0, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.max.s.i32 + InterlockedMax(intBuffer[idx], 0, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedMin + //LIB: call {{.*}}.atomic.local.min.u.i32 + InterlockedMin(shareMemUI[idx], 0); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.min.s.i32 + InterlockedMin(shareMemI[idx], 0); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.min.u.i32 + InterlockedMin(uintBuffer[idx], 0); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.min.s.i32 + InterlockedMin(intBuffer[idx], 0); + val += intBuffer[idx]; + + // InterlockedMin - original_value + //LIB: call {{.*}}.atomic.local.min.u.i32 + InterlockedMin(shareMemUI[idx], 0, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.min.s.i32 + InterlockedMin(shareMemI[idx], 
0, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.min.u.i32 + InterlockedMin(uintBuffer[idx], 0, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.min.s.i32 + InterlockedMin(intBuffer[idx], 0, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedOr + //LIB: call {{.*}}.atomic.local.or.u.i32 + InterlockedOr(shareMemUI[idx], 2); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.or.s.i32 + InterlockedOr(shareMemI[idx], 4); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.or.u.i32 + InterlockedOr(uintBuffer[idx], 6); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.or.s.i32 + InterlockedOr(intBuffer[idx], 8); + val += intBuffer[idx]; + + // InterlockedOr - original_value + //LIB: call {{.*}}.atomic.local.or.u.i32 + InterlockedOr(shareMemUI[idx], 2, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.or.s.i32 + InterlockedOr(shareMemI[idx], 4, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.or.u.i32 + InterlockedOr(uintBuffer[idx], 6, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.or.s.i32 + InterlockedOr(intBuffer[idx], 8, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedXor + //LIB: call {{.*}}.atomic.local.xor.u.i32 + InterlockedXor(shareMemUI[idx], 2); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.xor.s.i32 + InterlockedXor(shareMemI[idx], 4); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.xor.u.i32 + InterlockedXor(uintBuffer[idx], 6); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.xor.s.i32 + InterlockedXor(intBuffer[idx], 8); + val += intBuffer[idx]; + + // InterlockedXor - original_value + //LIB: call {{.*}}.atomic.local.xor.u.i32 + InterlockedXor(shareMemUI[idx], 2, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call 
{{.*}}.atomic.local.xor.s.i32 + InterlockedXor(shareMemI[idx], 4, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.xor.u.i32 + InterlockedXor(uintBuffer[idx], 6, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.xor.s.i32 + InterlockedXor(intBuffer[idx], 8, origi); + val += intBuffer[idx]; + val += origi; + + outputBuffer[idx] = val; +} + +// CHK: 184 +// CHK: 207 +// CHK: 230 +// CHK: 253 |
