diff options
| -rw-r--r-- | source/slang/hlsl.meta.slang | 297 | ||||
| -rw-r--r-- | source/slang/slang-emit-c-like.cpp | 1 | ||||
| -rw-r--r-- | source/slang/slang-emit-metal.cpp | 18 | ||||
| -rw-r--r-- | source/slang/slang-ir-inst-defs.h | 1 | ||||
| -rw-r--r-- | tests/bugs/atomic-coerce.slang | 2 | ||||
| -rw-r--r-- | tests/compute/atomics-groupshared.slang | 2 | ||||
| -rw-r--r-- | tests/compute/atomics.slang | 2 | ||||
| -rw-r--r-- | tests/metal/atomic-intrinsics.slang | 352 |
8 files changed, 598 insertions, 77 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 82ef5837e..597e4dc06 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -4633,7 +4633,7 @@ ${{{{ // Added operations: [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedAdd( UINT dest, UINT value, @@ -4644,6 +4644,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicAdd($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedAdd"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAdd(buf[dest / 4], value, original_value); @@ -4651,7 +4652,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedAdd( UINT dest, UINT value) @@ -4661,6 +4662,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicAdd($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedAdd"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAdd(buf[dest / 4], value); @@ -4668,7 +4670,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedAnd( UINT dest, UINT value, @@ -4679,6 +4681,7 @@ ${{{{ case glsl: __intrinsic_asm "$3 = atomicAnd($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "(*$3 = atomicAnd($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedAnd"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAnd(buf[dest / 4], value, original_value); @@ -4686,7 +4689,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedAnd( UINT dest, UINT value) @@ -4696,6 +4699,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicAnd($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedAnd"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedAnd(buf[dest / 4], value); @@ -4703,7 +4707,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedCompareExchange( UINT dest, UINT compare_value, @@ -4715,6 +4719,7 @@ ${{{{ case glsl: __intrinsic_asm "($4 = atomicCompSwap($0._data[$1/4], $2, $3))"; case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3))"; case hlsl: __intrinsic_asm ".InterlockedCompareExchange"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value); @@ -4740,7 +4745,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedExchange( UINT dest, UINT value, @@ -4751,6 +4756,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicExchange($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicExch($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedExchange"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedExchange(buf[dest / 4], value, original_value); @@ -4758,7 +4764,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedMax( UINT dest, UINT value, @@ -4769,6 +4775,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicMax($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicMax($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedMax"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMax(buf[dest / 4], value, original_value); @@ -4776,7 +4783,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedMax( UINT dest, UINT value) @@ -4786,6 +4793,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicMax($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicMax($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedMax"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMax(buf[dest / 4], value); @@ -4793,7 +4801,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedMin( UINT dest, UINT value, @@ -4804,6 +4812,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicMin($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicMin($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedMin"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMin(buf[dest / 4], value, original_value); @@ -4811,7 +4820,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedMin( UINT dest, UINT value) @@ -4821,6 +4830,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicMin($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicMin($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedMin"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedMin(buf[dest / 4], value); @@ -4828,7 +4838,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedOr( UINT dest, UINT value, @@ -4839,6 +4849,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicOr($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicOr($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedOr"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedOr(buf[dest / 4], value, original_value); @@ -4846,7 +4857,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedOr( UINT dest, UINT value) @@ -4856,6 +4867,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicOr($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicOr($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedOr"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedOr(buf[dest / 4], value); @@ -4863,7 +4875,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedXor( UINT dest, UINT value, @@ -4874,6 +4886,7 @@ ${{{{ case glsl: __intrinsic_asm "($3 = atomicXor($0._data[$1/4], $2))"; case cuda: __intrinsic_asm "(*$3 = atomicXor($0._getPtrAt<uint32_t>($1), $2))"; case hlsl: __intrinsic_asm ".InterlockedXor"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedXor(buf[dest / 4], value, original_value); @@ -4881,7 +4894,7 @@ ${{{{ } [ForceInline] - [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)] + [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)] void InterlockedXor( UINT dest, UINT value) @@ -4891,6 +4904,7 @@ ${{{{ case glsl: __intrinsic_asm "atomicXor($0._data[$1/4], $2)"; case cuda: __intrinsic_asm "atomicXor($0._getPtrAt<uint32_t>($1), $2)"; case hlsl: __intrinsic_asm ".InterlockedXor"; + case metal: case spirv: let buf = __getEquivalentStructuredBuffer<uint>(this); ::InterlockedXor(buf[dest / 4], value); @@ -8596,9 +8610,62 @@ void GroupMemoryBarrierWithGroupSync() // Atomics +__generic<T> +__intrinsic_op($(kIROp_MetalAtomicCast)) +[require(metal)] +T* __getMetalAtomicRef(__ref T x); + +${{{{ +for (const char* fetchAndModify : {"add", "and", "max", "min", "or", "sub", "xor"}) +{ +}}}} + __generic<AtomicType, T> + [ForceInline] + [require(metal)] + void __metalInterlocked_$(fetchAndModify)(AtomicType dest, T value) + { + __intrinsic_asm "atomic_fetch_$(fetchAndModify)_explicit($0, $1, memory_order_relaxed)"; + } + + __generic<AtomicType, T> + [ForceInline] + [require(metal)] + void __metalInterlocked_$(fetchAndModify)(AtomicType dest, T value, out T original_value) + { + __intrinsic_asm "((*($2)) = (($[0])(atomic_fetch_$(fetchAndModify)_explicit($0, $1, memory_order_relaxed))))", T; + } +${{{{ +} // fetchAndModify +}}}} + +__generic<AtomicType, T> [ForceInline] +[require(metal)] +void __metalInterlocked_exchange(AtomicType dest, T value, out T original_value) +{ + __intrinsic_asm "((*($2)) = (($[0])(atomic_exchange_explicit($0, $1, memory_order_relaxed))))", T; +} + +__generic<AtomicType, T> +[ForceInline] +[require(metal)] +void __metalInterlocked_compare_exchange(AtomicType dest, __ref T compare_value, T value) +{ + __intrinsic_asm "atomic_compare_exchange_weak_explicit($0, $1, $2, memory_order_relaxed, memory_order_relaxed)"; +} + +__generic<AtomicType, T> +[ForceInline] +[require(metal)] +void __metalInterlocked_compare_exchange(AtomicType dest, T compare_value, T value, out T original_value) +{ + __metalInterlocked_compare_exchange(dest, compare_value, value); + original_value = compare_value; +} + __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[ForceInline] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAdd(__ref int dest, int value) { __target_switch @@ -8606,6 +8673,9 @@ void InterlockedAdd(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedAdd"; case cuda: __intrinsic_asm "atomicAdd($0, $1)"; case glsl: __intrinsic_asm "$atomicAdd($A, $1)"; + case metal: + __metalInterlocked_add(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -8616,7 +8686,7 @@ void InterlockedAdd(__ref int dest, int value) [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAdd(__ref uint dest, uint value) { __target_switch @@ -8624,6 +8694,9 @@ void InterlockedAdd(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedAdd"; case cuda: __intrinsic_asm "atomicAdd((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicAdd($A, $1)"; + case metal: + __metalInterlocked_add(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -8640,14 +8713,17 @@ void InterlockedAdd(__ref uint dest, int value) [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] -void InterlockedAdd(__ref int dest, int value, out int original_value) +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +void InterlockedAdd(__ref int dest, int value, out int original_value) { __target_switch { case hlsl: __intrinsic_asm "InterlockedAdd"; case cuda: __intrinsic_asm "(*$2 = atomicAdd($0, $1))"; case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))"; + case metal: + __metalInterlocked_add(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -8659,7 +8735,7 @@ void InterlockedAdd(__ref int dest, int value, out int original_value) [ForceInline] __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAdd(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -8667,6 +8743,9 @@ void InterlockedAdd(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedAdd"; case cuda: __intrinsic_asm "(*$2 = (uint)atomicAdd((int*)$0, $1))"; case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))"; + case metal: + __metalInterlocked_add(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -8758,14 +8837,17 @@ void InterlockedAdd(__ref uint64_t dest, uint64_t value, out uint64_t original_v __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] -void InterlockedAnd(__ref int dest, int value) +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +void InterlockedAnd(__ref int dest, int value) { __target_switch { case hlsl: __intrinsic_asm "InterlockedAnd"; case cuda: __intrinsic_asm "atomicAnd($0, $1)"; case glsl: __intrinsic_asm "$atomicAnd($A, $1)"; + case metal: + __metalInterlocked_and(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -8775,7 +8857,7 @@ void InterlockedAnd(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAnd(__ref uint dest, uint value) { __target_switch @@ -8783,6 +8865,9 @@ void InterlockedAnd(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedAnd"; case cuda: __intrinsic_asm "atomicAnd((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicAnd($A, $1)"; + case metal: + __metalInterlocked_and(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -8792,14 +8877,17 @@ void InterlockedAnd(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] -void InterlockedAnd(__ref int dest, int value, out int original_value) +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] +void InterlockedAnd(__ref int dest, int value, out int original_value) { __target_switch { case hlsl: __intrinsic_asm "InterlockedAnd"; case cuda: __intrinsic_asm "(*$2 = atomicAnd($0, $1))"; case glsl: __intrinsic_asm "($2 = $atomicAnd($A, $1))"; + case metal: + __metalInterlocked_and(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -8810,7 +8898,7 @@ void InterlockedAnd(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedAnd(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -8818,6 +8906,9 @@ void InterlockedAnd(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedAnd"; case glsl: __intrinsic_asm "($2 = atomicAnd($0, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicAnd((int*)$0, $1))"; + case metal: + __metalInterlocked_and(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -8846,7 +8937,7 @@ void InterlockedAnd(__ref uint64_t dest, uint64_t value, out uint64_t origina } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value) { __target_switch @@ -8854,6 +8945,9 @@ void InterlockedCompareExchange(__ref int dest, int compare_value, int value, case hlsl: __intrinsic_asm "InterlockedCompareExchange"; case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))"; case cuda: __intrinsic_asm "(*$3 = atomicCAS($0, $1, $2))"; + case metal: + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value); + return; case spirv: spirv_asm { @@ -8864,7 +8958,7 @@ void InterlockedCompareExchange(__ref int dest, int compare_value, int value, } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value) { __target_switch @@ -8872,6 +8966,9 @@ void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, case hlsl: __intrinsic_asm "InterlockedCompareExchange"; case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))"; case cuda: __intrinsic_asm "(*$3 = (uint)atomicCAS((int*)$0, $1, $2))"; + case metal: + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value); + return; case spirv: spirv_asm { @@ -8887,6 +8984,9 @@ void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_val __target_switch { case hlsl: __intrinsic_asm "InterlockedCompareExchangeFloatBitwise"; + case metal: + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value); + return; } } @@ -8896,44 +8996,36 @@ void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_val __target_switch { case hlsl: __intrinsic_asm "InterlockedCompareExchangeFloatBitwise"; + case metal: + __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value); + return; } } -[ForceInline] -void InterlockedCompareExchange(__ref int64_t dest, int64_t compare_value, int64_t value) -{ - __target_switch - { - case hlsl: __intrinsic_asm "InterlockedCompareExchange"; - } -} - -[ForceInline] -void InterlockedCompareExchange(__ref int64_t dest, int64_t compare_value, int64_t value, out int64_t original_value) -{ - __target_switch - { - case hlsl: __intrinsic_asm "InterlockedCompareExchange"; - } -} - -[ForceInline] -void InterlockedCompareExchange(__ref uint64_t dest, uint64_t compare_value, uint64_t value) +${{{{ +for (const char* T : {"int64_t", "uint64_t"}) { - __target_switch +}}}} + [ForceInline] + void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value) { - case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + } } -} -[ForceInline] -void InterlockedCompareExchange(__ref uint64_t dest, uint64_t compare_value, uint64_t value, out uint64_t original_value) -{ - __target_switch + [ForceInline] + void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value, out $(T) original_value) { - case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + __target_switch + { + case hlsl: __intrinsic_asm "InterlockedCompareExchange"; + } } -} +${{{{ +} // T +}}}} __glsl_version(430) [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] @@ -8997,7 +9089,7 @@ void InterlockedCompareStore(__ref uint64_t dest, uint64_t compare_value, uint64 } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedExchange(__ref int dest, int value, out int original_value) { __target_switch @@ -9005,6 +9097,9 @@ void InterlockedExchange(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedExchange"; case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicExch($0, $1))"; + case metal: + __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9015,7 +9110,7 @@ void InterlockedExchange(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedExchange(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9023,6 +9118,9 @@ void InterlockedExchange(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedExchange"; case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))"; case cuda: __intrinsic_asm "(*$2 = (uint)atomicExch((int*)$0, $1))"; + case metal: + __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9047,6 +9145,9 @@ void InterlockedExchange(__ref float dest, float value, out float original_va __target_switch { case hlsl: __intrinsic_asm "InterlockedExchange"; + case metal: + __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value); + return; } } @@ -9087,7 +9188,7 @@ void InterlockedExchange(__ref uint64_t dest, uint64_t value, out uint64_t or } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMax(__ref int dest, int value) { __target_switch @@ -9095,6 +9196,9 @@ void InterlockedMax(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedMax"; case glsl: __intrinsic_asm "$atomicMax($A, $1)"; case cuda: __intrinsic_asm "atomicMax($0, $1)"; + case metal: + __metalInterlocked_max(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9104,7 +9208,7 @@ void InterlockedMax(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMax(__ref uint dest, uint value) { __target_switch @@ -9112,6 +9216,9 @@ void InterlockedMax(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedMax"; case glsl: __intrinsic_asm "$atomicMax($A, $1)"; case cuda: __intrinsic_asm "atomicMax((int*)$0, $1)"; + case metal: + __metalInterlocked_max(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9121,7 +9228,7 @@ void InterlockedMax(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMax(__ref int dest, int value, out int original_value) { __target_switch @@ -9129,6 +9236,9 @@ void InterlockedMax(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedMax"; case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicMax($0, $1))"; + case metal: + __metalInterlocked_max(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9139,7 +9249,7 @@ void InterlockedMax(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMax(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9147,6 +9257,9 @@ void InterlockedMax(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedMax"; case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))"; case cuda: __intrinsic_asm "(*$2 = (uint)atomicMax((int*)$0, $1))"; + case metal: + __metalInterlocked_max(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9193,7 +9306,7 @@ void InterlockedMax(__ref uint64_t dest, uint64_t value, out uint64_t origina } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMin(__ref int dest, int value) { __target_switch @@ -9201,6 +9314,9 @@ void InterlockedMin(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedMin"; case glsl: __intrinsic_asm "$atomicMin($A, $1)"; case cuda: __intrinsic_asm "atomicMin($0, $1)"; + case metal: + __metalInterlocked_min(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9210,7 +9326,7 @@ void InterlockedMin(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMin(__ref uint dest, uint value) { __target_switch @@ -9218,6 +9334,9 @@ void InterlockedMin(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedMin"; case glsl: __intrinsic_asm "$atomicMin($A, $1)"; case cuda: __intrinsic_asm "atomicMin((int*)$0, $1)"; + case metal: + __metalInterlocked_min(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9227,7 +9346,7 @@ void InterlockedMin(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMin(__ref int dest, int value, out int original_value) { __target_switch @@ -9235,6 +9354,9 @@ void InterlockedMin(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedMin"; case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicMin($0, $1))"; + case metal: + __metalInterlocked_min(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9245,7 +9367,7 @@ void InterlockedMin(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedMin(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9253,6 +9375,9 @@ void InterlockedMin(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedMin"; case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))"; case cuda: __intrinsic_asm "(*$2 = (uint)atomicMin((int*)$0, $1))"; + case metal: + __metalInterlocked_min(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9299,7 +9424,7 @@ void InterlockedMin(__ref uint64_t dest, uint64_t value, out uint64_t origina } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedOr(__ref int dest, int value) { __target_switch @@ -9307,6 +9432,9 @@ void InterlockedOr(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedOr"; case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicOr($A, $1)"; + case metal: + __metalInterlocked_or(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9316,7 +9444,7 @@ void InterlockedOr(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedOr(__ref uint dest, uint value) { __target_switch @@ -9324,6 +9452,9 @@ void InterlockedOr(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedOr"; case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicOr($A, $1)"; + case metal: + __metalInterlocked_or(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9333,7 +9464,7 @@ void InterlockedOr(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedOr(__ref int dest, int value, out int original_value) { __target_switch @@ -9341,6 +9472,9 @@ void InterlockedOr(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedOr"; case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicOr($0, $1))"; + case metal: + __metalInterlocked_or(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9351,7 +9485,7 @@ void InterlockedOr(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedOr(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9359,6 +9493,9 @@ void InterlockedOr(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedOr"; case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicOr((int*)$0, $1))"; + case metal: + __metalInterlocked_or(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9387,7 +9524,7 @@ void InterlockedOr(__ref uint64_t dest, uint64_t value, out uint64_t original } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedXor(__ref int dest, int value) { __target_switch @@ -9395,6 +9532,9 @@ void InterlockedXor(__ref int dest, int value) case hlsl: __intrinsic_asm "InterlockedXor"; case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicXor($A, $1)"; + case metal: + __metalInterlocked_xor(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9404,7 +9544,7 @@ void InterlockedXor(__ref int dest, int value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedXor(__ref uint dest, uint value) { __target_switch @@ -9412,6 +9552,9 @@ void InterlockedXor(__ref uint dest, uint value) case hlsl: __intrinsic_asm "InterlockedXor"; case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)"; case glsl: __intrinsic_asm "$atomicXor($A, $1)"; + case metal: + __metalInterlocked_xor(__getMetalAtomicRef(dest), value); + return; case spirv: spirv_asm { @@ -9421,7 +9564,7 @@ void InterlockedXor(__ref uint dest, uint value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedXor(__ref int dest, int value, out int original_value) { __target_switch @@ -9429,6 +9572,9 @@ void InterlockedXor(__ref int dest, int value, out int original_value) case hlsl: __intrinsic_asm "InterlockedXor"; case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))"; case cuda: __intrinsic_asm "(*$2 = atomicXor($0, $1))"; + case metal: + __metalInterlocked_xor(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { @@ -9439,7 +9585,7 @@ void InterlockedXor(__ref int dest, int value, out int original_value) } __glsl_version(430) -[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)] +[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)] void InterlockedXor(__ref uint dest, uint value, out uint original_value) { __target_switch @@ -9447,6 +9593,9 @@ void InterlockedXor(__ref uint dest, uint value, out uint original_value) case hlsl: __intrinsic_asm "InterlockedXor"; case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))"; case cuda: __intrinsic_asm "(*$2 = (uint)atomicXor((int*)$0, $1))"; + case metal: + __metalInterlocked_xor(__getMetalAtomicRef(dest), value, original_value); + return; case spirv: spirv_asm { diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp index e9ab58bca..6062875b3 100644 --- a/source/slang/slang-emit-c-like.cpp +++ b/source/slang/slang-emit-c-like.cpp @@ -2889,6 +2889,7 @@ void CLikeSourceEmitter::_emitInst(IRInst* inst) case kIROp_AtomicCounterIncrement: case kIROp_AtomicCounterDecrement: case kIROp_StructuredBufferGetDimensions: + case kIROp_MetalAtomicCast: emitInstStmt(inst); break; diff --git a/source/slang/slang-emit-metal.cpp b/source/slang/slang-emit-metal.cpp index 0a5506776..d38c3de9b 100644 --- a/source/slang/slang-emit-metal.cpp +++ b/source/slang/slang-emit-metal.cpp @@ -253,6 +253,24 @@ bool MetalSourceEmitter::tryEmitInstStmtImpl(IRInst* inst) case kIROp_discard: m_writer->emit("discard_fragment();\n"); return true; + case kIROp_MetalAtomicCast: + { + auto oldValName = getName(inst); + auto op0 = inst->getOperand(0); + + m_writer->emit("atomic_"); + emitType(op0->getDataType()); + m_writer->emit(" "); + m_writer->emit(oldValName); + m_writer->emit(" = "); + + m_writer->emit("((atomic_"); + emitType(op0->getDataType()); + m_writer->emit(")("); + emitOperand(op0, getInfo(EmitOp::General)); + m_writer->emit("));\n"); + return true; + } } return false; } diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h index f639d3343..19117c00e 100644 --- a/source/slang/slang-ir-inst-defs.h +++ b/source/slang/slang-ir-inst-defs.h @@ -685,6 +685,7 @@ INST(GetLegalizedSPIRVGlobalParamAddr, GetLegalizedSPIRVGlobalParamAddr, 1, 0) INST(GetPerVertexInputArray, GetPerVertexInputArray, 1, 0) INST(ForceVarIntoStructTemporarily, ForceVarIntoStructTemporarily, 1, 0) +INST(MetalAtomicCast, MetalAtomicCast, 1, 0) INST(MakeArrayList, makeArrayList, 0, 0) INST(MakeTensorView, makeTensorView, 0, 0) diff --git a/tests/bugs/atomic-coerce.slang b/tests/bugs/atomic-coerce.slang index 2fe927355..bfb0eeb63 100644 --- a/tests/bugs/atomic-coerce.slang +++ b/tests/bugs/atomic-coerce.slang @@ -1,6 +1,6 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj //TEST(compute,vulkan):COMPARE_COMPUTE_EX:-vk -slang -compute -shaderobj -//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl +//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl //TEST_INPUT:ubuffer(data=[0 0 0 0 ], stride=4):out,name outputBuffer RWStructuredBuffer<int> outputBuffer; diff --git a/tests/compute/atomics-groupshared.slang b/tests/compute/atomics-groupshared.slang index fcfc9c8d7..a01f7bf6a 100644 --- a/tests/compute/atomics-groupshared.slang +++ b/tests/compute/atomics-groupshared.slang @@ -4,7 +4,7 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -vk -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -shaderobj -//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl +//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl //TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer diff --git a/tests/compute/atomics.slang b/tests/compute/atomics.slang index b00f437f5..ee02c623f 100644 --- a/tests/compute/atomics.slang +++ b/tests/compute/atomics.slang @@ -4,7 +4,7 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -vk -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -shaderobj -//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl +//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl //TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out, name outputBuffer diff --git a/tests/metal/atomic-intrinsics.slang b/tests/metal/atomic-intrinsics.slang new file mode 100644 index 000000000..3533ea2aa --- /dev/null +++ b/tests/metal/atomic-intrinsics.slang @@ -0,0 +1,352 @@ +//TEST:SIMPLE(filecheck=MTL):-target metal -entry computeMain -stage compute -DMETAL +//TEST:SIMPLE(filecheck=LIB):-target metallib -entry computeMain -stage compute -DMETAL +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type +//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -emit-spirv-directly -compute -shaderobj -output-using-type + +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj -output-using-type +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj -output-using-type + + +//TEST_INPUT:ubuffer(data=[0 1 2 3], stride=4):name=uintBuffer +RWStructuredBuffer<uint> uintBuffer; +//TEST_INPUT:ubuffer(data=[0 1 2 3], stride=4):name=intBuffer +RWStructuredBuffer<int> intBuffer; + +groupshared uint shareMemUI[4]; +groupshared int shareMemI[4]; + +//TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<float> outputBuffer; + +[numthreads(4, 1, 1)] +void computeMain(uint groupIndex : SV_GroupIndex) +{ + if (groupIndex == 0) + { + for (int i = 0; i < 4; ++i) + { + shareMemUI[i] = 0U; + shareMemI[i] = 0; + } + } + AllMemoryBarrierWithGroupSync(); + + int idx = groupIndex; + float val = 0.0f; + + // InterlockedAdd + //MTL: atomic_uint threadgroup* {{.*}}shareMemUI + //LIB: call {{.*}}.atomic.local.add.u.i32 + InterlockedAdd(shareMemUI[idx], uint(1)); + val += shareMemUI[idx]; + + //MTL: atomic_int threadgroup* {{.*}}shareMemI + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], 2); + val += shareMemI[idx]; + + //MTL: atomic_uint device* {{.*}}uintBuffer + //LIB: call {{.*}}.atomic.global.add.u.i32 + InterlockedAdd(uintBuffer[idx], 1); + val += uintBuffer[idx]; + + //MTL: atomic_int device* {{.*}}intBuffer + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], 2); + val += intBuffer[idx]; + + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], -1); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], -1); + val += intBuffer[idx]; + + // InterlockedAdd - original_value + uint origui = 0; + //LIB: call {{.*}}.atomic.local.add.u.i32 + InterlockedAdd(shareMemUI[idx], 1, origui); + val += shareMemUI[idx]; + val += origui; + + int origi = 0; + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], 2, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.add.u.i32 + InterlockedAdd(uintBuffer[idx], 1, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], 2, origi); + val += intBuffer[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], -1, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], -1, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedAnd + //LIB: call {{.*}}.atomic.local.and.u.i32 + InterlockedAnd(shareMemUI[idx], 255); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.and.s.i32 + InterlockedAnd(shareMemI[idx], 255); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.and.u.i32 + InterlockedAnd(uintBuffer[idx], 255); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.and.s.i32 + InterlockedAnd(intBuffer[idx], 255); + val += intBuffer[idx]; + + // InterlockedAnd - original_value + //LIB: call {{.*}}.atomic.local.and.u.i32 + InterlockedAnd(shareMemUI[idx], 255, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.and.s.i32 + InterlockedAnd(shareMemI[idx], 255, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.and.u.i32 + InterlockedAnd(uintBuffer[idx], 255, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.and.s.i32 + InterlockedAnd(intBuffer[idx], 255, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedCompareExchange + //LIB: call {{.*}}.atomic.local.cmpxchg.weak.i32 + InterlockedCompareExchange(shareMemUI[idx], 1, 0, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.cmpxchg.weak.i32 + InterlockedCompareExchange(shareMemI[idx], 1, 0, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.cmpxchg.weak.i32 + InterlockedCompareExchange(uintBuffer[idx], 1, 0, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.cmpxchg.weak.i32 + InterlockedCompareExchange(intBuffer[idx], 1, 0, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedCompareStore is not supported by Metal +#if !defined(METAL) + InterlockedCompareStore(shareMemUI[idx], 255, 0); + val += shareMemUI[idx]; + + InterlockedCompareStore(shareMemI[idx], 255, 0); + val += shareMemI[idx]; + + InterlockedCompareStore(uintBuffer[idx], 255, 0); + val += uintBuffer[idx]; + + InterlockedCompareStore(intBuffer[idx], 255, 0); + val += intBuffer[idx]; +#endif + + // InterlockedExchange + //LIB: call {{.*}}.atomic.local.xchg.i32 + InterlockedExchange(shareMemUI[idx], 1, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.xchg.i32 + InterlockedExchange(shareMemI[idx], 1, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.xchg.i32 + InterlockedExchange(uintBuffer[idx], 1, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.xchg.i32 + InterlockedExchange(intBuffer[idx], 1, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedMax + //LIB: call {{.*}}.atomic.local.max.u.i32 + InterlockedMax(shareMemUI[idx], 0); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.max.s.i32 + InterlockedMax(shareMemI[idx], 0); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.max.u.i32 + InterlockedMax(uintBuffer[idx], 0); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.max.s.i32 + InterlockedMax(intBuffer[idx], 0); + val += intBuffer[idx]; + + // InterlockedMax - original_value + //LIB: call {{.*}}.atomic.local.max.u.i32 + InterlockedMax(shareMemUI[idx], 0, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.max.s.i32 + InterlockedMax(shareMemI[idx], 0, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.max.u.i32 + InterlockedMax(uintBuffer[idx], 0, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.max.s.i32 + InterlockedMax(intBuffer[idx], 0, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedMin + //LIB: call {{.*}}.atomic.local.min.u.i32 + InterlockedMin(shareMemUI[idx], 0); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.min.s.i32 + InterlockedMin(shareMemI[idx], 0); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.min.u.i32 + InterlockedMin(uintBuffer[idx], 0); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.min.s.i32 + InterlockedMin(intBuffer[idx], 0); + val += intBuffer[idx]; + + // InterlockedMin - original_value + //LIB: call {{.*}}.atomic.local.min.u.i32 + InterlockedMin(shareMemUI[idx], 0, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.min.s.i32 + InterlockedMin(shareMemI[idx], 0, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.min.u.i32 + InterlockedMin(uintBuffer[idx], 0, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.min.s.i32 + InterlockedMin(intBuffer[idx], 0, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedOr + //LIB: call {{.*}}.atomic.local.or.u.i32 + InterlockedOr(shareMemUI[idx], 2); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.or.s.i32 + InterlockedOr(shareMemI[idx], 4); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.or.u.i32 + InterlockedOr(uintBuffer[idx], 6); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.or.s.i32 + InterlockedOr(intBuffer[idx], 8); + val += intBuffer[idx]; + + // InterlockedOr - original_value + //LIB: call {{.*}}.atomic.local.or.u.i32 + InterlockedOr(shareMemUI[idx], 2, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.or.s.i32 + InterlockedOr(shareMemI[idx], 4, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.or.u.i32 + InterlockedOr(uintBuffer[idx], 6, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.or.s.i32 + InterlockedOr(intBuffer[idx], 8, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedXor + //LIB: call {{.*}}.atomic.local.xor.u.i32 + InterlockedXor(shareMemUI[idx], 2); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.xor.s.i32 + InterlockedXor(shareMemI[idx], 4); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.xor.u.i32 + InterlockedXor(uintBuffer[idx], 6); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.xor.s.i32 + InterlockedXor(intBuffer[idx], 8); + val += intBuffer[idx]; + + // InterlockedXor - original_value + //LIB: call {{.*}}.atomic.local.xor.u.i32 + InterlockedXor(shareMemUI[idx], 2, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.xor.s.i32 + InterlockedXor(shareMemI[idx], 4, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.xor.u.i32 + InterlockedXor(uintBuffer[idx], 6, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.xor.s.i32 + InterlockedXor(intBuffer[idx], 8, origi); + val += intBuffer[idx]; + val += origi; + + outputBuffer[idx] = val; +} + +// CHK: 184 +// CHK: 207 +// CHK: 230 +// CHK: 253 |
