summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- source/slang/hlsl.meta.slang 297
-rw-r--r-- source/slang/slang-emit-c-like.cpp 1
-rw-r--r-- source/slang/slang-emit-metal.cpp 18
-rw-r--r-- source/slang/slang-ir-inst-defs.h 1
-rw-r--r-- tests/bugs/atomic-coerce.slang 2
-rw-r--r-- tests/compute/atomics-groupshared.slang 2
-rw-r--r-- tests/compute/atomics.slang 2
-rw-r--r-- tests/metal/atomic-intrinsics.slang 352
8 files changed, 598 insertions, 77 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 82ef5837e..597e4dc06 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -4633,7 +4633,7 @@ ${{{{
// Added operations:
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedAdd(
UINT dest,
UINT value,
@@ -4644,6 +4644,7 @@ ${{{{
case glsl: __intrinsic_asm "($3 = atomicAdd($0._data[$1/4], $2))";
case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint32_t>($1), $2))";
case hlsl: __intrinsic_asm ".InterlockedAdd";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedAdd(buf[dest / 4], value, original_value);
@@ -4651,7 +4652,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedAdd(
UINT dest,
UINT value)
@@ -4661,6 +4662,7 @@ ${{{{
case glsl: __intrinsic_asm "atomicAdd($0._data[$1/4], $2)";
case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<uint32_t>($1), $2)";
case hlsl: __intrinsic_asm ".InterlockedAdd";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedAdd(buf[dest / 4], value);
@@ -4668,7 +4670,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedAnd(
UINT dest,
UINT value,
@@ -4679,6 +4681,7 @@ ${{{{
case glsl: __intrinsic_asm "$3 = atomicAnd($0._data[$1/4], $2)";
case cuda: __intrinsic_asm "(*$3 = atomicAnd($0._getPtrAt<uint32_t>($1), $2))";
case hlsl: __intrinsic_asm ".InterlockedAnd";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedAnd(buf[dest / 4], value, original_value);
@@ -4686,7 +4689,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedAnd(
UINT dest,
UINT value)
@@ -4696,6 +4699,7 @@ ${{{{
case glsl: __intrinsic_asm "atomicAnd($0._data[$1/4], $2)";
case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt<uint32_t>($1), $2)";
case hlsl: __intrinsic_asm ".InterlockedAnd";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedAnd(buf[dest / 4], value);
@@ -4703,7 +4707,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedCompareExchange(
UINT dest,
UINT compare_value,
@@ -4715,6 +4719,7 @@ ${{{{
case glsl: __intrinsic_asm "($4 = atomicCompSwap($0._data[$1/4], $2, $3))";
case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3))";
case hlsl: __intrinsic_asm ".InterlockedCompareExchange";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value);
@@ -4740,7 +4745,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedExchange(
UINT dest,
UINT value,
@@ -4751,6 +4756,7 @@ ${{{{
case glsl: __intrinsic_asm "($3 = atomicExchange($0._data[$1/4], $2))";
case cuda: __intrinsic_asm "(*$3 = atomicExch($0._getPtrAt<uint32_t>($1), $2))";
case hlsl: __intrinsic_asm ".InterlockedExchange";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedExchange(buf[dest / 4], value, original_value);
@@ -4758,7 +4764,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedMax(
UINT dest,
UINT value,
@@ -4769,6 +4775,7 @@ ${{{{
case glsl: __intrinsic_asm "($3 = atomicMax($0._data[$1/4], $2))";
case cuda: __intrinsic_asm "(*$3 = atomicMax($0._getPtrAt<uint32_t>($1), $2))";
case hlsl: __intrinsic_asm ".InterlockedMax";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedMax(buf[dest / 4], value, original_value);
@@ -4776,7 +4783,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedMax(
UINT dest,
UINT value)
@@ -4786,6 +4793,7 @@ ${{{{
case glsl: __intrinsic_asm "atomicMax($0._data[$1/4], $2)";
case cuda: __intrinsic_asm "atomicMax($0._getPtrAt<uint32_t>($1), $2)";
case hlsl: __intrinsic_asm ".InterlockedMax";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedMax(buf[dest / 4], value);
@@ -4793,7 +4801,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedMin(
UINT dest,
UINT value,
@@ -4804,6 +4812,7 @@ ${{{{
case glsl: __intrinsic_asm "($3 = atomicMin($0._data[$1/4], $2))";
case cuda: __intrinsic_asm "(*$3 = atomicMin($0._getPtrAt<uint32_t>($1), $2))";
case hlsl: __intrinsic_asm ".InterlockedMin";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedMin(buf[dest / 4], value, original_value);
@@ -4811,7 +4820,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedMin(
UINT dest,
UINT value)
@@ -4821,6 +4830,7 @@ ${{{{
case glsl: __intrinsic_asm "atomicMin($0._data[$1/4], $2)";
case cuda: __intrinsic_asm "atomicMin($0._getPtrAt<uint32_t>($1), $2)";
case hlsl: __intrinsic_asm ".InterlockedMin";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedMin(buf[dest / 4], value);
@@ -4828,7 +4838,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedOr(
UINT dest,
UINT value,
@@ -4839,6 +4849,7 @@ ${{{{
case glsl: __intrinsic_asm "($3 = atomicOr($0._data[$1/4], $2))";
case cuda: __intrinsic_asm "(*$3 = atomicOr($0._getPtrAt<uint32_t>($1), $2))";
case hlsl: __intrinsic_asm ".InterlockedOr";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedOr(buf[dest / 4], value, original_value);
@@ -4846,7 +4857,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedOr(
UINT dest,
UINT value)
@@ -4856,6 +4867,7 @@ ${{{{
case glsl: __intrinsic_asm "atomicOr($0._data[$1/4], $2)";
case cuda: __intrinsic_asm "atomicOr($0._getPtrAt<uint32_t>($1), $2)";
case hlsl: __intrinsic_asm ".InterlockedOr";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedOr(buf[dest / 4], value);
@@ -4863,7 +4875,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedXor(
UINT dest,
UINT value,
@@ -4874,6 +4886,7 @@ ${{{{
case glsl: __intrinsic_asm "($3 = atomicXor($0._data[$1/4], $2))";
case cuda: __intrinsic_asm "(*$3 = atomicXor($0._getPtrAt<uint32_t>($1), $2))";
case hlsl: __intrinsic_asm ".InterlockedXor";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedXor(buf[dest / 4], value, original_value);
@@ -4881,7 +4894,7 @@ ${{{{
}
[ForceInline]
- [require(cuda_glsl_hlsl_spirv, byteaddressbuffer_rw)]
+ [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
void InterlockedXor(
UINT dest,
UINT value)
@@ -4891,6 +4904,7 @@ ${{{{
case glsl: __intrinsic_asm "atomicXor($0._data[$1/4], $2)";
case cuda: __intrinsic_asm "atomicXor($0._getPtrAt<uint32_t>($1), $2)";
case hlsl: __intrinsic_asm ".InterlockedXor";
+ case metal:
case spirv:
let buf = __getEquivalentStructuredBuffer<uint>(this);
::InterlockedXor(buf[dest / 4], value);
@@ -8596,9 +8610,62 @@ void GroupMemoryBarrierWithGroupSync()
// Atomics
+__generic<T> // Generic over T, the type of the referenced value.
+__intrinsic_op($(kIROp_MetalAtomicCast)) // No Slang body: calls map straight onto the MetalAtomicCast IR opcode.
+[require(metal)] // Restricted to the Metal target.
+T* __getMetalAtomicRef(__ref T x); // Takes x by reference and is declared to return T*. The Metal emitter hunk in this same change prints the opcode as a cast to an atomic_-prefixed type; every Metal case in the Interlocked* functions passes the result as the dest argument of a __metalInterlocked_* helper.
+
+${{{{
+for (const char* fetchAndModify : {"add", "and", "max", "min", "or", "sub", "xor"}) // Generator loop: the two helper overloads below are stamped out once per operation name in this list.
+{
+}}}}
+ __generic<AtomicType, T> // AtomicType: type of the destination handle passed as dest; T: type of the operand value.
+ [ForceInline]
+ [require(metal)]
+ void __metalInterlocked_$(fetchAndModify)(AtomicType dest, T value) // Overload that discards the value fetched by the atomic operation.
+ {
+ __intrinsic_asm "atomic_fetch_$(fetchAndModify)_explicit($0, $1, memory_order_relaxed)"; // Emits the atomic_fetch_<op>_explicit call on (dest, value) with relaxed memory ordering.
+ }
+
+ __generic<AtomicType, T>
+ [ForceInline]
+ [require(metal)]
+ void __metalInterlocked_$(fetchAndModify)(AtomicType dest, T value, out T original_value) // Overload that also hands the fetched value back through original_value.
+ {
+ __intrinsic_asm "((*($2)) = (($[0])(atomic_fetch_$(fetchAndModify)_explicit($0, $1, memory_order_relaxed))))", T; // Casts the fetch result and stores it through operand 2 (original_value). NOTE(review): the trailing T presumably supplies the type substituted for the bracketed placeholder - confirm against the intrinsic-asm expander.
+ }
+${{{{
+} // fetchAndModify
+}}}}
+
+__generic<AtomicType, T> // AtomicType: type of the destination handle passed as dest; T: type of the exchanged value.
[ForceInline]
+[require(metal)] // Restricted to the Metal target.
+void __metalInterlocked_exchange(AtomicType dest, T value, out T original_value) // Atomic exchange helper: writes value into dest and reports the previous contents through original_value.
+{
+ __intrinsic_asm "((*($2)) = (($[0])(atomic_exchange_explicit($0, $1, memory_order_relaxed))))", T; // Emits atomic_exchange_explicit with relaxed ordering, casts the result and stores it through operand 2. NOTE(review): the trailing T presumably supplies the type for the bracketed placeholder - confirm.
+}
+
+__generic<AtomicType, T> // AtomicType: type of the destination handle passed as dest; T: type of the compared and stored values.
+[ForceInline]
+[require(metal)] // Restricted to the Metal target.
+void __metalInterlocked_compare_exchange(AtomicType dest, __ref T compare_value, T value) // Three-argument form: compare_value is taken by reference so the emitted call receives its address as the expected-value argument. The call's boolean result is not surfaced (return type is void).
+{
+ __intrinsic_asm "atomic_compare_exchange_weak_explicit($0, $1, $2, memory_order_relaxed, memory_order_relaxed)"; // Uses the weak compare-exchange with relaxed ordering for both the success and failure cases.
+}
+
+__generic<AtomicType, T> // AtomicType: type of the destination handle passed as dest; T: type of the compared and stored values.
+[ForceInline]
+[require(metal)] // Restricted to the Metal target.
+void __metalInterlocked_compare_exchange(AtomicType dest, T compare_value, T value, out T original_value) // Four-argument form: performs the compare-exchange and reports a value through original_value.
+{
+ __metalInterlocked_compare_exchange(dest, compare_value, value); // Delegates to the three-argument overload, which takes compare_value by reference and may therefore overwrite this local copy.
+ original_value = compare_value; // Reports whatever compare_value holds after the call. NOTE(review): the underlying call is the weak compare-exchange; if it can fail spuriously, compare_value would still equal the caller's expected value although no store happened - confirm whether a retry loop is needed to match InterlockedCompareExchange semantics on the other targets.
+}
+
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[ForceInline]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedAdd(__ref int dest, int value)
{
__target_switch
@@ -8606,6 +8673,9 @@ void InterlockedAdd(__ref int dest, int value)
case hlsl: __intrinsic_asm "InterlockedAdd";
case cuda: __intrinsic_asm "atomicAdd($0, $1)";
case glsl: __intrinsic_asm "$atomicAdd($A, $1)";
+ case metal:
+ __metalInterlocked_add(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -8616,7 +8686,7 @@ void InterlockedAdd(__ref int dest, int value)
[ForceInline]
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedAdd(__ref uint dest, uint value)
{
__target_switch
@@ -8624,6 +8694,9 @@ void InterlockedAdd(__ref uint dest, uint value)
case hlsl: __intrinsic_asm "InterlockedAdd";
case cuda: __intrinsic_asm "atomicAdd((int*)$0, $1)";
case glsl: __intrinsic_asm "$atomicAdd($A, $1)";
+ case metal:
+ __metalInterlocked_add(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -8640,14 +8713,17 @@ void InterlockedAdd(__ref uint dest, int value)
[ForceInline]
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
-void InterlockedAdd(__ref int dest, int value, out int original_value)
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
+void InterlockedAdd(__ref int dest, int value, out int original_value)
{
__target_switch
{
case hlsl: __intrinsic_asm "InterlockedAdd";
case cuda: __intrinsic_asm "(*$2 = atomicAdd($0, $1))";
case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))";
+ case metal:
+ __metalInterlocked_add(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -8659,7 +8735,7 @@ void InterlockedAdd(__ref int dest, int value, out int original_value)
[ForceInline]
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedAdd(__ref uint dest, uint value, out uint original_value)
{
__target_switch
@@ -8667,6 +8743,9 @@ void InterlockedAdd(__ref uint dest, uint value, out uint original_value)
case hlsl: __intrinsic_asm "InterlockedAdd";
case cuda: __intrinsic_asm "(*$2 = (uint)atomicAdd((int*)$0, $1))";
case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))";
+ case metal:
+ __metalInterlocked_add(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -8758,14 +8837,17 @@ void InterlockedAdd(__ref uint64_t dest, uint64_t value, out uint64_t original_v
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
-void InterlockedAnd(__ref int dest, int value)
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
+void InterlockedAnd(__ref int dest, int value)
{
__target_switch
{
case hlsl: __intrinsic_asm "InterlockedAnd";
case cuda: __intrinsic_asm "atomicAnd($0, $1)";
case glsl: __intrinsic_asm "$atomicAnd($A, $1)";
+ case metal:
+ __metalInterlocked_and(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -8775,7 +8857,7 @@ void InterlockedAnd(__ref int dest, int value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedAnd(__ref uint dest, uint value)
{
__target_switch
@@ -8783,6 +8865,9 @@ void InterlockedAnd(__ref uint dest, uint value)
case hlsl: __intrinsic_asm "InterlockedAnd";
case cuda: __intrinsic_asm "atomicAnd((int*)$0, $1)";
case glsl: __intrinsic_asm "$atomicAnd($A, $1)";
+ case metal:
+ __metalInterlocked_and(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -8792,14 +8877,17 @@ void InterlockedAnd(__ref uint dest, uint value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
-void InterlockedAnd(__ref int dest, int value, out int original_value)
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
+void InterlockedAnd(__ref int dest, int value, out int original_value)
{
__target_switch
{
case hlsl: __intrinsic_asm "InterlockedAnd";
case cuda: __intrinsic_asm "(*$2 = atomicAnd($0, $1))";
case glsl: __intrinsic_asm "($2 = $atomicAnd($A, $1))";
+ case metal:
+ __metalInterlocked_and(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -8810,7 +8898,7 @@ void InterlockedAnd(__ref int dest, int value, out int original_value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedAnd(__ref uint dest, uint value, out uint original_value)
{
__target_switch
@@ -8818,6 +8906,9 @@ void InterlockedAnd(__ref uint dest, uint value, out uint original_value)
case hlsl: __intrinsic_asm "InterlockedAnd";
case glsl: __intrinsic_asm "($2 = atomicAnd($0, $1))";
case cuda: __intrinsic_asm "(*$2 = atomicAnd((int*)$0, $1))";
+ case metal:
+ __metalInterlocked_and(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -8846,7 +8937,7 @@ void InterlockedAnd(__ref uint64_t dest, uint64_t value, out uint64_t origina
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value)
{
__target_switch
@@ -8854,6 +8945,9 @@ void InterlockedCompareExchange(__ref int dest, int compare_value, int value,
case hlsl: __intrinsic_asm "InterlockedCompareExchange";
case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))";
case cuda: __intrinsic_asm "(*$3 = atomicCAS($0, $1, $2))";
+ case metal:
+ __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -8864,7 +8958,7 @@ void InterlockedCompareExchange(__ref int dest, int compare_value, int value,
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value)
{
__target_switch
@@ -8872,6 +8966,9 @@ void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value,
case hlsl: __intrinsic_asm "InterlockedCompareExchange";
case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))";
case cuda: __intrinsic_asm "(*$3 = (uint)atomicCAS((int*)$0, $1, $2))";
+ case metal:
+ __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -8887,6 +8984,9 @@ void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_val
__target_switch
{
case hlsl: __intrinsic_asm "InterlockedCompareExchangeFloatBitwise";
+ case metal:
+ __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value);
+ return;
}
}
@@ -8896,44 +8996,36 @@ void InterlockedCompareExchangeFloatBitwise(__ref float dest, float compare_val
__target_switch
{
case hlsl: __intrinsic_asm "InterlockedCompareExchangeFloatBitwise";
+ case metal:
+ __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value);
+ return;
}
}
-[ForceInline]
-void InterlockedCompareExchange(__ref int64_t dest, int64_t compare_value, int64_t value)
-{
- __target_switch
- {
- case hlsl: __intrinsic_asm "InterlockedCompareExchange";
- }
-}
-
-[ForceInline]
-void InterlockedCompareExchange(__ref int64_t dest, int64_t compare_value, int64_t value, out int64_t original_value)
-{
- __target_switch
- {
- case hlsl: __intrinsic_asm "InterlockedCompareExchange";
- }
-}
-
-[ForceInline]
-void InterlockedCompareExchange(__ref uint64_t dest, uint64_t compare_value, uint64_t value)
+${{{{
+for (const char* T : {"int64_t", "uint64_t"})
{
- __target_switch
+}}}}
+ [ForceInline]
+ void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value)
{
- case hlsl: __intrinsic_asm "InterlockedCompareExchange";
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm "InterlockedCompareExchange";
+ }
}
-}
-[ForceInline]
-void InterlockedCompareExchange(__ref uint64_t dest, uint64_t compare_value, uint64_t value, out uint64_t original_value)
-{
- __target_switch
+ [ForceInline]
+ void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value, out $(T) original_value)
{
- case hlsl: __intrinsic_asm "InterlockedCompareExchange";
+ __target_switch
+ {
+ case hlsl: __intrinsic_asm "InterlockedCompareExchange";
+ }
}
-}
+${{{{
+} // T
+}}}}
__glsl_version(430)
[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
@@ -8997,7 +9089,7 @@ void InterlockedCompareStore(__ref uint64_t dest, uint64_t compare_value, uint64
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedExchange(__ref int dest, int value, out int original_value)
{
__target_switch
@@ -9005,6 +9097,9 @@ void InterlockedExchange(__ref int dest, int value, out int original_value)
case hlsl: __intrinsic_asm "InterlockedExchange";
case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))";
case cuda: __intrinsic_asm "(*$2 = atomicExch($0, $1))";
+ case metal:
+ __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -9015,7 +9110,7 @@ void InterlockedExchange(__ref int dest, int value, out int original_value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedExchange(__ref uint dest, uint value, out uint original_value)
{
__target_switch
@@ -9023,6 +9118,9 @@ void InterlockedExchange(__ref uint dest, uint value, out uint original_value)
case hlsl: __intrinsic_asm "InterlockedExchange";
case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))";
case cuda: __intrinsic_asm "(*$2 = (uint)atomicExch((int*)$0, $1))";
+ case metal:
+ __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -9047,6 +9145,9 @@ void InterlockedExchange(__ref float dest, float value, out float original_va
__target_switch
{
case hlsl: __intrinsic_asm "InterlockedExchange";
+ case metal:
+ __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value);
+ return;
}
}
@@ -9087,7 +9188,7 @@ void InterlockedExchange(__ref uint64_t dest, uint64_t value, out uint64_t or
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedMax(__ref int dest, int value)
{
__target_switch
@@ -9095,6 +9196,9 @@ void InterlockedMax(__ref int dest, int value)
case hlsl: __intrinsic_asm "InterlockedMax";
case glsl: __intrinsic_asm "$atomicMax($A, $1)";
case cuda: __intrinsic_asm "atomicMax($0, $1)";
+ case metal:
+ __metalInterlocked_max(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -9104,7 +9208,7 @@ void InterlockedMax(__ref int dest, int value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedMax(__ref uint dest, uint value)
{
__target_switch
@@ -9112,6 +9216,9 @@ void InterlockedMax(__ref uint dest, uint value)
case hlsl: __intrinsic_asm "InterlockedMax";
case glsl: __intrinsic_asm "$atomicMax($A, $1)";
case cuda: __intrinsic_asm "atomicMax((int*)$0, $1)";
+ case metal:
+ __metalInterlocked_max(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -9121,7 +9228,7 @@ void InterlockedMax(__ref uint dest, uint value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedMax(__ref int dest, int value, out int original_value)
{
__target_switch
@@ -9129,6 +9236,9 @@ void InterlockedMax(__ref int dest, int value, out int original_value)
case hlsl: __intrinsic_asm "InterlockedMax";
case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))";
case cuda: __intrinsic_asm "(*$2 = atomicMax($0, $1))";
+ case metal:
+ __metalInterlocked_max(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -9139,7 +9249,7 @@ void InterlockedMax(__ref int dest, int value, out int original_value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedMax(__ref uint dest, uint value, out uint original_value)
{
__target_switch
@@ -9147,6 +9257,9 @@ void InterlockedMax(__ref uint dest, uint value, out uint original_value)
case hlsl: __intrinsic_asm "InterlockedMax";
case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))";
case cuda: __intrinsic_asm "(*$2 = (uint)atomicMax((int*)$0, $1))";
+ case metal:
+ __metalInterlocked_max(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -9193,7 +9306,7 @@ void InterlockedMax(__ref uint64_t dest, uint64_t value, out uint64_t origina
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedMin(__ref int dest, int value)
{
__target_switch
@@ -9201,6 +9314,9 @@ void InterlockedMin(__ref int dest, int value)
case hlsl: __intrinsic_asm "InterlockedMin";
case glsl: __intrinsic_asm "$atomicMin($A, $1)";
case cuda: __intrinsic_asm "atomicMin($0, $1)";
+ case metal:
+ __metalInterlocked_min(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -9210,7 +9326,7 @@ void InterlockedMin(__ref int dest, int value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedMin(__ref uint dest, uint value)
{
__target_switch
@@ -9218,6 +9334,9 @@ void InterlockedMin(__ref uint dest, uint value)
case hlsl: __intrinsic_asm "InterlockedMin";
case glsl: __intrinsic_asm "$atomicMin($A, $1)";
case cuda: __intrinsic_asm "atomicMin((int*)$0, $1)";
+ case metal:
+ __metalInterlocked_min(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -9227,7 +9346,7 @@ void InterlockedMin(__ref uint dest, uint value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedMin(__ref int dest, int value, out int original_value)
{
__target_switch
@@ -9235,6 +9354,9 @@ void InterlockedMin(__ref int dest, int value, out int original_value)
case hlsl: __intrinsic_asm "InterlockedMin";
case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))";
case cuda: __intrinsic_asm "(*$2 = atomicMin($0, $1))";
+ case metal:
+ __metalInterlocked_min(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -9245,7 +9367,7 @@ void InterlockedMin(__ref int dest, int value, out int original_value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedMin(__ref uint dest, uint value, out uint original_value)
{
__target_switch
@@ -9253,6 +9375,9 @@ void InterlockedMin(__ref uint dest, uint value, out uint original_value)
case hlsl: __intrinsic_asm "InterlockedMin";
case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))";
case cuda: __intrinsic_asm "(*$2 = (uint)atomicMin((int*)$0, $1))";
+ case metal:
+ __metalInterlocked_min(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -9299,7 +9424,7 @@ void InterlockedMin(__ref uint64_t dest, uint64_t value, out uint64_t origina
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedOr(__ref int dest, int value)
{
__target_switch
@@ -9307,6 +9432,9 @@ void InterlockedOr(__ref int dest, int value)
case hlsl: __intrinsic_asm "InterlockedOr";
case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)";
case glsl: __intrinsic_asm "$atomicOr($A, $1)";
+ case metal:
+ __metalInterlocked_or(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -9316,7 +9444,7 @@ void InterlockedOr(__ref int dest, int value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedOr(__ref uint dest, uint value)
{
__target_switch
@@ -9324,6 +9452,9 @@ void InterlockedOr(__ref uint dest, uint value)
case hlsl: __intrinsic_asm "InterlockedOr";
case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)";
case glsl: __intrinsic_asm "$atomicOr($A, $1)";
+ case metal:
+ __metalInterlocked_or(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -9333,7 +9464,7 @@ void InterlockedOr(__ref uint dest, uint value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedOr(__ref int dest, int value, out int original_value)
{
__target_switch
@@ -9341,6 +9472,9 @@ void InterlockedOr(__ref int dest, int value, out int original_value)
case hlsl: __intrinsic_asm "InterlockedOr";
case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))";
case cuda: __intrinsic_asm "(*$2 = atomicOr($0, $1))";
+ case metal:
+ __metalInterlocked_or(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -9351,7 +9485,7 @@ void InterlockedOr(__ref int dest, int value, out int original_value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedOr(__ref uint dest, uint value, out uint original_value)
{
__target_switch
@@ -9359,6 +9493,9 @@ void InterlockedOr(__ref uint dest, uint value, out uint original_value)
case hlsl: __intrinsic_asm "InterlockedOr";
case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))";
case cuda: __intrinsic_asm "(*$2 = atomicOr((int*)$0, $1))";
+ case metal:
+ __metalInterlocked_or(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -9387,7 +9524,7 @@ void InterlockedOr(__ref uint64_t dest, uint64_t value, out uint64_t original
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedXor(__ref int dest, int value)
{
__target_switch
@@ -9395,6 +9532,9 @@ void InterlockedXor(__ref int dest, int value)
case hlsl: __intrinsic_asm "InterlockedXor";
case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)";
case glsl: __intrinsic_asm "$atomicXor($A, $1)";
+ case metal:
+ __metalInterlocked_xor(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -9404,7 +9544,7 @@ void InterlockedXor(__ref int dest, int value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedXor(__ref uint dest, uint value)
{
__target_switch
@@ -9412,6 +9552,9 @@ void InterlockedXor(__ref uint dest, uint value)
case hlsl: __intrinsic_asm "InterlockedXor";
case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)";
case glsl: __intrinsic_asm "$atomicXor($A, $1)";
+ case metal:
+ __metalInterlocked_xor(__getMetalAtomicRef(dest), value);
+ return;
case spirv:
spirv_asm
{
@@ -9421,7 +9564,7 @@ void InterlockedXor(__ref uint dest, uint value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedXor(__ref int dest, int value, out int original_value)
{
__target_switch
@@ -9429,6 +9572,9 @@ void InterlockedXor(__ref int dest, int value, out int original_value)
case hlsl: __intrinsic_asm "InterlockedXor";
case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))";
case cuda: __intrinsic_asm "(*$2 = atomicXor($0, $1))";
+ case metal:
+ __metalInterlocked_xor(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
@@ -9439,7 +9585,7 @@ void InterlockedXor(__ref int dest, int value, out int original_value)
}
__glsl_version(430)
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda)]
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda)]
void InterlockedXor(__ref uint dest, uint value, out uint original_value)
{
__target_switch
@@ -9447,6 +9593,9 @@ void InterlockedXor(__ref uint dest, uint value, out uint original_value)
case hlsl: __intrinsic_asm "InterlockedXor";
case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))";
case cuda: __intrinsic_asm "(*$2 = (uint)atomicXor((int*)$0, $1))";
+ case metal:
+ __metalInterlocked_xor(__getMetalAtomicRef(dest), value, original_value);
+ return;
case spirv:
spirv_asm
{
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index e9ab58bca..6062875b3 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -2889,6 +2889,7 @@ void CLikeSourceEmitter::_emitInst(IRInst* inst)
case kIROp_AtomicCounterIncrement:
case kIROp_AtomicCounterDecrement:
case kIROp_StructuredBufferGetDimensions:
+ case kIROp_MetalAtomicCast:
emitInstStmt(inst);
break;
diff --git a/source/slang/slang-emit-metal.cpp b/source/slang/slang-emit-metal.cpp
index 0a5506776..d38c3de9b 100644
--- a/source/slang/slang-emit-metal.cpp
+++ b/source/slang/slang-emit-metal.cpp
@@ -253,6 +253,24 @@ bool MetalSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
case kIROp_discard:
m_writer->emit("discard_fragment();\n");
return true;
+ case kIROp_MetalAtomicCast:
+ {
+ auto oldValName = getName(inst);
+ auto op0 = inst->getOperand(0);
+
+ m_writer->emit("atomic_");
+ emitType(op0->getDataType());
+ m_writer->emit(" ");
+ m_writer->emit(oldValName);
+ m_writer->emit(" = ");
+
+ m_writer->emit("((atomic_");
+ emitType(op0->getDataType());
+ m_writer->emit(")(");
+ emitOperand(op0, getInfo(EmitOp::General));
+ m_writer->emit("));\n");
+ return true;
+ }
}
return false;
}
diff --git a/source/slang/slang-ir-inst-defs.h b/source/slang/slang-ir-inst-defs.h
index f639d3343..19117c00e 100644
--- a/source/slang/slang-ir-inst-defs.h
+++ b/source/slang/slang-ir-inst-defs.h
@@ -685,6 +685,7 @@ INST(GetLegalizedSPIRVGlobalParamAddr, GetLegalizedSPIRVGlobalParamAddr, 1, 0)
INST(GetPerVertexInputArray, GetPerVertexInputArray, 1, 0)
INST(ForceVarIntoStructTemporarily, ForceVarIntoStructTemporarily, 1, 0)
+INST(MetalAtomicCast, MetalAtomicCast, 1, 0)
INST(MakeArrayList, makeArrayList, 0, 0)
INST(MakeTensorView, makeTensorView, 0, 0)
diff --git a/tests/bugs/atomic-coerce.slang b/tests/bugs/atomic-coerce.slang
index 2fe927355..bfb0eeb63 100644
--- a/tests/bugs/atomic-coerce.slang
+++ b/tests/bugs/atomic-coerce.slang
@@ -1,6 +1,6 @@
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj
//TEST(compute,vulkan):COMPARE_COMPUTE_EX:-vk -slang -compute -shaderobj
-//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl
+//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl
//TEST_INPUT:ubuffer(data=[0 0 0 0 ], stride=4):out,name outputBuffer
RWStructuredBuffer<int> outputBuffer;
diff --git a/tests/compute/atomics-groupshared.slang b/tests/compute/atomics-groupshared.slang
index fcfc9c8d7..a01f7bf6a 100644
--- a/tests/compute/atomics-groupshared.slang
+++ b/tests/compute/atomics-groupshared.slang
@@ -4,7 +4,7 @@
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -shaderobj
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -vk -shaderobj
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -shaderobj
-//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl
+//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl
//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
diff --git a/tests/compute/atomics.slang b/tests/compute/atomics.slang
index b00f437f5..ee02c623f 100644
--- a/tests/compute/atomics.slang
+++ b/tests/compute/atomics.slang
@@ -4,7 +4,7 @@
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -shaderobj
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -vk -shaderobj
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -shaderobj
-//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl
+//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl
//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out, name outputBuffer
diff --git a/tests/metal/atomic-intrinsics.slang b/tests/metal/atomic-intrinsics.slang
new file mode 100644
index 000000000..3533ea2aa
--- /dev/null
+++ b/tests/metal/atomic-intrinsics.slang
@@ -0,0 +1,352 @@
+//TEST:SIMPLE(filecheck=MTL):-target metal -entry computeMain -stage compute -DMETAL
+//TEST:SIMPLE(filecheck=LIB):-target metallib -entry computeMain -stage compute -DMETAL
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -emit-spirv-directly -compute -shaderobj -output-using-type
+
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj -output-using-type
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj -output-using-type
+
+
+//TEST_INPUT:ubuffer(data=[0 1 2 3], stride=4):name=uintBuffer
+RWStructuredBuffer<uint> uintBuffer; // unsigned buffer target for the atomics; seeded by the test input above
+//TEST_INPUT:ubuffer(data=[0 1 2 3], stride=4):name=intBuffer
+RWStructuredBuffer<int> intBuffer; // signed buffer target for the atomics; seeded by the test input above
+
+groupshared uint shareMemUI[4]; // unsigned groupshared target, indexed by the thread's group index in computeMain
+groupshared int shareMemI[4]; // signed groupshared target, indexed by the thread's group index in computeMain
+
+//TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<float> outputBuffer; // each thread stores its accumulated float result here
+
+[numthreads(4, 1, 1)] // four threads per group; each thread uses its group index as the element index below
+void computeMain(uint groupIndex : SV_GroupIndex) // runs the Interlocked* calls below on groupshared and RWStructuredBuffer targets, uint and int
+{
+ if (groupIndex == 0) // a single thread zero-fills both groupshared arrays
+ {
+ for (int i = 0; i < 4; ++i)
+ {
+ shareMemUI[i] = 0U;
+ shareMemI[i] = 0;
+ }
+ }
+ AllMemoryBarrierWithGroupSync(); // group-wide barrier placed between the zero-fill and the first atomic
+
+ int idx = groupIndex;
+ float val = 0.0f; // running sum of every value read back; stored to outputBuffer at the end
+
+ // InterlockedAdd (no original_value): applied to each of the four targets, then the element is read back into val
+ //MTL: atomic_uint threadgroup* {{.*}}shareMemUI
+ //LIB: call {{.*}}.atomic.local.add.u.i32
+ InterlockedAdd(shareMemUI[idx], uint(1));
+ val += shareMemUI[idx];
+
+ //MTL: atomic_int threadgroup* {{.*}}shareMemI
+ //LIB: call {{.*}}.atomic.local.add.s.i32
+ InterlockedAdd(shareMemI[idx], 2);
+ val += shareMemI[idx];
+
+ //MTL: atomic_uint device* {{.*}}uintBuffer
+ //LIB: call {{.*}}.atomic.global.add.u.i32
+ InterlockedAdd(uintBuffer[idx], 1);
+ val += uintBuffer[idx];
+
+ //MTL: atomic_int device* {{.*}}intBuffer
+ //LIB: call {{.*}}.atomic.global.add.s.i32
+ InterlockedAdd(intBuffer[idx], 2);
+ val += intBuffer[idx];
+
+ //LIB: call {{.*}}.atomic.local.add.s.i32
+ InterlockedAdd(shareMemI[idx], -1);
+ val += shareMemI[idx];
+
+ //LIB: call {{.*}}.atomic.global.add.s.i32
+ InterlockedAdd(intBuffer[idx], -1);
+ val += intBuffer[idx];
+
+ // InterlockedAdd - original_value: the out-parameter result (origui / origi) is also added to val
+ uint origui = 0;
+ //LIB: call {{.*}}.atomic.local.add.u.i32
+ InterlockedAdd(shareMemUI[idx], 1, origui);
+ val += shareMemUI[idx];
+ val += origui;
+
+ int origi = 0;
+ //LIB: call {{.*}}.atomic.local.add.s.i32
+ InterlockedAdd(shareMemI[idx], 2, origi);
+ val += shareMemI[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.global.add.u.i32
+ InterlockedAdd(uintBuffer[idx], 1, origui);
+ val += uintBuffer[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.global.add.s.i32
+ InterlockedAdd(intBuffer[idx], 2, origi);
+ val += intBuffer[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.local.add.s.i32
+ InterlockedAdd(shareMemI[idx], -1, origi);
+ val += shareMemI[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.global.add.s.i32
+ InterlockedAdd(intBuffer[idx], -1, origi);
+ val += intBuffer[idx];
+ val += origi;
+
+ // InterlockedAnd
+ //LIB: call {{.*}}.atomic.local.and.u.i32
+ InterlockedAnd(shareMemUI[idx], 255);
+ val += shareMemUI[idx];
+
+ //LIB: call {{.*}}.atomic.local.and.s.i32
+ InterlockedAnd(shareMemI[idx], 255);
+ val += shareMemI[idx];
+
+ //LIB: call {{.*}}.atomic.global.and.u.i32
+ InterlockedAnd(uintBuffer[idx], 255);
+ val += uintBuffer[idx];
+
+ //LIB: call {{.*}}.atomic.global.and.s.i32
+ InterlockedAnd(intBuffer[idx], 255);
+ val += intBuffer[idx];
+
+ // InterlockedAnd - original_value
+ //LIB: call {{.*}}.atomic.local.and.u.i32
+ InterlockedAnd(shareMemUI[idx], 255, origui);
+ val += shareMemUI[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.local.and.s.i32
+ InterlockedAnd(shareMemI[idx], 255, origi);
+ val += shareMemI[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.global.and.u.i32
+ InterlockedAnd(uintBuffer[idx], 255, origui);
+ val += uintBuffer[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.global.and.s.i32
+ InterlockedAnd(intBuffer[idx], 255, origi);
+ val += intBuffer[idx];
+ val += origi;
+
+ // InterlockedCompareExchange
+ //LIB: call {{.*}}.atomic.local.cmpxchg.weak.i32
+ InterlockedCompareExchange(shareMemUI[idx], 1, 0, origui);
+ val += shareMemUI[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.local.cmpxchg.weak.i32
+ InterlockedCompareExchange(shareMemI[idx], 1, 0, origi);
+ val += shareMemI[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.global.cmpxchg.weak.i32
+ InterlockedCompareExchange(uintBuffer[idx], 1, 0, origui);
+ val += uintBuffer[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.global.cmpxchg.weak.i32
+ InterlockedCompareExchange(intBuffer[idx], 1, 0, origi);
+ val += intBuffer[idx];
+ val += origi;
+
+ // InterlockedCompareStore is not supported by Metal, so this section is compiled out whenever METAL is defined (the Metal test lines of this file pass -DMETAL)
+#if !defined(METAL)
+ InterlockedCompareStore(shareMemUI[idx], 255, 0);
+ val += shareMemUI[idx];
+
+ InterlockedCompareStore(shareMemI[idx], 255, 0);
+ val += shareMemI[idx];
+
+ InterlockedCompareStore(uintBuffer[idx], 255, 0);
+ val += uintBuffer[idx];
+
+ InterlockedCompareStore(intBuffer[idx], 255, 0);
+ val += intBuffer[idx];
+#endif
+
+ // InterlockedExchange
+ //LIB: call {{.*}}.atomic.local.xchg.i32
+ InterlockedExchange(shareMemUI[idx], 1, origui);
+ val += shareMemUI[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.local.xchg.i32
+ InterlockedExchange(shareMemI[idx], 1, origi);
+ val += shareMemI[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.global.xchg.i32
+ InterlockedExchange(uintBuffer[idx], 1, origui);
+ val += uintBuffer[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.global.xchg.i32
+ InterlockedExchange(intBuffer[idx], 1, origi);
+ val += intBuffer[idx];
+ val += origi;
+
+ // InterlockedMax
+ //LIB: call {{.*}}.atomic.local.max.u.i32
+ InterlockedMax(shareMemUI[idx], 0);
+ val += shareMemUI[idx];
+
+ //LIB: call {{.*}}.atomic.local.max.s.i32
+ InterlockedMax(shareMemI[idx], 0);
+ val += shareMemI[idx];
+
+ //LIB: call {{.*}}.atomic.global.max.u.i32
+ InterlockedMax(uintBuffer[idx], 0);
+ val += uintBuffer[idx];
+
+ //LIB: call {{.*}}.atomic.global.max.s.i32
+ InterlockedMax(intBuffer[idx], 0);
+ val += intBuffer[idx];
+
+ // InterlockedMax - original_value
+ //LIB: call {{.*}}.atomic.local.max.u.i32
+ InterlockedMax(shareMemUI[idx], 0, origui);
+ val += shareMemUI[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.local.max.s.i32
+ InterlockedMax(shareMemI[idx], 0, origi);
+ val += shareMemI[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.global.max.u.i32
+ InterlockedMax(uintBuffer[idx], 0, origui);
+ val += uintBuffer[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.global.max.s.i32
+ InterlockedMax(intBuffer[idx], 0, origi);
+ val += intBuffer[idx];
+ val += origi;
+
+ // InterlockedMin
+ //LIB: call {{.*}}.atomic.local.min.u.i32
+ InterlockedMin(shareMemUI[idx], 0);
+ val += shareMemUI[idx];
+
+ //LIB: call {{.*}}.atomic.local.min.s.i32
+ InterlockedMin(shareMemI[idx], 0);
+ val += shareMemI[idx];
+
+ //LIB: call {{.*}}.atomic.global.min.u.i32
+ InterlockedMin(uintBuffer[idx], 0);
+ val += uintBuffer[idx];
+
+ //LIB: call {{.*}}.atomic.global.min.s.i32
+ InterlockedMin(intBuffer[idx], 0);
+ val += intBuffer[idx];
+
+ // InterlockedMin - original_value
+ //LIB: call {{.*}}.atomic.local.min.u.i32
+ InterlockedMin(shareMemUI[idx], 0, origui);
+ val += shareMemUI[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.local.min.s.i32
+ InterlockedMin(shareMemI[idx], 0, origi);
+ val += shareMemI[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.global.min.u.i32
+ InterlockedMin(uintBuffer[idx], 0, origui);
+ val += uintBuffer[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.global.min.s.i32
+ InterlockedMin(intBuffer[idx], 0, origi);
+ val += intBuffer[idx];
+ val += origi;
+
+ // InterlockedOr
+ //LIB: call {{.*}}.atomic.local.or.u.i32
+ InterlockedOr(shareMemUI[idx], 2);
+ val += shareMemUI[idx];
+
+ //LIB: call {{.*}}.atomic.local.or.s.i32
+ InterlockedOr(shareMemI[idx], 4);
+ val += shareMemI[idx];
+
+ //LIB: call {{.*}}.atomic.global.or.u.i32
+ InterlockedOr(uintBuffer[idx], 6);
+ val += uintBuffer[idx];
+
+ //LIB: call {{.*}}.atomic.global.or.s.i32
+ InterlockedOr(intBuffer[idx], 8);
+ val += intBuffer[idx];
+
+ // InterlockedOr - original_value
+ //LIB: call {{.*}}.atomic.local.or.u.i32
+ InterlockedOr(shareMemUI[idx], 2, origui);
+ val += shareMemUI[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.local.or.s.i32
+ InterlockedOr(shareMemI[idx], 4, origi);
+ val += shareMemI[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.global.or.u.i32
+ InterlockedOr(uintBuffer[idx], 6, origui);
+ val += uintBuffer[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.global.or.s.i32
+ InterlockedOr(intBuffer[idx], 8, origi);
+ val += intBuffer[idx];
+ val += origi;
+
+ // InterlockedXor
+ //LIB: call {{.*}}.atomic.local.xor.u.i32
+ InterlockedXor(shareMemUI[idx], 2);
+ val += shareMemUI[idx];
+
+ //LIB: call {{.*}}.atomic.local.xor.s.i32
+ InterlockedXor(shareMemI[idx], 4);
+ val += shareMemI[idx];
+
+ //LIB: call {{.*}}.atomic.global.xor.u.i32
+ InterlockedXor(uintBuffer[idx], 6);
+ val += uintBuffer[idx];
+
+ //LIB: call {{.*}}.atomic.global.xor.s.i32
+ InterlockedXor(intBuffer[idx], 8);
+ val += intBuffer[idx];
+
+ // InterlockedXor - original_value
+ //LIB: call {{.*}}.atomic.local.xor.u.i32
+ InterlockedXor(shareMemUI[idx], 2, origui);
+ val += shareMemUI[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.local.xor.s.i32
+ InterlockedXor(shareMemI[idx], 4, origi);
+ val += shareMemI[idx];
+ val += origi;
+
+ //LIB: call {{.*}}.atomic.global.xor.u.i32
+ InterlockedXor(uintBuffer[idx], 6, origui);
+ val += uintBuffer[idx];
+ val += origui;
+
+ //LIB: call {{.*}}.atomic.global.xor.s.i32
+ InterlockedXor(intBuffer[idx], 8, origi);
+ val += intBuffer[idx];
+ val += origi;
+
+ outputBuffer[idx] = val; // one float per thread; the expected values are listed after this function
+}
+
+// CHK: 184
+// CHK: 207
+// CHK: 230
+// CHK: 253