From 969dd4cc7246bfe89103efcb00f399606e804e98 Mon Sep 17 00:00:00 2001 From: Jay Kwak <82421531+jkwak-work@users.noreply.github.com> Date: Tue, 25 Jun 2024 22:07:41 -0700 Subject: Support atomic intrinsics for Metal (#4473) * Support atomic intrinsics for Metal This commit adds a support for the atomic intrinsics in Metal. The atomic member functions for buffers is not implemented yet. Metal requires the first argument for the atomic functions to be an atomic data type. This implementation rely on the fact that we can do a C-style type casting from a regular data type to an atomic data type. --- tests/bugs/atomic-coerce.slang | 2 +- tests/compute/atomics-groupshared.slang | 2 +- tests/compute/atomics.slang | 2 +- tests/metal/atomic-intrinsics.slang | 352 ++++++++++++++++++++++++++++++++ 4 files changed, 355 insertions(+), 3 deletions(-) create mode 100644 tests/metal/atomic-intrinsics.slang (limited to 'tests') diff --git a/tests/bugs/atomic-coerce.slang b/tests/bugs/atomic-coerce.slang index 2fe927355..bfb0eeb63 100644 --- a/tests/bugs/atomic-coerce.slang +++ b/tests/bugs/atomic-coerce.slang @@ -1,6 +1,6 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj //TEST(compute,vulkan):COMPARE_COMPUTE_EX:-vk -slang -compute -shaderobj -//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl +//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl //TEST_INPUT:ubuffer(data=[0 0 0 0 ], stride=4):out,name outputBuffer RWStructuredBuffer outputBuffer; diff --git a/tests/compute/atomics-groupshared.slang b/tests/compute/atomics-groupshared.slang index fcfc9c8d7..a01f7bf6a 100644 --- a/tests/compute/atomics-groupshared.slang +++ b/tests/compute/atomics-groupshared.slang @@ -4,7 +4,7 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -vk -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -shaderobj -//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl +//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl //TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer diff --git a/tests/compute/atomics.slang b/tests/compute/atomics.slang index b00f437f5..ee02c623f 100644 --- a/tests/compute/atomics.slang +++ b/tests/compute/atomics.slang @@ -4,7 +4,7 @@ //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -vk -shaderobj //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -cuda -shaderobj -//DISABLE_TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl +//TEST(compute):COMPARE_COMPUTE:-slang -shaderobj -mtl //TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out, name outputBuffer diff --git a/tests/metal/atomic-intrinsics.slang b/tests/metal/atomic-intrinsics.slang new file mode 100644 index 000000000..3533ea2aa --- /dev/null +++ b/tests/metal/atomic-intrinsics.slang @@ -0,0 +1,352 @@ +//TEST:SIMPLE(filecheck=MTL):-target metal -entry computeMain -stage compute -DMETAL +//TEST:SIMPLE(filecheck=LIB):-target metallib -entry computeMain -stage compute -DMETAL +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type +//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -emit-spirv-directly -compute -shaderobj -output-using-type + +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj -output-using-type +//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj -output-using-type + + +//TEST_INPUT:ubuffer(data=[0 1 2 3], stride=4):name=uintBuffer +RWStructuredBuffer uintBuffer; +//TEST_INPUT:ubuffer(data=[0 1 2 3], stride=4):name=intBuffer +RWStructuredBuffer intBuffer; + +groupshared uint shareMemUI[4]; +groupshared int shareMemI[4]; + +//TEST_INPUT: ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer outputBuffer; + +[numthreads(4, 1, 1)] +void computeMain(uint groupIndex : SV_GroupIndex) +{ + if (groupIndex == 0) + { + for (int i = 0; i < 4; ++i) + { + shareMemUI[i] = 0U; + shareMemI[i] = 0; + } + } + AllMemoryBarrierWithGroupSync(); + + int idx = groupIndex; + float val = 0.0f; + + // InterlockedAdd + //MTL: atomic_uint threadgroup* {{.*}}shareMemUI + //LIB: call {{.*}}.atomic.local.add.u.i32 + InterlockedAdd(shareMemUI[idx], uint(1)); + val += shareMemUI[idx]; + + //MTL: atomic_int threadgroup* {{.*}}shareMemI + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], 2); + val += shareMemI[idx]; + + //MTL: atomic_uint device* {{.*}}uintBuffer + //LIB: call {{.*}}.atomic.global.add.u.i32 + InterlockedAdd(uintBuffer[idx], 1); + val += uintBuffer[idx]; + + //MTL: atomic_int device* {{.*}}intBuffer + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], 2); + val += intBuffer[idx]; + + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], -1); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], -1); + val += intBuffer[idx]; + + // InterlockedAdd - original_value + uint origui = 0; + //LIB: call {{.*}}.atomic.local.add.u.i32 + InterlockedAdd(shareMemUI[idx], 1, origui); + val += shareMemUI[idx]; + val += origui; + + int origi = 0; + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], 2, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.add.u.i32 + InterlockedAdd(uintBuffer[idx], 1, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], 2, origi); + val += intBuffer[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.local.add.s.i32 + InterlockedAdd(shareMemI[idx], -1, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.add.s.i32 + InterlockedAdd(intBuffer[idx], -1, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedAnd + //LIB: call {{.*}}.atomic.local.and.u.i32 + InterlockedAnd(shareMemUI[idx], 255); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.and.s.i32 + InterlockedAnd(shareMemI[idx], 255); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.and.u.i32 + InterlockedAnd(uintBuffer[idx], 255); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.and.s.i32 + InterlockedAnd(intBuffer[idx], 255); + val += intBuffer[idx]; + + // InterlockedAnd - original_value + //LIB: call {{.*}}.atomic.local.and.u.i32 + InterlockedAnd(shareMemUI[idx], 255, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.and.s.i32 + InterlockedAnd(shareMemI[idx], 255, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.and.u.i32 + InterlockedAnd(uintBuffer[idx], 255, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.and.s.i32 + InterlockedAnd(intBuffer[idx], 255, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedCompareExchange + //LIB: call {{.*}}.atomic.local.cmpxchg.weak.i32 + InterlockedCompareExchange(shareMemUI[idx], 1, 0, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.cmpxchg.weak.i32 + InterlockedCompareExchange(shareMemI[idx], 1, 0, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.cmpxchg.weak.i32 + InterlockedCompareExchange(uintBuffer[idx], 1, 0, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.cmpxchg.weak.i32 + InterlockedCompareExchange(intBuffer[idx], 1, 0, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedCompareStore is not supported by Metal +#if !defined(METAL) + InterlockedCompareStore(shareMemUI[idx], 255, 0); + val += shareMemUI[idx]; + + InterlockedCompareStore(shareMemI[idx], 255, 0); + val += shareMemI[idx]; + + InterlockedCompareStore(uintBuffer[idx], 255, 0); + val += uintBuffer[idx]; + + InterlockedCompareStore(intBuffer[idx], 255, 0); + val += intBuffer[idx]; +#endif + + // InterlockedExchange + //LIB: call {{.*}}.atomic.local.xchg.i32 + InterlockedExchange(shareMemUI[idx], 1, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.xchg.i32 + InterlockedExchange(shareMemI[idx], 1, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.xchg.i32 + InterlockedExchange(uintBuffer[idx], 1, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.xchg.i32 + InterlockedExchange(intBuffer[idx], 1, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedMax + //LIB: call {{.*}}.atomic.local.max.u.i32 + InterlockedMax(shareMemUI[idx], 0); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.max.s.i32 + InterlockedMax(shareMemI[idx], 0); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.max.u.i32 + InterlockedMax(uintBuffer[idx], 0); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.max.s.i32 + InterlockedMax(intBuffer[idx], 0); + val += intBuffer[idx]; + + // InterlockedMax - original_value + //LIB: call {{.*}}.atomic.local.max.u.i32 + InterlockedMax(shareMemUI[idx], 0, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.max.s.i32 + InterlockedMax(shareMemI[idx], 0, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.max.u.i32 + InterlockedMax(uintBuffer[idx], 0, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.max.s.i32 + InterlockedMax(intBuffer[idx], 0, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedMin + //LIB: call {{.*}}.atomic.local.min.u.i32 + InterlockedMin(shareMemUI[idx], 0); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.min.s.i32 + InterlockedMin(shareMemI[idx], 0); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.min.u.i32 + InterlockedMin(uintBuffer[idx], 0); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.min.s.i32 + InterlockedMin(intBuffer[idx], 0); + val += intBuffer[idx]; + + // InterlockedMin - original_value + //LIB: call {{.*}}.atomic.local.min.u.i32 + InterlockedMin(shareMemUI[idx], 0, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.min.s.i32 + InterlockedMin(shareMemI[idx], 0, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.min.u.i32 + InterlockedMin(uintBuffer[idx], 0, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.min.s.i32 + InterlockedMin(intBuffer[idx], 0, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedOr + //LIB: call {{.*}}.atomic.local.or.u.i32 + InterlockedOr(shareMemUI[idx], 2); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.or.s.i32 + InterlockedOr(shareMemI[idx], 4); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.or.u.i32 + InterlockedOr(uintBuffer[idx], 6); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.or.s.i32 + InterlockedOr(intBuffer[idx], 8); + val += intBuffer[idx]; + + // InterlockedOr - original_value + //LIB: call {{.*}}.atomic.local.or.u.i32 + InterlockedOr(shareMemUI[idx], 2, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.or.s.i32 + InterlockedOr(shareMemI[idx], 4, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.or.u.i32 + InterlockedOr(uintBuffer[idx], 6, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.or.s.i32 + InterlockedOr(intBuffer[idx], 8, origi); + val += intBuffer[idx]; + val += origi; + + // InterlockedXor + //LIB: call {{.*}}.atomic.local.xor.u.i32 + InterlockedXor(shareMemUI[idx], 2); + val += shareMemUI[idx]; + + //LIB: call {{.*}}.atomic.local.xor.s.i32 + InterlockedXor(shareMemI[idx], 4); + val += shareMemI[idx]; + + //LIB: call {{.*}}.atomic.global.xor.u.i32 + InterlockedXor(uintBuffer[idx], 6); + val += uintBuffer[idx]; + + //LIB: call {{.*}}.atomic.global.xor.s.i32 + InterlockedXor(intBuffer[idx], 8); + val += intBuffer[idx]; + + // InterlockedXor - original_value + //LIB: call {{.*}}.atomic.local.xor.u.i32 + InterlockedXor(shareMemUI[idx], 2, origui); + val += shareMemUI[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.local.xor.s.i32 + InterlockedXor(shareMemI[idx], 4, origi); + val += shareMemI[idx]; + val += origi; + + //LIB: call {{.*}}.atomic.global.xor.u.i32 + InterlockedXor(uintBuffer[idx], 6, origui); + val += uintBuffer[idx]; + val += origui; + + //LIB: call {{.*}}.atomic.global.xor.s.i32 + InterlockedXor(intBuffer[idx], 8, origi); + val += intBuffer[idx]; + val += origi; + + outputBuffer[idx] = val; +} + +// CHK: 184 +// CHK: 207 +// CHK: 230 +// CHK: 253 -- cgit v1.2.3