Cleanup atomic intrinsics. (#5324)

* Cleanup atomic intrinsics. * Fix. * Fix glsl. * Remove hacky intrinsic expansion logic for glsl image atomics. * Fix all tests. * Fix. * Add `InterlockedAddF16Emulated`. * Fix glsl intrinsic. * Fix.
author: Yong He <yonghe@outlook.com> 2024-10-17 20:14:22 -0700
committer: GitHub <noreply@github.com> 2024-10-17 20:14:22 -0700
commit: a618b8c5e249b0f20e6c0c95f9da1b5cbfdbf08b (patch)
tree: d583c373d574a265fefe7f288a96c4b382e259b8
parent: 11e1ecafa09396a3559fe245d729b40ce4f25d52 (diff)
33 files changed, 1040 insertions, 2564 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 96ef22dd1..a6c8fd17b 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1261,7 +1261,14 @@ struct ByteAddressBuffer
         memcpy(&data, ((const char*)this->data) + index, sizeof(T));
         return data;
     }
-    
+    template<typename T>
+    SLANG_CUDA_CALL StructuredBuffer<T> asStructuredBuffer() const // Returns a typed StructuredBuffer<T> view over this buffer's storage.
+    {
+        StructuredBuffer<T> rs;
+        rs.data = (T*)data; // Reinterprets the `const uint32_t*` storage as `T*`; the pointer is aliased, nothing is copied.
+        rs.count = sizeInBytes / sizeof(T); // Whole elements only: integer division discards any trailing partial element.
+        return rs;
+    }
     const uint32_t* data;
     size_t sizeInBytes;  //< Must be multiple of 4
 };
@@ -1348,7 +1355,14 @@ struct RWByteAddressBuffer
         SLANG_BOUND_CHECK_BYTE_ADDRESS(index, sizeof(T), sizeInBytes);
         return (T*)(((char*)data) + index);
     }
-    
+    template<typename T>
+    SLANG_CUDA_CALL RWStructuredBuffer<T> asStructuredBuffer() const // Returns a typed RWStructuredBuffer<T> view over this buffer's storage.
+    {
+        RWStructuredBuffer<T> rs;
+        rs.data = (T*)data; // Reinterprets the `uint32_t*` storage as `T*`; the pointer is aliased, nothing is copied.
+        rs.count = sizeInBytes / sizeof(T); // Whole elements only: integer division discards any trailing partial element.
+        return rs;
+    }
     uint32_t* data;
     size_t sizeInBytes; //< Must be multiple of 4 
 };
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 084654d0f..67ec91cf6 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -299,6 +299,18 @@ interface __BuiltinSignedArithmeticType : __BuiltinArithmeticType {}
 interface __BuiltinIntegerType : __BuiltinArithmeticType, IInteger
 {}
 
+/// Represent an `int` or `uint` type.
+[sealed]
+[builtin]
+interface __BuiltinInt32Type : __BuiltinIntegerType
+{}
+
+/// Represent an `int64_t` or `uint64_t` type.
+[sealed]
+[builtin]
+interface __BuiltinInt64Type : __BuiltinIntegerType
+{}
+
 /// Represent builtin types that can represent a real number.
 [sealed]
 [builtin]
@@ -603,6 +615,14 @@ ${{{{
     ,  __BuiltinArithmeticType
     ,  __BuiltinIntegerType
 ${{{{
+    if (kBaseTypes[tt].tag == BaseType::Int || kBaseTypes[tt].tag == BaseType::UInt)
+}}}}
+    ,  __BuiltinInt32Type
+${{{{
+    if (kBaseTypes[tt].tag == BaseType::Int64 || kBaseTypes[tt].tag == BaseType::UInt64)
+}}}}
+    ,  __BuiltinInt64Type
+${{{{
         ; // fall through
     case BaseType::Bool:
 }}}}
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 191fa3195..1c01c2f6b 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -3923,475 +3923,36 @@ ${{{{
 }
 }}}}
 
-// AtomicAdd
 
-// Make the GLSL atomicAdd available.
-// We have separate int/float implementations, as the float version requires some specific extensions
-// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_atomic_float.txt
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_float)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_float1)]
-float __atomicAdd(__ref float value, float amount)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpExtension "SPV_EXT_shader_atomic_float_add";
-            OpCapability AtomicFloat32AddEXT;
-            result:$$float = OpAtomicFAddEXT &value Device None $amount
-        };
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_NV_shader_atomic_fp16_vector)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_halfvec)]
-half2 __atomicAdd(__ref half2 value, half2 amount)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpExtension "SPV_EXT_shader_atomic_float_add";
-            OpCapability AtomicFloat32AddEXT;
-            result:$$half2 = OpAtomicFAddEXT &value Device None $amount
-        };
-    }
-}
-
-// Helper for hlsl, using NVAPI
-[__requiresNVAPI]
-[require(hlsl, atomic_hlsl_nvapi)]
-uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "NvInterlockedAddUint64($0, $1, $2)";
-    }
-}
-
-// atomic add for hlsl using SM6.6
-[require(hlsl, atomic_hlsl_sm_6_6)]
-void __atomicAdd(RWByteAddressBuffer buf, uint offset, int64_t value, out int64_t originalValue)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "$0.InterlockedAdd64($1, $2, $3)";
-    }
-}
-
-[require(hlsl, atomic_hlsl_sm_6_6)]
-void __atomicAdd(RWByteAddressBuffer buf, uint offset, uint64_t value, out uint64_t originalValue)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "$0.InterlockedAdd64($1, $2, $3)";
-    }
-}
-
-// Int versions require glsl 4.30
-// https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml
-
-__glsl_version(430)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl)]
-int __atomicAdd(__ref int value, int amount)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            result:$$int = OpAtomicIAdd &value Device None $amount;
-        };
-    }
-}
-
-__glsl_version(430)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl)]
-uint __atomicAdd(__ref uint value, uint amount)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            result:$$uint = OpAtomicIAdd &value Device None $amount;
-        };
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-int64_t __atomicAdd(__ref int64_t value, int64_t amount)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$int64_t = OpAtomicIAdd &value Device None $amount
-        };
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-uint64_t __atomicAdd(__ref uint64_t value, uint64_t amount)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$uint64_t = OpAtomicIAdd &value Device None $amount
-        };
-    }
-}
-
-// Cas - Compare and swap
-
-// Helper for HLSL, using NVAPI
-
-[__requiresNVAPI]
-[require(hlsl, atomic_hlsl_nvapi)]
-uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)";
-    }
-}
-
-// CAS using SM6.6
-[require(hlsl, atomic_hlsl_sm_6_6)]
-void __cas(RWByteAddressBuffer buf, uint offset, in int64_t compare_value, in int64_t value, out int64_t original_value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "$0.InterlockedCompareExchange64($1, $2, $3, $4)";
-    }
-}
-
-[require(hlsl, atomic_hlsl_sm_6_6)]
-void __cas(RWByteAddressBuffer buf, uint offset, in uint64_t compare_value, in uint64_t value, out uint64_t original_value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "$0.InterlockedCompareExchange64($1, $2, $3, $4)";
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-int64_t __cas(__ref int64_t ioValue, int64_t compareValue, int64_t newValue)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$int64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue
-        };
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$uint64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue
-        };
-    }
-}
-
-// Max
-
-[__requiresNVAPI]
-[require(hlsl, atomic_hlsl_nvapi)]
-uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "NvInterlockedMaxUint64($0, $1, $2)";
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicMax($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$uint64_t = OpAtomicUMax &ioValue Device None $value
-        };
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_float2)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_float2)]
-float __atomicMax(__ref float ioValue, float value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicMax($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpExtension "SPV_EXT_shader_atomic_float_min_max";
-            OpCapability AtomicFloat32MinMaxEXT;
-            result:$$float = OpAtomicFMaxEXT &ioValue Device None $value
-        };
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_float2)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_float2)]
-half __atomicMax(__ref half ioValue, half value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicMax($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpExtension "SPV_EXT_shader_atomic_float_min_max";
-            OpCapability AtomicFloat16MinMaxEXT;
-            result:$$half = OpAtomicFMaxEXT &ioValue Device None $value
-        };
-    }
-}
-
-// Min
-
-[__requiresNVAPI]
-[require(hlsl, atomic_hlsl_nvapi)]
-uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "NvInterlockedMinUint64($0, $1, $2)";
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicMin($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$uint64_t = OpAtomicUMin &ioValue Device None $value
-        };
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_float2)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_float2)]
-float __atomicMin(__ref float ioValue, float value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicMin($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpExtension "SPV_EXT_shader_atomic_float_min_max";
-            OpCapability AtomicFloat32MinMaxEXT;
-            result:$$float = OpAtomicFMinEXT &ioValue Device None $value
-        };
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_float2)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_float2)]
-half __atomicMin(__ref half ioValue, half value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicMin($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpExtension "SPV_EXT_shader_atomic_float_min_max";
-            OpCapability AtomicFloat16MinMaxEXT;
-            result:$$half = OpAtomicFMinEXT &ioValue Device None $value
-        };
-    }
-}
-
-// And
-
-[__requiresNVAPI]
-[require(hlsl, atomic_hlsl_nvapi)]
-uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "NvInterlockedAndUint64($0, $1, $2)";
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicAnd($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$uint64_t = OpAtomicAnd &ioValue Device None $value
-        };
-    }
-}
-
-// Or
-
-[__requiresNVAPI]
-[require(hlsl, atomic_hlsl_nvapi)]
-uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "NvInterlockedOrUint64($0, $1, $2)";
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicOr($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$uint64_t = OpAtomicOr &ioValue Device None $value
-        };
-    }
-}
-
-// Xor
-
-[__requiresNVAPI]
-[require(hlsl, atomic_hlsl_nvapi)]
-uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, uint2 value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "NvInterlockedXorUint64($0, $1, $2)";
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicXor($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$uint64_t = OpAtomicXor &ioValue Device None $value
-        };
-    }
-}
-
-// Exchange
-
-[__requiresNVAPI]
-[require(hlsl, atomic_hlsl_nvapi)]
-uint2 __atomicExchange(RWByteAddressBuffer buf, uint offset, uint2 value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "NvInterlockedExchangeUint64($0, $1, $2)";
-    }
-}
-
-__glsl_version(430)
-__glsl_extension(GL_EXT_shader_atomic_int64)
-[ForceInline]
-[require(glsl_spirv, atomic_glsl_int64)]
-uint64_t __atomicExchange(__ref uint64_t ioValue, uint64_t value)
-{
-    __target_switch
-    {
-    case glsl: __intrinsic_asm "atomicExchange($0, $1)";
-    case spirv:
-        return spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$uint64_t = OpAtomicExchange &ioValue Device None $value
-        };
-    }
-}
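+// Atomic intrinsic instructions. Each declaration maps to the IR opcode named in its `__intrinsic_op`; memory order arguments default to `MemoryOrder.Relaxed`.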
+
+__intrinsic_op($(kIROp_AtomicExchange))
+T __atomic_exchange<T>(__ref T val, T newValue, MemoryOrder order = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicCompareExchange))
+T __atomic_compare_exchange<T>(
+    __ref T val,
+    T compareValue,
+    T newValue,
+    MemoryOrder successOrder = MemoryOrder.Relaxed,
+    MemoryOrder failOrder = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicAdd))
+T __atomic_add<T>(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicSub))
+T __atomic_sub<T>(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicMax))
+T __atomic_max<T>(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicMin))
+T __atomic_min<T>(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicAnd))
+T __atomic_and<T>(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicOr))
+T __atomic_or<T>(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicXor))
+T __atomic_xor<T>(__ref T val, T value, MemoryOrder order = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicInc))
+T __atomic_increment<T>(__ref T val, MemoryOrder order = MemoryOrder.Relaxed);
+__intrinsic_op($(kIROp_AtomicDec))
+T __atomic_decrement<T>(__ref T val, MemoryOrder order = MemoryOrder.Relaxed);
 
 // Conversion between uint64_t and uint2
 
@@ -4802,6 +4363,20 @@ struct $(item.name)
     }
 
 ${{{{
+    struct BufferAtomicOps
+    {
+        const char* name; // Capitalized operation name spliced into generated method names, e.g. "Max" -> `InterlockedMax64`.
+        const char* internalName; // Lower-case operation name; used, at least, in the generated doc comments.
+    };
+    const BufferAtomicOps bufferAtomicOps[] = { // One entry per 64-bit buffer atomic generated by the loop over this table.
+        {"Max", "max"},
+        {"Min", "min"},
+        {"Add", "add"},
+        {"And", "and"},
+        {"Or", "or"},
+        {"Xor", "xor"},
+        {"Exchange", "exchange"}
+    };
     if (item.op == kIROp_HLSLRWByteAddressBufferType)
     {
 }}}}
@@ -4822,6 +4397,13 @@ ${{{{
 
     // F32 Add
 
+    /// Perform a 32-bit floating point atomic add operation at `byteAddress`.
+    /// @param byteAddress The address at which to perform the atomic add operation.
+    /// @param valueToAdd The value to add to the value at `byteAddress`.
+    /// @param originalValue The original value at `byteAddress` before the add operation.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicFAddEXT`. For HLSL, this function translates to an NVAPI call
+    /// due to lack of native HLSL intrinsic for floating point atomic add. For CUDA, this function
+    /// maps to `atomicAdd`.
     __cuda_sm_version(2.0)
     [__requiresNVAPI]
     [ForceInline]
@@ -4832,35 +4414,45 @@ ${{{{
         {
         case hlsl: __intrinsic_asm "($3 = NvInterlockedAddFp32($0, $1, $2))";
         case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))";
-        case metal:
-            {
-                let buf = __getEquivalentStructuredBuffer<float>(this);
-                __metalInterlocked_add(__getMetalAtomicRef(buf[byteAddress / 4]), valueToAdd, originalValue);
-                return;
-            }
-        case glsl:
-        case spirv:
+        default:
             {
                 let buf = __getEquivalentStructuredBuffer<float>(this);
-                originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
+                originalValue = __atomic_add(buf[byteAddress / 4], valueToAdd);
                 return;
             }
         }
     }
 
     // FP16x2
+
+    /// @internal
+    /// Maps to the `NvInterlockedAddFp16x2` NVAPI function.
+    ///
     [__requiresNVAPI]
     [ForceInline]
-    [require(hlsl, atomic_hlsl_nvapi)]
+    [require(cuda_hlsl_spirv)]
     uint _NvInterlockedAddFp16x2(uint byteAddress, uint fp16x2Value)
     {
         __target_switch
         {
         case hlsl:
             __intrinsic_asm "NvInterlockedAddFp16x2($0, $1, $2)";
+        default:
+            let buf = __getEquivalentStructuredBuffer<half2>(this);
+            return bit_cast<uint>(__atomic_add(buf[byteAddress / 4], bit_cast<half2>(fp16x2Value)));
         }
     }
 
+
+    /// Perform a 16-bit floating point atomic add operation at `byteAddress`.
+    /// @param byteAddress The address at which to perform the atomic add operation.
+    /// @param value The value to add to the value at `byteAddress`.
+    /// @param originalValue The original value at `byteAddress` before the add operation.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicFAddEXT` and requires the `SPV_EXT_shader_atomic_float16_add` extension.
+    ///
+    /// For HLSL, this function translates to an NVAPI call
+    /// due to lack of native HLSL intrinsic for floating point atomic add. For CUDA, this function
+    /// maps to `atomicAdd`.
     [__requiresNVAPI]
     [ForceInline]
     void InterlockedAddF16(uint byteAddress, half value, out half originalValue)
@@ -4880,17 +4472,55 @@ ${{{{
                 originalValue = asfloat16((uint16_t)(_NvInterlockedAddFp16x2(byteAddress, packedInput) >> 16));
             }
             return;
-        case glsl:
-        case spirv:
+        default:
+            {
+                let buf = __getEquivalentStructuredBuffer<half>(this);
+                originalValue = __atomic_add(buf[byteAddress/2], value);
+                return;
+            }
+        }
+    }
+
+    /// Perform a 16-bit floating point atomic add operation at `byteAddress` through emulation using `half2` atomics.
+    /// @param byteAddress The address at which to perform the atomic add operation.
+    /// @param value The value to add to the value at `byteAddress`.
+    /// @param originalValue The original value at `byteAddress` before the add operation.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicFAddEXT` on a `half2` vector with the correct part set to `value`
+    /// and the remaining part set to 0. This requires the `AtomicFloat16VectorNV` capability introduced by the `SPV_NV_shader_atomic_fp16_vector`
+    /// extension.
+    ///
+    /// For HLSL, this function translates to an equivalent NVAPI call
+    /// due to lack of native HLSL intrinsic for floating point atomic add. For CUDA, this function
+    /// maps to `atomicAdd`.
+    [__requiresNVAPI]
+    [ForceInline]
+    void InterlockedAddF16Emulated(uint byteAddress, half value, out half originalValue)
+    {
+        __target_switch
+        {
+        case hlsl:
+            if ((byteAddress & 2) == 0)
+            {
+                uint packedInput = asuint16(value);
+                originalValue = asfloat16((uint16_t)_NvInterlockedAddFp16x2(byteAddress, packedInput));
+            }
+            else
+            {
+                byteAddress = byteAddress & ~3;
+                uint packedInput = ((uint)asuint16(value)) << 16;
+                originalValue = asfloat16((uint16_t)(_NvInterlockedAddFp16x2(byteAddress, packedInput) >> 16));
+            }
+            return;
+        default:
             {
                 let buf = __getEquivalentStructuredBuffer<half2>(this);
                 if ((byteAddress & 2) == 0)
                 {
-                    originalValue = __atomicAdd(buf[byteAddress/4], half2(value, half(0.0))).x;
+                    originalValue = __atomic_add(buf[byteAddress/4], half2(value, half(0.0))).x;
                 }
                 else
                 {
-                    originalValue = __atomicAdd(buf[byteAddress/4], half2(half(0.0), value)).y;
+                    originalValue = __atomic_add(buf[byteAddress/4], half2(half(0.0), value)).y;
                 }
                 return;
             }
@@ -4908,484 +4538,207 @@ ${{{{
         __target_switch
         {
         case hlsl: __intrinsic_asm "(NvInterlockedAddFp32($0, $1, $2))";
-        case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<float>($1), $2)";
-        case metal:
-            {
-                let buf = __getEquivalentStructuredBuffer<float>(this);
-                __metalInterlocked_add(__getMetalAtomicRef(buf[byteAddress / 4]), valueToAdd);
-                return;
-            }
-        case glsl:
-        case spirv:
+        default:
             {
                 let buf = __getEquivalentStructuredBuffer<float>(this);
-                __atomicAdd(buf[byteAddress / 4], valueToAdd);
+                __atomic_add(buf[byteAddress / 4], valueToAdd);
                 return;
             }
         }
     }
 
     // Int64 Add
+
+    /// Perform a 64-bit integer atomic add operation at `byteAddress`.
+    /// @param byteAddress The address at which to perform the atomic add operation.
+    /// @param valueToAdd The value to add to the value at `byteAddress`.
+    /// @param originalValue The original value at `byteAddress` before the add operation.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicIAdd`. For HLSL, this function
+    /// translates to `InterlockedAdd64` and requires shader model 6.6.
+    /// For CUDA, this function maps to `atomicAdd`.
     [ForceInline]
-    __cuda_sm_version(6.0)
-    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda6_int64)]
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
     {
-        __target_switch
-        {
-        case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint64_t>($1), $2))";
-        case hlsl:
-            originalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd)));
-        case glsl:
-        case spirv:
-            {
-                let buf = __getEquivalentStructuredBuffer<int64_t>(this);
-                originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
-            }
-        }
+        InterlockedAdd64(byteAddress, valueToAdd, originalValue);
     }
 
     // Without returning original value
-    __cuda_sm_version(6.0)
-    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda6_int64)]
+    [ForceInline]
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
     void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
     {
-        __target_switch
-        {
-        case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<uint64_t>($1), $2)";
-        case hlsl:
-            __atomicAdd(this, byteAddress, __asuint2(valueToAdd));
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<int64_t>(this);
-            __atomicAdd(buf[byteAddress / 8], valueToAdd);
-        }
+        InterlockedAdd64(byteAddress, valueToAdd);
     }
 
     // Cas uint64_t
 
-    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda9_int64)]
+    /// Perform a 64-bit integer atomic compare-and-exchange operation at `byteAddress`.
+    /// @param byteAddress The address at which to perform the atomic compare-and-exchange operation.
+    /// @param compareValue The value to compare to the value at `byteAddress`.
+    /// @param value The value to store at `byteAddress` if the comparison is successful.
+    /// @param outOriginalValue The original value at `byteAddress` before the compare-and-exchange operation.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function
+    /// translates to `InterlockedCompareExchange64` and requires shader model 6.6.
+    /// For CUDA, this function maps to `atomicCAS`.
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
     void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
     {
         __target_switch
         {
         case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt<uint64_t>($1), $2, $3))";
         case hlsl:
-            outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value)));
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
-        }
-    }
-
-    // Max
-
-    __cuda_sm_version(5.0)
-    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)]
-    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value)
-    {
-        __target_switch
-        {
-        case cuda: __intrinsic_asm "atomicMax($0._getPtrAt<uint64_t>($1), $2)";
-        case hlsl:
-            return __asuint64(__atomicMax(this, byteAddress, __asuint2(value)));
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            return __atomicMax(buf[byteAddress / 8], value);
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedMax64(uint byteAddress, int64_t value)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedMax64";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedMax64(uint byteAddress, int64_t value, out int64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedMax64";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedMax64(uint byteAddress, uint64_t value)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedMax64";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedMax64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedMax64";
-        }
-    }
-
-    // Min
-
-    __cuda_sm_version(5.0)
-    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)]
-    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value)
-    {
-        __target_switch
-        {
-        case cuda: __intrinsic_asm "atomicMin($0._getPtrAt<uint64_t>($1), $2)";
-        case hlsl:
-            return __asuint64(__atomicMin(this, byteAddress, __asuint2(value)));
-        case glsl:
-        case spirv:
+            __intrinsic_asm ".InterlockedCompareExchange64";
+        default:
             let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            return __atomicMin(buf[byteAddress / 8], value);
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedMin64(uint byteAddress, int64_t value)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedMin64";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedMin64(uint byteAddress, int64_t value, out int64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedMin64";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedMin64(uint byteAddress, uint64_t value)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedMin64";
+            outOriginalValue = __atomic_compare_exchange(buf[byteAddress / 8], compareValue, value);
         }
     }
 
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedMin64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedMin64";
-        }
-    }
-
-    // And
-
-    __cuda_sm_version(5.0)
-    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)]
-    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value)
-    {
-        __target_switch
-        {
-        case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt<uint64_t>($1), $2)";
-        case hlsl:
-            return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value)));
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            return __atomicAnd(buf[byteAddress / 8], value);
-        }
-    }
+    // SM6.6 64-bit atomics.
 
+    // InterlockedMax64, InterlockedMin64, InterlockedAdd64, InterlockedAnd64, InterlockedOr64, InterlockedXor64, InterlockedExchange64
+${{{{
+    for (auto op : bufferAtomicOps) {
+}}}}
     [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedAnd64(uint byteAddress, uint64_t value)
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
+    uint64_t Interlocked$(op.name)U64(uint byteAddress, uint64_t value)
     {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedAnd64";
-        }
+        uint64_t originalValue;
+        Interlocked$(op.name)64(byteAddress, value, originalValue);
+        return originalValue;
     }
 
     [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedAnd64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue)
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
+    void Interlocked$(op.name)64(uint byteAddress, int64_t value)
     {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedAnd64";
-        }
-    }
-
-    // Or
-
-    __cuda_sm_version(5.0)
-    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)]
-    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value)
-    {
-        __target_switch
-        {
-        case cuda: __intrinsic_asm "atomicOr($0._getPtrAt<uint64_t>($1), $2)";
-        case hlsl:
-            return __asuint64(__atomicOr(this, byteAddress, __asuint2(value)));
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            return __atomicOr(buf[byteAddress / 8], value);
-        }
+        int64_t oldValue;
+        Interlocked$(op.name)64(byteAddress, value, oldValue);
     }
 
+    /// Perform a 64-bit integer atomic $(op.internalName) operation at `byteAddress`.
+    /// @param byteAddress The address at which to perform the atomic $(op.internalName) operation.
+    /// @param value The operand for the $(op.internalName) operation.
+    /// @param originalValue The original value at `byteAddress` before the $(op.internalName) operation.
     [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedOr64(uint byteAddress, uint64_t value)
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
+    void Interlocked$(op.name)64<T:__BuiltinInt64Type>(uint byteAddress, T value, out T outOriginalValue)
     {
         __target_switch
         {
-        case hlsl: __intrinsic_asm ".InterlockedOr64";
+        case hlsl: __intrinsic_asm ".Interlocked$(op.name)64";
+        default:
+            let buf = __getEquivalentStructuredBuffer<T>(this);
+            outOriginalValue = __atomic_$(op.internalName)(buf[byteAddress / 8], value);
+            return;
         }
     }
+${{{{
+} // for (each bufferOps)
+}}}}
 
+    /// Perform a 64-bit integer atomic compare-and-exchange operation at `byteAddress`.
+    /// @param byteAddress The address at which to perform the atomic compare-and-exchange operation.
+    /// @param compareValue The value to compare to the value at `byteAddress`.
+    /// @param value The value to store at `byteAddress` if the comparison is successful.
+    /// @param outOriginalValue The original value at `byteAddress` before the compare-and-exchange operation.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function
+    /// translates to `InterlockedCompareExchange64` and requires shader model 6.6.
+    /// For CUDA, this function maps to `atomicCAS`.
     [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedOr64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue)
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
+    void InterlockedCompareExchange64<T:__BuiltinInt64Type>(uint byteAddress, T compareValue, T value, out T outOriginalValue)
     {
         __target_switch
         {
-        case hlsl: __intrinsic_asm ".InterlockedOr64";
-        }
-    }
-
-    // Xor
-
-    __cuda_sm_version(5.0)
-    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda5_int64)]
-    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value)
-    {
-        __target_switch
-        {
-        case cuda: __intrinsic_asm "atomicXor($0._getPtrAt<uint64_t>($1), $2)";
         case hlsl:
-            return __asuint64(__atomicXor(this, byteAddress, __asuint2(value)));
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            return __atomicXor(buf[byteAddress / 8], value);
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedXor64(uint byteAddress, uint64_t value)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedXor64";
+            __intrinsic_asm ".InterlockedCompareExchange64";
+        default:
+            let buf = __getEquivalentStructuredBuffer<T>(this);
+            outOriginalValue = __atomic_compare_exchange(buf[byteAddress / 8], compareValue, value);
+            return;
         }
     }
 
     [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedXor64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedXor64";
-        }
-    }
-
-    // Exchange
-
-    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_nvapi_cuda9_int64)]
-    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value)
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
+    void InterlockedCompareExchangeFloatBitwise(uint byteAddress, float compareValue, float value, out float outOriginalValue)
     {
         __target_switch
         {
-        case cuda: __intrinsic_asm "atomicExch($0._getPtrAt<uint64_t>($1), $2)";
-        case hlsl:
-            return __asuint64(__atomicExchange(this, byteAddress, __asuint2(value)));
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            return __atomicExchange(buf[byteAddress / 8], value);
+        case hlsl: __intrinsic_asm ".InterlockedCompareExchangeFloatBitwise";
+        default:
+            let buf = __getEquivalentStructuredBuffer<float>(this);
+            outOriginalValue = __atomic_compare_exchange(buf[byteAddress / 4], compareValue, value);
+            return;
         }
     }
 
+    /// Perform a floating-point atomic bitwise exchange operation at `byteAddress`.
+    /// @param byteAddress The address at which to perform the atomic exchange operation.
+    /// @param value The value to store at `byteAddress`.
+    /// @param [out] outOriginalValue The original value at `byteAddress` before the exchange operation.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicExchange`. For HLSL, this function
+    /// translates to `InterlockedExchangeFloat` and requires shader model 6.6.
+    /// For CUDA, this function maps to `atomicExch`.
     [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
     void InterlockedExchangeFloat(uint byteAddress, float value, out float outOriginalValue)
     {
         __target_switch
         {
         case hlsl: __intrinsic_asm ".InterlockedExchangeFloat";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedExchange64(uint byteAddress, int64_t value, out int64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedExchange64";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedExchange64(uint byteAddress, uint64_t value, out uint64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedExchange64";
-        }
-    }
-
-    // SM6.6 6 64bit atomics.
-    [ForceInline]
-    [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
-    void InterlockedAdd64(uint byteAddress, int64_t valueToAdd)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedAdd64";
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<int64_t>(this);
-            __atomicAdd(buf[byteAddress / 8], valueToAdd);
-        }
-    }
-
-    [ForceInline]
-    [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
-    void InterlockedAdd64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedAdd64";
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<int64_t>(this);
-            outOriginalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
-            return;
-        }
-    }
-
-    [ForceInline]
-    [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
-    void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedAdd64";
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            __atomicAdd(buf[byteAddress / 8], valueToAdd);
-        }
-    }
-
-    [ForceInline]
-    [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
-    void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd, out uint64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedAdd64";
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            outOriginalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
+        default:
+            let buf = __getEquivalentStructuredBuffer<float>(this);
+            outOriginalValue = __atomic_exchange(buf[byteAddress / 4], value);
             return;
         }
     }
 
+    /// Perform a 64-bit integer atomic compare-and-store operation at `byteAddress`.
+    /// @param byteAddress The address at which to perform the atomic compare-and-store operation.
+    /// @param compareValue The value to compare to the value at `byteAddress`.
+    /// @param value The value to store at `byteAddress` if the value at that address is equal to `compareValue`.
+    /// @remarks The original value is not returned; use `InterlockedCompareExchange64` when it is needed.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function
+    /// translates to `InterlockedCompareStore64` and requires shader model 6.6.
+    /// For CUDA, this function maps to `atomicCAS`.
     [ForceInline]
-    [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
-    void InterlockedCompareExchange64(uint byteAddress, int64_t compareValue, int64_t value, out int64_t outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl:
-            __cas(this, byteAddress, compareValue, value, outOriginalValue);
-            return;
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<int64_t>(this);
-            outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
-            return;
-        }
-    }
-
     [ForceInline]
-    [require(glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
-    void InterlockedCompareExchange64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
+    void InterlockedCompareStore64<T:__BuiltinInt64Type>(uint byteAddress, T compareValue, T value)
     {
         __target_switch
         {
-        case hlsl:
-            __cas(this, byteAddress, compareValue, value, outOriginalValue);
-            return;
-        case glsl:
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
-            outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
+        case hlsl: __intrinsic_asm ".InterlockedCompareStore64";
+        default:
+            let buf = __getEquivalentStructuredBuffer<T>(this);
+            __atomic_compare_exchange(buf[byteAddress / 8], compareValue, value); // T is 64 bits wide: element index is byteAddress / 8.
             return;
         }
     }
-
+   
+    /// Perform a floating-point atomic bitwise compare-and-store operation at `byteAddress`.
+    /// @param byteAddress The address at which to perform the atomic compare-and-store operation.
+    /// @param compareValue The value to perform bitwise comparison to the value at `byteAddress`.
+    /// @param value The value to store at `byteAddress` if the comparison is successful.
+    /// @remarks The original value at `byteAddress` is not returned; use `InterlockedCompareExchangeFloatBitwise` to retrieve it.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function
+    /// translates to `InterlockedCompareStoreFloatBitwise` and requires shader model 6.6.
+    /// For CUDA, this function maps to `atomicCAS`.
     [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
+    [require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda9_int64)]
     void InterlockedCompareStoreFloatBitwise(uint byteAddress, float compareValue, float value)
     {
         __target_switch
         {
         case hlsl: __intrinsic_asm ".InterlockedCompareStoreFloatBitwise";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedCompareExchangeFloatBitwise(uint byteAddress, float compareValue, float value, out float outOriginalValue)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedCompareExchangeFloatBitwise";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedCompareStore64(uint byteAddress, int64_t compareValue, int64_t value)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedCompareStore64";
-        }
-    }
-
-    [ForceInline]
-    [require(hlsl, atomic_hlsl_sm_6_6)]
-    void InterlockedCompareStore64(uint byteAddress, uint64_t compareValue, uint64_t value)
-    {
-        __target_switch
-        {
-        case hlsl: __intrinsic_asm ".InterlockedCompareStore64";
+        default:
+            let buf = __getEquivalentStructuredBuffer<float>(this);
+            __atomic_compare_exchange(buf[byteAddress / 4], compareValue, value);
+            return;
         }
     }
 
@@ -5393,103 +4746,62 @@ ${{{{
     } // endif (type == RWByteAddressBuffer)
 }}}}
 
-    // Added operations:
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedAdd(
-        UINT dest,
-        UINT value,
-        out UINT original_value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "($3 = atomicAdd($0._data[$1/4], $2))";
-        case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint32_t>($1), $2))";
-        case hlsl: __intrinsic_asm ".InterlockedAdd";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_add(__getMetalAtomicRef(buf[dest / 4]), value, original_value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedAdd(buf[dest / 4], value, original_value);
-        }
-    }
-
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedAdd(
-        UINT dest,
-        UINT value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "atomicAdd($0._data[$1/4], $2)";
-        case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<uint32_t>($1), $2)";
-        case hlsl: __intrinsic_asm ".InterlockedAdd";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_add(__getMetalAtomicRef(buf[dest / 4]), value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedAdd(buf[dest / 4], value);
-        }
-    }
+    // 32-bit atomic operations:
+    // InterlockedMax, InterlockedMin, InterlockedAdd, InterlockedAnd, InterlockedOr, InterlockedXor, InterlockedExchange
+${{{{
+    for (auto op : bufferAtomicOps) {
+}}}}
 
+    /// Perform an atomic $(op.internalName) operation at the specified byte
+    /// location of the byte address buffer.
+    /// @param dest The byte address at which to perform the atomic $(op.internalName) operation.
+    /// @param value The operand of the atomic operation.
+    /// @param original_value The original value at `dest` before the $(op.internalName) operation.
     [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedAnd(
+    [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal, byteaddressbuffer_rw)]
+    void Interlocked$(op.name)(
         UINT dest,
         UINT value,
         out UINT original_value)
     {
         __target_switch
         {
-        case glsl: __intrinsic_asm "$3 = atomicAnd($0._data[$1/4], $2)";
-        case cuda: __intrinsic_asm "(*$3 = atomicAnd($0._getPtrAt<uint32_t>($1), $2))";
-        case hlsl: __intrinsic_asm ".InterlockedAnd";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_and(__getMetalAtomicRef(buf[dest / 4]), value, original_value);
-            return;
-        }
-        case spirv:
+        case hlsl: __intrinsic_asm ".Interlocked$(op.name)";
+        default:
             let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedAnd(buf[dest / 4], value, original_value);
+            ::Interlocked$(op.name)(buf[dest / 4], value, original_value);
         }
     }
 
     [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedAnd(
+    [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal, byteaddressbuffer_rw)]
+    void Interlocked$(op.name)(
         UINT dest,
         UINT value)
     {
         __target_switch
         {
-        case glsl: __intrinsic_asm "atomicAnd($0._data[$1/4], $2)";
-        case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt<uint32_t>($1), $2)";
-        case hlsl: __intrinsic_asm ".InterlockedAnd";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_and(__getMetalAtomicRef(buf[dest / 4]), value);
-            return;
-        }
-        case spirv:
+        case hlsl: __intrinsic_asm ".Interlocked$(op.name)";
+        default:
             let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedAnd(buf[dest / 4], value);
+            ::Interlocked$(op.name)(buf[dest / 4], value);
         }
     }
+${{{{
+} // for (buffer atomic ops)
+}}}}
 
+    /// Perform a 32-bit integer atomic compare-and-exchange operation at
+    /// the specified byte address within the `RWByteAddressBuffer`.
+    /// @param dest The address at which to perform the atomic compare-and-exchange operation.
+    /// @param compare_value The value to perform bitwise comparison to the value at `dest`.
+    /// @param value The value to store at `dest` if the comparison is successful.
+    /// @param original_value The original value at `dest` before the compare-and-exchange operation.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function
+    /// translates to `InterlockedCompareExchange`.
+    /// For CUDA, this function maps to `atomicCAS`.
     [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
+    [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal, byteaddressbuffer_rw)]
     void InterlockedCompareExchange(
         UINT dest,
         UINT compare_value,
@@ -5498,23 +4810,23 @@ ${{{{
     {
         __target_switch
         {
-        case glsl: __intrinsic_asm "($4 = atomicCompSwap($0._data[$1/4], $2, $3))";
-        case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3))";
         case hlsl: __intrinsic_asm ".InterlockedCompareExchange";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_compare_exchange(__getMetalAtomicRef(buf[dest / 4]), compare_value, value, original_value);
-            return;
-        }
-        case spirv:
+        default:
             let buf = __getEquivalentStructuredBuffer<uint>(this);
             ::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value);
         }
     }
 
+    /// Perform a 32-bit integer atomic compare-and-store operation at
+    /// the specified byte address within the `RWByteAddressBuffer`.
+    /// @param dest The address at which to perform the atomic compare-and-store operation.
+    /// @param compare_value The value to perform comparison to the value at `dest`.
+    /// @param value The value to store at `dest` if the comparison is successful.
+    /// @remarks For SPIR-V, this function maps to `OpAtomicCompareExchange`. For HLSL, this function
+    /// translates to `InterlockedCompareStore`.
+    /// For CUDA, this function maps to `atomicCAS`.
     [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
+    [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal, byteaddressbuffer_rw)]
     void InterlockedCompareStore(
         UINT dest,
         UINT compare_value,
@@ -5522,232 +4834,13 @@ ${{{{
     {
         __target_switch
         {
-        case glsl: __intrinsic_asm "atomicCompSwap($0._data[$1/4], $2, $3)";
-        case cuda: __intrinsic_asm "atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3)";
         case hlsl: __intrinsic_asm ".InterlockedCompareStore";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_compare_exchange(__getMetalAtomicRef(buf[dest / 4]), compare_value, value);
-            return;
-        }
-        case spirv:
+        default:
             let buf = __getEquivalentStructuredBuffer<uint>(this);
             ::InterlockedCompareStore(buf[dest / 4], compare_value, value);
         }
     }
 
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedExchange(
-        UINT dest,
-        UINT value,
-        out UINT original_value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "($3 = atomicExchange($0._data[$1/4], $2))";
-        case cuda: __intrinsic_asm "(*$3 = atomicExch($0._getPtrAt<uint32_t>($1), $2))";
-        case hlsl: __intrinsic_asm ".InterlockedExchange";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_exchange(__getMetalAtomicRef(buf[dest / 4]), value, original_value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedExchange(buf[dest / 4], value, original_value);
-        }
-    }
-
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedMax(
-        UINT dest,
-        UINT value,
-        out UINT original_value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "($3 = atomicMax($0._data[$1/4], $2))";
-        case cuda: __intrinsic_asm "(*$3 = atomicMax($0._getPtrAt<uint32_t>($1), $2))";
-        case hlsl: __intrinsic_asm ".InterlockedMax";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_max(__getMetalAtomicRef(buf[dest / 4]), value, original_value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedMax(buf[dest / 4], value, original_value);
-        }
-    }
-
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedMax(
-        UINT dest,
-        UINT value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "atomicMax($0._data[$1/4], $2)";
-        case cuda: __intrinsic_asm "atomicMax($0._getPtrAt<uint32_t>($1), $2)";
-        case hlsl: __intrinsic_asm ".InterlockedMax";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_max(__getMetalAtomicRef(buf[dest / 4]), value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedMax(buf[dest / 4], value);
-        }
-    }
-
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedMin(
-        UINT dest,
-        UINT value,
-        out UINT original_value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "($3 = atomicMin($0._data[$1/4], $2))";
-        case cuda: __intrinsic_asm "(*$3 = atomicMin($0._getPtrAt<uint32_t>($1), $2))";
-        case hlsl: __intrinsic_asm ".InterlockedMin";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_min(__getMetalAtomicRef(buf[dest / 4]), value, original_value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedMin(buf[dest / 4], value, original_value);
-        }
-    }
-
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedMin(
-        UINT dest,
-        UINT value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "atomicMin($0._data[$1/4], $2)";
-        case cuda: __intrinsic_asm "atomicMin($0._getPtrAt<uint32_t>($1), $2)";
-        case hlsl: __intrinsic_asm ".InterlockedMin";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_min(__getMetalAtomicRef(buf[dest / 4]), value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedMin(buf[dest / 4], value);
-        }
-    }
-
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedOr(
-        UINT dest,
-        UINT value,
-        out UINT original_value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "($3 = atomicOr($0._data[$1/4], $2))";
-        case cuda: __intrinsic_asm "(*$3 = atomicOr($0._getPtrAt<uint32_t>($1), $2))";
-        case hlsl: __intrinsic_asm ".InterlockedOr";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_or(__getMetalAtomicRef(buf[dest / 4]), value, original_value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedOr(buf[dest / 4], value, original_value);
-        }
-    }
-
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedOr(
-        UINT dest,
-        UINT value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "atomicOr($0._data[$1/4], $2)";
-        case cuda: __intrinsic_asm "atomicOr($0._getPtrAt<uint32_t>($1), $2)";
-        case hlsl: __intrinsic_asm ".InterlockedOr";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_or(__getMetalAtomicRef(buf[dest / 4]), value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedOr(buf[dest / 4], value);
-        }
-    }
-
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedXor(
-        UINT dest,
-        UINT value,
-        out UINT original_value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "($3 = atomicXor($0._data[$1/4], $2))";
-        case cuda: __intrinsic_asm "(*$3 = atomicXor($0._getPtrAt<uint32_t>($1), $2))";
-        case hlsl: __intrinsic_asm ".InterlockedXor";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_xor(__getMetalAtomicRef(buf[dest / 4]), value, original_value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedXor(buf[dest / 4], value, original_value);
-        }
-    }
-
-    [ForceInline]
-    [require(cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
-    void InterlockedXor(
-        UINT dest,
-        UINT value)
-    {
-        __target_switch
-        {
-        case glsl: __intrinsic_asm "atomicXor($0._data[$1/4], $2)";
-        case cuda: __intrinsic_asm "atomicXor($0._getPtrAt<uint32_t>($1), $2)";
-        case hlsl: __intrinsic_asm ".InterlockedXor";
-        case metal:
-        {
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            __metalInterlocked_xor(__getMetalAtomicRef(buf[dest / 4]), value);
-            return;
-        }
-        case spirv:
-            let buf = __getEquivalentStructuredBuffer<uint>(this);
-            ::InterlockedXor(buf[dest / 4], value);
-        }
-    }
 
     [ForceInline]
     [require(cpp_cuda_glsl_hlsl_metal_spirv, byteaddressbuffer_rw)]
@@ -9699,26 +8792,6 @@ void GroupMemoryBarrierWithGroupSync()
 
 // Atomics
 
-__generic<T>
-__intrinsic_op($(kIROp_MetalAtomicCast))
-[require(metal)]
-T* __getMetalAtomicRef(__ref T x);
-
-// Checks if input is a ImageSubscript
-__generic<T>
-__intrinsic_op($(kIROp_IsTextureAccess))
-bool __isTextureAccess(__ref T x);
-
-// Checks if input is a texture of T type scalar
-__generic<T>
-__intrinsic_op($(kIROp_IsTextureScalarAccess))
-bool __isTextureScalarAccess(__ref T x);
-
-// Checks if input is a texture array
-__generic<T>
-__intrinsic_op($(kIROp_IsTextureArrayAccess))
-bool __isTextureArrayAccess(__ref T x);
-
 // Accepts an ImageSubscript
 // Gets Texture used with ImageSubscript.
 __generic<TextureAccess>
@@ -9738,414 +8811,6 @@ __intrinsic_op($(kIROp_ExtractArrayCoordFromTextureAccess))
 uint __extractArrayCoordFromTextureAccess(__ref TextureAccess x);
 
 ${{{{
-for (bool isArray : {false, true})
-{
-    StringBuilder coordBuilder;
-    StringBuilder coordFetchBuilder;
-    
-    StringBuilder threeParamsASMBuilder;
-    StringBuilder threeParamsOutputParamASMBuilder;
-    
-    StringBuilder fourParamsASMBuilder;
-
-    coordBuilder << "Coord coord";
-    coordFetchBuilder << "coord";
-    
-    threeParamsASMBuilder << "$1, $2";
-
-    fourParamsASMBuilder << "$1, $2, $3";
-    if(isArray)
-    {
-        coordBuilder << ", uint arrayCoord";
-        coordFetchBuilder << ", arrayCoord";
-        threeParamsASMBuilder << ", $3";
-        fourParamsASMBuilder << ", $4";
-        threeParamsOutputParamASMBuilder << "$4";
-    }
-    else
-    {
-        threeParamsOutputParamASMBuilder << "$3";
-    }
-    auto coordString = coordBuilder.toString();
-    auto coordFetchString = coordFetchBuilder.toString();
-    
-    auto threeParamsASMString = threeParamsASMBuilder.toString();
-    auto threeParamsOutputParamASMString = threeParamsOutputParamASMBuilder.toString();
-
-    auto fourParamsASMString = fourParamsASMBuilder.toString();
-}}}}
-
-${{{{
-    for (const char* atomicOperation : {"add", "and", "max", "min", "or", "sub", "xor"})
-    {
-}}}}
-        __generic<TextureType, T, Coord>
-        [ForceInline]
-        [require(metal)]
-        vector<T, 4> __metalImageInterlocked_$(atomicOperation)(TextureType tex, $(coordString), vector<T, 4> value)
-        {
-            static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures");
-            static_assert(Coord is uint || Coord is vector<uint,2> || Coord is vector<uint,3> || Coord is vector<uint,4>,
-                            "__metalImageInterlocked implementation only allows 'uint' coordinates");
-            __intrinsic_asm "$0.atomic_fetch_$(atomicOperation)($(threeParamsASMString))";
-        }
-
-        __generic<TextureType, T, Coord>
-        [ForceInline]
-        [require(metal)]
-        void __metalImageInterlocked_$(atomicOperation)(TextureType tex, $(coordString), vector<T, 4> value, out T original_value)
-        {
-            static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures");
-            static_assert(Coord is uint || Coord is vector<uint,2> || Coord is vector<uint,3> || Coord is vector<uint,4>,
-                    "__metalImageInterlocked implementation only allows 'uint' coordinates");
-            original_value = __metalImageInterlocked_$(atomicOperation)(tex, $(coordFetchString), value)[0];
-        }
-${{{{
-    } // atomicOperation
-}}}}
-
-    __generic<TextureType, T, Coord>
-    [ForceInline]
-    [require(metal)]
-    vector<T, 4> __metalImageInterlocked_exchange(TextureType tex, $(coordString), vector<T, 4> value)
-    {
-        static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures");
-        static_assert(Coord is uint || Coord is vector<uint,2> || Coord is vector<uint,3> || Coord is vector<uint,4>,
-                    "__metalImageInterlocked implementation only allows 'uint' coordinates");
-        __intrinsic_asm "($0.atomic_exchange($(threeParamsASMString)))";
-    }
-    __generic<TextureType, T, Coord>
-    [ForceInline]
-    [require(metal)]
-    void __metalImageInterlocked_exchange(TextureType tex, $(coordString), vector<T, 4> value, out T original_value)
-    {
-        static_assert(T is int || T is uint, "Metal atomic texture operations only allow 'int'/'uint' textures");
-        static_assert(Coord is uint || Coord is vector<uint,2> || Coord is vector<uint,3> || Coord is vector<uint,4>,
-                    "__metalImageInterlocked implementation only allows 'uint' coordinates");
-        original_value = __metalImageInterlocked_exchange(tex, $(coordFetchString), value)[0];
-    }
-
-    __generic<TextureType, T, Coord>
-    [ForceInline]
-    [require(metal)]
-    void __metalImageInterlocked_compare_exchange(TextureType tex, $(coordString), __ref vector<T, 4> compare_value, vector<T, 4> value)
-    {
-        static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures");
-        static_assert(Coord is uint || Coord is vector<uint,2> || Coord is vector<uint,3> || Coord is vector<uint,4>,
-                    "__metalImageInterlocked implementation only allows 'uint' coordinates");
-        __intrinsic_asm "($0.atomic_compare_exchange_weak($(fourParamsASMString)))";
-    }
-    __generic<TextureType, T, Coord>
-    [ForceInline]
-    [require(metal)]
-    void __metalImageInterlocked_compare_exchange(TextureType tex, $(coordString), vector<T, 4> compare_value, vector<T, 4> value, out T original_value)
-    {
-        static_assert(T is int || T is uint, "__metalImageInterlocked only allows 'int'/'uint' textures");
-        static_assert(Coord is uint || Coord is vector<uint,2> || Coord is vector<uint,3> || Coord is vector<uint,4>,
-                    "__metalImageInterlocked implementation only allows 'uint' coordinates");
-        __metalImageInterlocked_compare_exchange(tex, $(coordFetchString), compare_value, value);
-        original_value = compare_value[0];
-    }
-
-${{{{
-} // isArray
-}}}}
-
-${{{{
-
-// Generated functions:
-
-// atomicAdd, InterlockedAdd, atomic_fetch_add_explicit, OpAtomicIAdd, OpAtomicFAddEXT
-// __cudaInterlocked_add, __glslInterlocked_add, __hlslInterlocked_add, __metalInterlocked_add, __spirvInterlocked_add
-
-// atomicAnd, InterlockedAnd, atomic_fetch_and_explicit, OpAtomicAnd
-// __cudaInterlocked_and, __glslInterlocked_and, __hlslInterlocked_and, __metalInterlocked_and, __spirvInterlocked_and
-
-// atomicMax, InterlockedMax, atomic_fetch_max_explicit, OpAtomicUMax, OpAtomicSMax, OpAtomicFMaxEXT
-// __cudaInterlocked_max, __glslInterlocked_max, __hlslInterlocked_max, __metalInterlocked_max, __spirvInterlocked_max
-
-// atomicMin, InterlockedMin, atomic_fetch_min_explicit, OpAtomicUMin, OpAtomicSMin, OpAtomicFMinEXT
-// __cudaInterlocked_min, __glslInterlocked_min, __hlslInterlocked_min, __metalInterlocked_min, __spirvInterlocked_min
-
-// atomicOr, InterlockedOr, atomic_fetch_or_explicit, OpAtomicOr
-// __cudaInterlocked_or, __glslInterlocked_or, __hlslInterlocked_or, __metalInterlocked_or, __spirvInterlocked_or
-
-// atomicXor, InterlockedXor, atomic_fetch_xor_explicit, OpAtomicXor
-// __cudaInterlocked_xor, __glslInterlocked_xor, __hlslInterlocked_xor, __metalInterlocked_xor, __spirvInterlocked_xor
-
-// atomicExchange, atomicExch, InterlockedExchange, atomic_exchange_explicit, OpAtomicExchange
-// __cudaInterlocked_exchange, __glslInterlocked_exchange, __hlslInterlocked_exchange, __metalInterlocked_exchange, __spirvInterlocked_exchange
-
-struct InternalAtomicOperationInfo
-{
-    const char* slangSuffix;
-    const char* cudaSuffix;
-    const char* glslSuffix;
-    const char* hlslSuffix;
-    const char* metalSuffix;
-    const char* spirvFloatSuffix;
-    const char* spirvUIntSuffix;
-    const char* spirvIntSuffix;
-
-    const char* assertExpr;
-};
-
-InternalAtomicOperationInfo internalAtomicOperationInfo[7] = {
-    { "add", "Add", "Add", "Add", "fetch_add", "FAddEXT", "IAdd", "IAdd", "true" },
-    { "and", "And", "And", "And", "fetch_and", "And", "And", "And", "!__isFloat<T>()" },
-    { "max", "Max", "Max", "Max", "fetch_max", "FMaxEXT", "UMax", "SMax", "true" },
-    { "min", "Min", "Min", "Min", "fetch_min", "FMinEXT", "UMin", "SMin", "true" },
-    { "or", "Or", "Or", "Or", "fetch_or", "Or", "Or", "Or", "!__isFloat<T>()" },
-    { "xor", "Xor", "Xor", "Xor", "fetch_xor", "Xor", "Xor", "Xor", "!__isFloat<T>()" },
-    { "exchange", "Exch", "Exchange", "Exchange", "exchange", "Exchange", "Exchange", "Exchange", "true" },
-};
-
-for (InternalAtomicOperationInfo atomicOp : internalAtomicOperationInfo)
-{
-}}}}
-    __generic<AtomicType, T>
-    [ForceInline]
-    [require(metal)]
-    void __metalInterlocked_$(atomicOp.slangSuffix)(AtomicType dest, T value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        __intrinsic_asm "atomic_$(atomicOp.metalSuffix)_explicit($0, $1, memory_order_relaxed)";
-    }
-
-    __generic<AtomicType, T>
-    [ForceInline]
-    [require(metal)]
-    void __metalInterlocked_$(atomicOp.slangSuffix)(AtomicType dest, T value, out T original_value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        __intrinsic_asm "((*($2)) = (atomic_$(atomicOp.metalSuffix)_explicit($0, $1, memory_order_relaxed)))";
-    }
-
-    __generic<T>
-    [ForceInline]
-    [require(cuda)]
-    void __cudaInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        __intrinsic_asm "atomic$(atomicOp.cudaSuffix)((int*)$0, $1)";
-    }
-
-    __generic<T>
-    [ForceInline]
-    [require(cuda)]
-    void __cudaInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value, out T original_value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        __intrinsic_asm "(*$2 = atomic$(atomicOp.cudaSuffix)((int*)$0, $1))";
-    }
-
-    __generic<T>
-    [ForceInline]
-    [require(glsl)]
-    void __glslInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        __intrinsic_asm "$atomic$(atomicOp.glslSuffix)($A, $1)";
-    }
-
-    __generic<T>
-    [ForceInline]
-    [require(glsl)]
-    void __glslInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value, out T original_value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        __intrinsic_asm "($2 = $atomic$(atomicOp.glslSuffix)($A, $1))";
-    }
-
-    __generic<T>
-    [ForceInline]
-    [require(hlsl)]
-    void __hlslInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        __intrinsic_asm "Interlocked$(atomicOp.hlslSuffix)";
-    }
-
-    __generic<T>
-    [ForceInline]
-    [require(hlsl)]
-    void __hlslInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value, out T original_value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        __intrinsic_asm "Interlocked$(atomicOp.hlslSuffix)";
-    }
-
-    __generic<T>
-    [ForceInline]
-    [require(spirv)]
-    void __spirvInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        if (__isFloat<T>())
-        {
-            spirv_asm
-            {
-                result:$$T = OpAtomic$(atomicOp.spirvFloatSuffix) &dest Device None $value
-            };
-        }
-        else if (__isUnsignedInt<T>())
-        {
-            spirv_asm
-            {
-                result:$$T = OpAtomic$(atomicOp.spirvUIntSuffix) &dest Device None $value
-            };
-        }
-        else if (__isInt<T>())
-        {
-            spirv_asm
-            {
-                result:$$T = OpAtomic$(atomicOp.spirvIntSuffix) &dest Device None $value
-            };
-        }
-    }
-
-    __generic<T>
-    [ForceInline]
-    [require(spirv)]
-    void __spirvInterlocked_$(atomicOp.slangSuffix)(__ref T dest, T value, out T original_value)
-    {
-        static_assert($(atomicOp.assertExpr), "Unable to use float with Atomic$(atomicOp.slangSuffix)"); 
-        if (__isFloat<T>())
-        {
-            spirv_asm
-            {
-                %original:$$T = OpAtomic$(atomicOp.spirvFloatSuffix) &dest Device None $value;
-                OpStore &original_value %original
-            };
-        }
-        else if (__isUnsignedInt<T>())
-        {
-            spirv_asm
-            {
-                %original:$$T = OpAtomic$(atomicOp.spirvUIntSuffix) &dest Device None $value;
-                OpStore &original_value %original
-            };
-        }
-        else if (__isInt<T>())
-        {
-            spirv_asm
-            {
-                %original:$$T = OpAtomic$(atomicOp.spirvIntSuffix) &dest Device None $value;
-                OpStore &original_value %original
-            };
-        }
-    }
-
-${{{{
-} // fetchAndModify
-}}}}
-
-__generic<AtomicType, T>
-[ForceInline]
-[require(metal)]
-void __metalInterlocked_compare_exchange(AtomicType dest, __ref T compare_value, T value)
-{
-    __intrinsic_asm "atomic_compare_exchange_weak_explicit($0, $1, $2, memory_order_relaxed, memory_order_relaxed)";
-}
-
-__generic<AtomicType, T>
-[ForceInline]
-[require(metal)]
-void __metalInterlocked_compare_exchange(AtomicType dest, T compare_value, T value, out T original_value)
-{
-    __metalInterlocked_compare_exchange(dest, compare_value, value);
-    original_value = compare_value;
-}
-
-__generic<T>
-__glsl_version(430)
-[ForceInline]
-[require(cuda)]
-void __cudaInterlocked_compare_exchange(__ref T dest, __ref T compare_value, T value)
-{
-    __intrinsic_asm "atomicCAS($0, $1, $2)";
-}
-
-__generic<T>
-[ForceInline]
-[require(cuda)]
-void __cudaInterlocked_compare_exchange(__ref T dest, T compare_value, T value, out T original_value)
-{
-    __intrinsic_asm "*$3 = atomicCAS($0, $1, $2)";
-}
-
-__generic<T>
-[ForceInline]
-[require(glsl)]
-void __glslInterlocked_compare_exchange(__ref T dest, __ref T compare_value, T value)
-{
-    __intrinsic_asm "$atomicCompSwap($A, $1, $2)";
-}
-
-__generic<T>
-[ForceInline]
-[require(glsl)]
-void __glslInterlocked_compare_exchange(__ref T dest, T compare_value, T value, out T original_value)
-{
-    __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))";
-}
-
-__generic<T>
-[ForceInline]
-[require(hlsl)]
-void __hlslInterlocked_compare_exchange(__ref T dest, __ref T compare_value, T value)
-{
-    __intrinsic_asm "InterlockedCompareExchange";
-}
-
-__generic<T>
-[ForceInline]
-[require(hlsl)]
-void __hlslInterlocked_compare_exchange(__ref T dest, T compare_value, T value, out T original_value)
-{
-    __intrinsic_asm "InterlockedCompareExchange";
-}
-
-__generic<T>
-[ForceInline]
-[require(spirv)]
-void __spirvInterlocked_compare_exchange(__ref T dest, __ref T compare_value, T value)
-{
-    spirv_asm
-    {
-        %result:$$T = OpAtomicCompareExchange &dest Device None None $value $compare_value;
-    };
-}
-
-__generic<T>
-[ForceInline]
-[require(spirv)]
-void __spirvInterlocked_compare_exchange(__ref T dest, T compare_value, T value, out T original_value)
-{
-    spirv_asm
-    {
-        %original:$$T = OpAtomicCompareExchange &dest Device None None $value $compare_value;
-        OpStore &original_value %original
-    };
-}
-
-__generic<T>
-[ForceInline]
-[require(hlsl)]
-void __hlslInterlocked_compare_exchange_float_bitwise(__ref T dest, T compare_value, T value)
-{
-    __intrinsic_asm "InterlockedCompareExchangeFloatBitwise";
-}
-
-__generic<T>
-[ForceInline]
-[require(hlsl)]
-void __hlslInterlocked_compare_exchange_float_bitwise(__ref T dest, T compare_value, T value, out T original_value)
-{
-    __intrinsic_asm "InterlockedCompareExchangeFloatBitwise";
-}
-
-${{{{
 // Generates code for:
 // InterlockedAdd, InterlockedAnd, InterlockedOr, InterlockedXor,
 // InterlockedMax, InterlockedMin, InterlockedExchange
@@ -10153,516 +8818,166 @@ struct SlangAtomicOperationInfo
 {
     const char* slangCallSuffix;
     const char* internalCallSuffix;
+    const char* interface;
 };
 
 SlangAtomicOperationInfo slangAtomicOperationInfo[7] = {
-    { "Add", "add" },
-    { "And", "and" },
-    { "Or", "or" },
-    { "Xor", "xor" },
-    { "Max", "max" },
-    { "Min", "min" },
-    { "Exchange", "exchange" },
+    { "Add", "add", "IArithmeticAtomicable" },
+    { "And", "and", "IArithmeticAtomicable" },
+    { "Or", "or", "IArithmeticAtomicable" },
+    { "Xor", "xor", "IArithmeticAtomicable" },
+    { "Max", "max", "IArithmeticAtomicable" },
+    { "Min", "min", "IArithmeticAtomicable" },
+    { "Exchange", "exchange", "IAtomicable" },
 };
 
 for (SlangAtomicOperationInfo atomicOp : slangAtomicOperationInfo)
 {
-    for(const char* T : {"int", "uint"})
-    {
 }}}}
 
+/// Perform an atomic $(atomicOp.internalCallSuffix) operation on `dest`.
+/// @param T The type of the value to perform the atomic operation on.
+/// @param dest The value to perform the atomic operation on.
+/// @param value The operand to the atomic operation.
+/// @param original_value The value of `dest` before the operation.
+/// @remarks When targeting HLSL, it is invalid to call this function with `T` being a floating-point type, since
+/// HLSL does not allow atomic operations on floating-point types. For `InterlockedAdd`, consider using
+/// `RWByteAddressBuffer.InterlockedAddF32` or `RWByteAddressBuffer.InterlockedAddF16` instead when NVAPI is available.
+/// On SPIR-V (Vulkan), all integer and floating-point types are supported.
+/// On Metal and WGSL, floating-point types are not supported.
+/// @category atomic Atomic functions
 [ForceInline]
 __glsl_version(430)
 [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)]
-void Interlocked$(atomicOp.slangCallSuffix)(__ref $(T) dest,  $(T) value)
+void Interlocked$(atomicOp.slangCallSuffix)<T:$(atomicOp.interface)>(__ref T dest,  T value)
 {
-    static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture");
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_$(atomicOp.internalCallSuffix)(dest, value);
-    case cuda: __cudaInterlocked_$(atomicOp.internalCallSuffix)(dest, value);
-    case glsl: __glslInterlocked_$(atomicOp.internalCallSuffix)(dest, value);
-    case spirv: __spirvInterlocked_$(atomicOp.internalCallSuffix)(dest, value);
-    case metal:
-        if (__isTextureAccess(dest))
-        {
-            if(__isTextureArrayAccess(dest))
-            {
-                __metalImageInterlocked_$(atomicOp.internalCallSuffix)(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vector<$(T), 4>(value));
-            }
-            else
-            {
-                __metalImageInterlocked_$(atomicOp.internalCallSuffix)(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), vector<$(T), 4>(value));
-            }
-        }
-        else
-        {
-            __metalInterlocked_$(atomicOp.internalCallSuffix)(__getMetalAtomicRef(dest), value);
-        }
-        return;
-    }
+    __atomic_$(atomicOp.internalCallSuffix)(dest, value);
 }
 
 [ForceInline]
 __glsl_version(430)
 [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)]
-void Interlocked$(atomicOp.slangCallSuffix)(__ref $(T) dest, $(T) value, out $(T) original_value)
+void Interlocked$(atomicOp.slangCallSuffix)<T:$(atomicOp.interface)>(__ref T dest, T value, out T original_value)
 {
-    static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to a scalar texture or non-texture");
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_$(atomicOp.internalCallSuffix)(dest, value, original_value);
-    case cuda: __cudaInterlocked_$(atomicOp.internalCallSuffix)(dest, value, original_value);
-    case glsl: __glslInterlocked_$(atomicOp.internalCallSuffix)(dest, value, original_value);
-    case spirv: __spirvInterlocked_$(atomicOp.internalCallSuffix)(dest, value, original_value);
-    case metal:
-        if (__isTextureAccess(dest))
-            if(__isTextureArrayAccess(dest))
-            {
-                __metalImageInterlocked_$(atomicOp.internalCallSuffix)(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vector<$(T),4>(value), original_value);
-            }
-            else
-            {
-                __metalImageInterlocked_$(atomicOp.internalCallSuffix)(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), vector<$(T),4>(value), original_value);
-            }
-        else
-            __metalInterlocked_$(atomicOp.internalCallSuffix)(__getMetalAtomicRef(dest), value, original_value);
-        return;
-    }
+    original_value = __atomic_$(atomicOp.internalCallSuffix)(dest, value);
 }
 
-${{{{
-    } // for(const char* T : {"int64_t", "uint64_t"})
-}}}}
-
 [ForceInline]
+__glsl_version(430)
+[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)]
 void Interlocked$(atomicOp.slangCallSuffix)(__ref uint dest, int value)
 {
-    Interlocked$(atomicOp.slangCallSuffix)(dest, (uint)value);
+    __atomic_$(atomicOp.internalCallSuffix)(dest, (uint)value);
 }
 
 ${{{{
 } // for (SlangAtomicOperationInfo atomicOp : slangAtomicOperationInfo)
 }}}}
 
-${{{{
-for(const char* T : {"int64_t", "uint64_t"})
-{
-}}}}
-/// @category atomic Atomic functions
-[ForceInline]
-[require(cuda_glsl_hlsl_spirv, atomic_glsl_hlsl_cuda_metal)]
-void InterlockedAdd(__ref $(T) dest, $(T) value)
-{   
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_add(dest, value);
-    case cuda: __cudaInterlocked_add(dest, value);
-    case glsl:
-        __requireGLSLExtension("GL_EXT_shader_atomic_int64");
-        __glslInterlocked_add(dest, value);
-    case spirv:
-        spirv_asm
-        {
-            OpCapability Int64Atomics;
-            result:$$$(T) = OpAtomicIAdd &dest Device None $value;
-        };
-    }
-}
-
-[ForceInline]
-void InterlockedAdd(__ref $(T) dest, $(T) value, out $(T) original_value)
-{
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_add(dest, value, original_value);
-    case cuda: __cudaInterlocked_add(dest, value, original_value);
-    case glsl:
-        __requireGLSLExtension("GL_EXT_shader_atomic_int64");
-        __glslInterlocked_add(dest, value, original_value);
-    case spirv:
-        spirv_asm
-        {
-            OpCapability Int64Atomics;
-            %origin:$$$(T) = OpAtomicIAdd &dest Device None $value;
-            OpStore &original_value %origin
-        };
-    }
-}
-
-/// @category atomic
-[ForceInline]
-void InterlockedAnd(__ref $(T) dest, $(T) value)
-{
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_and(dest, value);
-    }
-}
-
-[ForceInline]
-void InterlockedAnd(__ref $(T) dest, $(T) value, out $(T) original_value)
-{
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_and(dest, value, original_value);
-    }
-}
-
-/// @category atomic
-[ForceInline]
-void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value)
-{
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_compare_exchange(dest, compare_value, value);
-    }
-}
-
-[ForceInline]
-void InterlockedCompareExchange(__ref $(T) dest, $(T) compare_value, $(T) value, out $(T) original_value)
-{
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_compare_exchange(dest, compare_value, value, original_value);
-    }
-}
-
-[ForceInline]
-void InterlockedCompareStore(__ref $(T) dest, $(T) compare_value, $(T) value);
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedCompareStore";
-    }
-}
-
-/// @category atomic
-[ForceInline]
-void InterlockedExchange(__ref $(T) dest, $(T) value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedExchange";
-    }
-}
-
-[ForceInline]
-void InterlockedExchange(__ref $(T) dest, $(T) value, out $(T) original_value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedExchange";
-    }
-}
-
-/// @category atomic
-[ForceInline]
-void InterlockedMax(__ref $(T) dest, $(T) value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedMax";
-    }
-}
-
-[ForceInline]
-void InterlockedMax(__ref $(T) dest, $(T) value, out $(T) original_value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedMax";
-    }
-}
-
-/// @category atomic
-[ForceInline]
-void InterlockedMin(__ref $(T) dest, $(T) value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedMin";
-    }
-}
-
-[ForceInline]
-void InterlockedMin(__ref $(T) dest, $(T) value, out $(T) original_value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedMin";
-    }
-}
-
-/// @category atomic
-[ForceInline]
-void InterlockedOr(__ref  $(T) dest,  $(T) value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedOr";
-    }
-}
-
-[ForceInline]
-void InterlockedOr(__ref $(T) dest, $(T) value, out $(T) original_value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedOr";
-    }
-}
-
-/// @category atomic
-[ForceInline]
-void InterlockedXor(__ref $(T) dest, $(T) value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedXor";
-    }
-}
-
-[ForceInline]
-void InterlockedXor(__ref $(T) dest, $(T) value, out $(T) original_value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedXor";
-    }
-}
-
-${{{{
-} // for(const char* T : {"int64_t", "uint64_t"})
-}}}}
-
+/// Perform an atomic compare and exchange operation on `dest`.
+/// @param T The type of the value to perform the atomic operation on.
+/// @param dest The value to perform the atomic operation on.
+/// @param compare_value The value to compare `dest` with.
+/// @param value The value to store into `dest` if the compare result is equal.
+/// @param original_value The value of `dest` before the operation.
+/// @remarks When targeting HLSL, a call to this function with `T` being `float` will translate to a call to
+/// `InterlockedCompareExchangeFloatBitwise`, which means the comparison is done as a bitwise comparison.
+///
+/// On SPIR-V (Vulkan), this function maps to `OpAtomicCompareExchange`.
+///
+/// On Metal and WGSL, floating-point types are not supported.
+///
+/// On CUDA, this function maps to `atomicCAS`.
 /// @category atomic
 [ForceInline]
-__glsl_version(430)
 [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)]
-void InterlockedCompareExchange(__ref int dest, int compare_value, int value, out int original_value)
+void InterlockedCompareExchange<T:IAtomicable>(__ref T dest, T compare_value, T value, out T original_value)
 {
-    static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture");
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_compare_exchange(dest, compare_value, value, original_value);
-    case glsl: __glslInterlocked_compare_exchange(dest, compare_value, value, original_value);
-    case cuda: __cudaInterlocked_compare_exchange(dest, compare_value, value, original_value);
-    case spirv: __spirvInterlocked_compare_exchange(dest, compare_value, value, original_value);
-    case metal:
-        if (__isTextureAccess(dest))
-        {
-            vector<int, 4> vec_compare_value = vector<int, 4>(compare_value);
-            if(__isTextureArrayAccess(dest))
-            {
-                __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector<int, 4>(value), original_value);
-            }
-            else
-            {
-                __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), vec_compare_value, vector<int, 4>(value), original_value);
-            }
-        }
-        else
-        {
-            __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value);
-        }
-        return;
-    }
-}
-
-[ForceInline]
-__glsl_version(430)
-[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)]
-void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value)
-{
-    static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture");
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_compare_exchange(dest, compare_value, value, original_value);
-    case cuda: __cudaInterlocked_compare_exchange(dest, compare_value, value, original_value);
-    case glsl: __glslInterlocked_compare_exchange(dest, compare_value, value, original_value);
-    case spirv: __spirvInterlocked_compare_exchange(dest, compare_value, value, original_value);
-    case metal:
-        if (__isTextureAccess(dest))
-        {
-            vector<uint, 4> vec_compare_value = vector<uint, 4>(compare_value);
-            if(__isTextureArrayAccess(dest))
-            {
-                __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector<uint, 4>(value), original_value);
-            }
-            else
-            {
-                __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), vec_compare_value, vector<uint, 4>(value), original_value);
-            }
-        }
-        else
-        {
-            __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value);
-        }
-        return;
-    }
+    original_value = __atomic_compare_exchange(dest, compare_value, value);
 }
 
+/// Perform an atomic compare and exchange operation on `dest`.
+/// @param T The type of the value to perform the atomic operation on.
+/// @param dest The value to perform the atomic operation on.
+/// @param compare_value The value to compare `dest` with.
+/// @param value The value to store into `dest` if the compare result is equal.
+/// @param original_value The value of `dest` before the operation.
+/// @remarks When targeting HLSL, a call to this function will translate to a call to
+/// `InterlockedCompareExchangeFloatBitwise`, which means the comparison is done as a bitwise comparison.
+///
+/// On SPIR-V (Vulkan), this function maps to `OpAtomicCompareExchange`.
+///
+/// On Metal and WGSL, this function is not available.
+///
+/// On CUDA, this function maps to `atomicCAS`.
 /// @category atomic
 [ForceInline]
 void InterlockedCompareExchangeFloatBitwise(__ref  float dest, float compare_value, float value)
 {
-    static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture");
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_compare_exchange_float_bitwise(dest, compare_value, value);
-    case metal:
-        static_assert(!__isTextureAccess(dest), "float atomic texture operations are disallowed with Metal target's");
-        __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value);
-        return;
-    }
+    __atomic_compare_exchange(dest, compare_value, value);
 }
 
 [ForceInline]
 void InterlockedCompareExchangeFloatBitwise(__ref  float dest, float compare_value, float value, out float original_value)
 {
-    static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture");
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_compare_exchange_float_bitwise(dest, compare_value, value, original_value);
-    case metal:
-        static_assert(!__isTextureAccess(dest), "float atomic texture operations are disallowed with Metal target's");
-        __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value, original_value);
-        return;
-    }
+    original_value = __atomic_compare_exchange(dest, compare_value, value);
 }
 
+/// Perform an atomic compare and store operation on `dest`.
+/// @param T The type of the value to perform the atomic operation on.
+/// @param dest The value to perform the atomic operation on.
+/// @param compare_value The value to compare `dest` with.
+/// @param value The value to store into `dest` if the compare result is equal.
+/// @remarks When targeting HLSL, a call to this function will translate to a call to
+/// `InterlockedCompareStore` for every `T`; use `InterlockedCompareStoreFloatBitwise` for a bitwise `float` comparison.
+///
+/// On SPIR-V (Vulkan), this function maps to `OpAtomicCompareExchange`.
+///
+/// On Metal and WGSL, this function is not available.
+///
+/// On CUDA, this function maps to `atomicCAS`.
 /// @category atomic
 [ForceInline]
 __glsl_version(430)
 [require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)]
-void InterlockedCompareStore(__ref int dest,  int compare_value,  int value)
+void InterlockedCompareStore<T:IAtomicable>(__ref T dest,  T compare_value,  T value)
 {
     __target_switch
     {
     case hlsl: __intrinsic_asm "InterlockedCompareStore";
-    case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)";
-    case cuda: __intrinsic_asm "atomicCAS($0, $1, $2)";
-    case spirv:
-    {
-        spirv_asm
-        {
-            result:$$int = OpAtomicCompareExchange &dest Device None None $value $compare_value;
-        };
-        return;
-    }
-    case metal:
-    {
-        if (__isTextureAccess(dest))
-        {
-            vector<int, 4> vec_compare_value = vector<int, 4>(compare_value);
-            if(__isTextureArrayAccess(dest))
-            {
-                __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector<int, 4>(value));
-            }
-            else
-            {
-                __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), vec_compare_value, vector<int, 4>(value));
-            }
-        }
-        else
-        {
-            __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value);
-        }
-        return;
-    }
-    }
-}
-
-[ForceInline]
-__glsl_version(430)
-[require(cuda_glsl_hlsl_metal_spirv, atomic_glsl_hlsl_cuda_metal)]
-void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value)
-{
-    __target_switch
-    {
-    case hlsl: __intrinsic_asm "InterlockedCompareStore";
-    case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)";
-    case cuda: __intrinsic_asm "atomicCAS((int*)$0, $1, $2)";
-    case spirv:
-        spirv_asm
-        {
-            result:$$uint = OpAtomicCompareExchange &dest Device None None $value $compare_value;
-        };
-    case metal:
-        if (__isTextureAccess(dest))
-        {
-            vector<uint, 4> vec_compare_value = vector<uint, 4>(compare_value);
-            if(__isTextureArrayAccess(dest))
-            {
-                __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), __extractArrayCoordFromTextureAccess(dest), vec_compare_value, vector<uint, 4>(value));
-            }
-            else
-            {
-                __metalImageInterlocked_compare_exchange(__extractTextureFromTextureAccess(dest), 
-                    __extractCoordFromTextureAccess(dest), vec_compare_value, vector<uint, 4>(value));
-            }
-        }
-        else
-        {
-            __metalInterlocked_compare_exchange(__getMetalAtomicRef(dest), compare_value, value);
-        }
+    default:
+        __atomic_compare_exchange(dest, compare_value, value);
         return;
     }
 }
 
+/// Perform an atomic compare and store operation on `dest`.
+/// @param T The type of the value to perform the atomic operation on.
+/// @param dest The value to perform the atomic operation on.
+/// @param compare_value The value to compare `dest` with.
+/// @param value The value to store into `dest` if the compare result is equal.
+/// @remarks When targeting HLSL, a call to this function will translate to a call to
+/// `InterlockedCompareStoreFloatBitwise`, which means the comparison is done as a bitwise comparison.
+///
+/// On SPIR-V (Vulkan), this function maps to `OpAtomicCompareExchange`.
+///
+/// On Metal and WGSL, this function is not available.
+///
+/// On CUDA, this function maps to `atomicCAS`.
 /// @category atomic
 [ForceInline]
-void InterlockedCompareStoreFloatBitwise(__ref  float dest,  float compare_value, float value)
+void InterlockedCompareStoreFloatBitwise<T:IAtomicable>(__ref  T dest,  T compare_value, T value)
 {
     __target_switch
     {
     case hlsl: __intrinsic_asm "InterlockedCompareStoreFloatBitwise";
-    }
-}
-
-/// @category atomic
-[ForceInline]
-void InterlockedExchange(__ref  float dest,  float value)
-{
-    static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture");
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_exchange(dest, value);
-    case metal:
-        static_assert(!__isTextureAccess(dest), "'float' atomic texture operations are disallowed with Metal target's");
-        __metalInterlocked_exchange(__getMetalAtomicRef(dest), value);
-        return;
-    }
-}
-
-[ForceInline]
-void InterlockedExchange(__ref  float dest,  float value, out  float original_value)
-{
-    static_assert(__isTextureScalarAccess(dest) || !__isTextureAccess(dest), "Atomic must be applied to scalar texture or non-texture");
-    __target_switch
-    {
-    case hlsl: __hlslInterlocked_exchange(dest, value, original_value);
-    case metal:
-        static_assert(!__isTextureAccess(dest), "'float' atomic texture operations are disallowed with Metal target's");
-        __metalInterlocked_exchange(__getMetalAtomicRef(dest), value, original_value);
+    default:
+        __atomic_compare_exchange(dest, compare_value, value);
         return;
     }
 }
 
 
-
 /// Test if a floating-point value finite.
 /// @category math
 __generic<T : __BuiltinFloatingPointType>
@@ -21245,13 +19560,13 @@ extension _Texture<float, Shape, 0, 0, 0, $(kStdlibResourceAccessReadWrite), 0,
     {
         __target_switch
         {
-        case spirv:
-            originalValue = __atomicAdd(this[coord], value);
+        default:
+            originalValue = __atomic_add(this[coord], value);
             return;
-        case glsl:
-            __intrinsic_asm "$3 = imageAtomicAdd($0, $1, $2)";
         case hlsl:
             __intrinsic_asm "$3 = NvInterlockedAddFp32($0, $1, $2)";
+        case glsl:
+            __intrinsic_asm "$3 = imageAtomicAdd($0, $1, $2)";
         }
     }
 
diff --git a/source/slang/slang-diagnostic-defs.h b/source/slang/slang-diagnostic-defs.h
index 298c79f7e..48b296ce3 100644
--- a/source/slang/slang-diagnostic-defs.h
+++ b/source/slang/slang-diagnostic-defs.h
@@ -883,6 +883,7 @@ DIAGNOSTIC(55200, Error, unsupportedBuiltinType, "'$0' is not a supported builti
 DIAGNOSTIC(55201, Error, unsupportedRecursion, "recursion detected in call to '$0', but the current code generation target does not allow recursion.")
 DIAGNOSTIC(55202, Error, systemValueAttributeNotSupported, "system value semantic '$0' is not supported for the current target.")
 DIAGNOSTIC(55203, Error, systemValueTypeIncompatible, "system value semantic '$0' should have type '$1' or be convertible to type '$1'.")
+DIAGNOSTIC(55204, Error, unsupportedTargetIntrinsic, "intrinsic operation '$0' is not supported for the current target.")
 DIAGNOSTIC(56001, Error, unableToAutoMapCUDATypeToHostType, "Could not automatically map '$0' to a host type. Automatic binding generation failed for '$1'")
 DIAGNOSTIC(56002, Error, attemptToQuerySizeOfUnsizedArray, "cannot obtain the size of an unsized array.")
 
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index b113f726e..79a9b1a56 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -2472,6 +2472,16 @@ void CLikeSourceEmitter::defaultEmitInstExpr(IRInst* inst, const EmitOpInfo& inO
         }
         break;
 
+    case kIROp_GetEquivalentStructuredBuffer:
+        {
+            auto base = inst->getOperand(0);
+            emitOperand(base, outerPrec);
+            m_writer->emit(".asStructuredBuffer<");
+            emitType(as<IRHLSLStructuredBufferTypeBase>(inst->getDataType())->getElementType());
+            m_writer->emit(">()");
+        }
+        break;
+
     case kIROp_RWStructuredBufferStore:
         {
             auto base = inst->getOperand(0);
diff --git a/source/slang/slang-emit-c-like.h b/source/slang/slang-emit-c-like.h
index 3cccad9e6..f0d703b40 100644
--- a/source/slang/slang-emit-c-like.h
+++ b/source/slang/slang-emit-c-like.h
@@ -260,7 +260,6 @@ public:
     bool hasExplicitConstantBufferOffset(IRInst* cbufferType);
     bool isSingleElementConstantBuffer(IRInst* cbufferType);
     bool shouldForceUnpackConstantBufferElements(IRInst* cbufferType);
-
     //
     // Expressions
     //
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index 81bcafeb3..7d104ff1b 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -515,7 +515,17 @@ bool CUDASourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitInstResultDecl(inst);
         m_writer->emit("atomicAdd(");
+        bool needCloseTypeCast = false;
+        if (inst->getDataType()->getOp() == kIROp_Int64Type)
+        {
+            m_writer->emit("(unsigned long long*)(");
+            needCloseTypeCast = true;
+        }
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (needCloseTypeCast)
+        {
+            m_writer->emit(")");
+        }
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(");\n");
@@ -525,7 +535,17 @@ bool CUDASourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitInstResultDecl(inst);
         m_writer->emit("atomicAdd(");
+        bool needCloseTypeCast = false;
+        if (inst->getDataType()->getOp() == kIROp_Int64Type)
+        {
+            m_writer->emit("(unsigned long long*)(");
+            needCloseTypeCast = true;
+        }
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (needCloseTypeCast)
+        {
+            m_writer->emit(")");
+        }
         m_writer->emit(", -(");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit("));\n");
diff --git a/source/slang/slang-emit-glsl.cpp b/source/slang/slang-emit-glsl.cpp
index ca5569602..7f8bc14b4 100644
--- a/source/slang/slang-emit-glsl.cpp
+++ b/source/slang/slang-emit-glsl.cpp
@@ -2153,8 +2153,50 @@ bool GLSLSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
     return false;
 }
 
+static IRImageSubscript* isTextureAccess(IRInst* inst)
+{
+    return as<IRImageSubscript>(getRootAddr(inst->getOperand(0)));
+}
+
+void GLSLSourceEmitter::emitAtomicImageCoord(IRImageSubscript* inst)
+{
+    emitOperand(inst->getImage(), getInfo(EmitOp::General));
+    m_writer->emit(", ");
+    if (auto vecType = as<IRVectorType>(inst->getCoord()->getDataType()))
+    {
+        m_writer->emit("ivec");
+        m_writer->emit(getIntVal(vecType->getElementCount()));
+    }
+    else
+    {
+        m_writer->emit("int");
+    }
+    m_writer->emit("(");
+    emitOperand(inst->getCoord(), getInfo(EmitOp::General));
+    m_writer->emit(")");
+    if (inst->hasSampleCoord())
+    {
+        m_writer->emit(", ");
+        emitOperand(inst->getSampleCoord(), getInfo(EmitOp::General));
+    }
+}
+
 bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
 {
+    auto requireAtomicExtIfNeeded = [&]()
+        {
+            if (isFloatingType(inst->getDataType()))
+            {
+                _requireGLSLExtension(toSlice("GL_EXT_shader_atomic_float"));
+            }
+            if (isIntegralType(inst->getDataType()))
+            {
+                if (getIntTypeInfo(inst->getDataType()).width == 64)
+                {
+                    _requireGLSLExtension(toSlice("GL_EXT_shader_atomic_int64"));
+                }
+            }
+        };
     switch (inst->getOp())
     {
     case kIROp_StructuredBufferGetDimensions:
@@ -2176,24 +2218,52 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     case kIROp_AtomicLoad:
     {
         emitInstResultDecl(inst);
-        emitDereferenceOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageLoad(");
+            emitAtomicImageCoord(imageSubscript);
+            m_writer->emit(")");
+        }
+        else
+        {
+            emitDereferenceOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(";\n");
         return true;
     }
     case kIROp_AtomicStore:
     {
-        emitInstResultDecl(inst);
-        emitDereferenceOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(" = ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(";\n");
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageStore(");
+            emitAtomicImageCoord(imageSubscript);
+            m_writer->emit(", ");
+            emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+            m_writer->emit(");\n"); // close the call AND terminate the statement, like the buffer branch below
+        }
+        else
+        {
+            emitDereferenceOperand(inst->getOperand(0), getInfo(EmitOp::General));
+            m_writer->emit(" = ");
+            emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+            m_writer->emit(";\n");
+        }
         return true;
     }
     case kIROp_AtomicExchange:
     {
+        requireAtomicExtIfNeeded();
         emitInstResultDecl(inst);
-        m_writer->emit("atomicExchange(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicExchange(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicExchange(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(");\n");
@@ -2201,9 +2271,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicCompareExchange:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicCompSwap(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicCompSwap(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicCompSwap(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(", ");
@@ -2213,9 +2293,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicAdd:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicAdd(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicAdd(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicAdd(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(");\n");
@@ -2223,9 +2313,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicSub:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicAdd(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicAdd(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicAdd(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", -(");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit("));\n");
@@ -2233,9 +2333,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicAnd:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicAnd(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicAnd(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicAnd(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(");\n");
@@ -2243,9 +2353,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicOr:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicOr(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicOr(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicOr(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(");\n");
@@ -2253,9 +2373,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicXor:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicXor(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicXor(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicXor(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(");\n");
@@ -2263,9 +2393,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicMin:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicMin(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicMin(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicMin(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(");\n");
@@ -2273,9 +2413,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicMax:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicMax(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicMax(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicMax(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(");\n");
@@ -2283,9 +2433,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicInc:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicAdd(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicAdd(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicAdd(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitType(inst->getDataType());
         m_writer->emit("(1)");
@@ -2294,9 +2454,19 @@ bool GLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicDec:
     {
+        requireAtomicExtIfNeeded();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomicAdd(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            m_writer->emit("imageAtomicAdd(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomicAdd(");
+            emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        }
         m_writer->emit(", ");
         emitType(inst->getDataType());
         m_writer->emit("(-1)");
diff --git a/source/slang/slang-emit-glsl.h b/source/slang/slang-emit-glsl.h
index 8958c7608..12ab60e46 100644
--- a/source/slang/slang-emit-glsl.h
+++ b/source/slang/slang-emit-glsl.h
@@ -133,6 +133,8 @@ protected:
 
     void _emitSpecialFloatImpl(IRType* type, const char* valueExpr);
 
+    void emitAtomicImageCoord(IRImageSubscript* operand);
+
     Dictionary<IRInst*, HashSet<IRFunc*>> m_referencingEntryPoints;
 
     RefPtr<GLSLExtensionTracker> m_glslExtensionTracker;
diff --git a/source/slang/slang-emit-hlsl.cpp b/source/slang/slang-emit-hlsl.cpp
index b45b4c575..ae87fd6d5 100644
--- a/source/slang/slang-emit-hlsl.cpp
+++ b/source/slang/slang-emit-hlsl.cpp
@@ -498,6 +498,10 @@ void HLSLSourceEmitter::emitEntryPointAttributesImpl(IRFunc* irFunc, IREntryPoin
 
 bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
 {
+    auto diagnoseFloatAtommic = [&]()
+        {
+            getSink()->diagnose(inst, Diagnostics::unsupportedTargetIntrinsic, "floating point atomic operation");
+        };
     switch (inst->getOp())
     {
     case kIROp_AtomicLoad:
@@ -519,7 +523,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedExchange(");
+        m_writer->emit("InterlockedExchange");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
@@ -532,7 +537,10 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedCompareExchange(");
+        m_writer->emit("InterlockedCompareExchange");
+        if (inst->getDataType()->getOp() == kIROp_FloatType)
+            m_writer->emit("FloatBitwise");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
@@ -547,7 +555,12 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedAdd(");
+        if (inst->getDataType()->getOp() == kIROp_FloatType)
+        {
+            diagnoseFloatAtommic();
+        }
+        m_writer->emit("InterlockedAdd");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
@@ -560,7 +573,12 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedAdd(");
+        if (inst->getDataType()->getOp() == kIROp_FloatType)
+        {
+            diagnoseFloatAtommic();
+        }
+        m_writer->emit("InterlockedAdd");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", -(");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
@@ -573,7 +591,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedAnd(");
+        m_writer->emit("InterlockedAnd");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
@@ -586,7 +605,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedOr(");
+        m_writer->emit("InterlockedOr");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
@@ -599,7 +619,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedXor(");
+        m_writer->emit("InterlockedXor");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
@@ -612,7 +633,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedMin(");
+        m_writer->emit("InterlockedMin");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
@@ -625,7 +647,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedMax(");
+        m_writer->emit("InterlockedMax");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
@@ -638,7 +661,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedAdd(");
+        m_writer->emit("InterlockedAdd");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", 1, ");
         m_writer->emit(getName(inst));
@@ -649,7 +673,8 @@ bool HLSLSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     {
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n");
-        m_writer->emit("InterlockedAdd(");
+        m_writer->emit("InterlockedAdd");
+        m_writer->emit("(");
         emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
         m_writer->emit(", -1, ");
         m_writer->emit(getName(inst));
diff --git a/source/slang/slang-emit-metal.cpp b/source/slang/slang-emit-metal.cpp
index 2d5a7d56b..abd4d670a 100644
--- a/source/slang/slang-emit-metal.cpp
+++ b/source/slang/slang-emit-metal.cpp
@@ -260,8 +260,118 @@ void MetalSourceEmitter::emitMemoryOrderOperand(IRInst* inst)
     }
 }
 
+static IRImageSubscript* isTextureAccess(IRInst* inst)
+{
+    return as<IRImageSubscript>(getRootAddr(inst->getOperand(0)));
+}
+
+void MetalSourceEmitter::emitAtomicImageCoord(IRImageSubscript* inst)
+{
+    auto resourceType = as<IRResourceTypeBase>(inst->getImage()->getDataType());
+    if (auto textureType = as<IRTextureType>(resourceType))
+    {
+        if (as<IRVectorType>(textureType->getElementType()))
+        {
+            getSink()->diagnose(inst, Diagnostics::unsupportedTargetIntrinsic, "atomic operation on non-scalar texture");
+        }
+    }
+    bool isArray = getIntVal(resourceType->getIsArrayInst()) != 0;
+    if (isArray)
+    {
+        emitOperand(inst->getCoord(), getInfo(EmitOp::Postfix));
+        if (auto coordType = as<IRVectorType>(inst->getCoord()->getDataType()))
+        {
+            m_writer->emit(".");
+            const char* elements[] = { "x", "y", "z", "w" };
+            for (IRIntegerValue i = 0; i < getIntVal(coordType->getElementCount()) - 1; i++)
+                m_writer->emit(elements[Math::Min(3, (int)i)]);
+            m_writer->emit(", ");
+            emitOperand(inst->getCoord(), getInfo(EmitOp::Postfix));
+            m_writer->emit(".");
+            m_writer->emit(elements[Math::Min(3, (int)getIntVal(coordType->getElementCount()) - 1)]);
+        }
+        else
+        {
+            getSink()->diagnose(inst, Diagnostics::unsupportedTargetIntrinsic, "invalid image coordinate for atomic operation");
+        }
+    }
+    else
+    {
+        emitOperand(inst->getCoord(), getInfo(EmitOp::General));
+    }
+}
+
+void MetalSourceEmitter::emitAtomicDestOperand(IRInst* inst)
+{
+    // If operand is already an atomic type, we can emit it
+    // as is.
+    auto ptrType = as<IRPtrTypeBase>(inst->getDataType());
+    if (ptrType && as<IRAtomicType>(ptrType->getValueType()))
+    {
+        emitOperand(inst, getInfo(EmitOp::General));
+        return;
+    }
+    // Otherwise, we need to emit a cast.
+    m_writer->emit("((atomic_");
+    emitType(inst->getDataType());
+    m_writer->emit(")(");
+    emitOperand(inst, getInfo(EmitOp::General));
+    m_writer->emit("))");
+}
+
+void MetalSourceEmitter::emitAtomicSrcOperand(bool isImage, IRInst* inst)
+{
+    if (!isImage)
+    {
+        emitOperand(inst, getInfo(EmitOp::General));
+        return;
+    }
+    // If we are emitting a source operand for an atomic image operation,
+    // we need to convert it into a 4-vector.
+    m_writer->emit("vec<");
+    emitType(inst->getDataType());
+    m_writer->emit(", 4>(");
+    emitOperand(inst, getInfo(EmitOp::General));
+    m_writer->emit(")");
+}
+
 bool MetalSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
 {
+    auto emitAtomicOp = [&](const char* imageFunc, const char* bufferFunc)
+        {
+            emitInstResultDecl(inst);
+            bool isImageOp = false;
+            if (auto imageSubscript = isTextureAccess(inst))
+            {
+                emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix));
+                m_writer->emit(".");
+                m_writer->emit(imageFunc);
+                m_writer->emit("(");
+                emitAtomicImageCoord(imageSubscript);
+                isImageOp = true;
+            }
+            else
+            {
+                m_writer->emit(bufferFunc);
+                m_writer->emit("(");
+                emitAtomicDestOperand(inst->getOperand(0));
+            }
+            m_writer->emit(", ");
+            emitAtomicSrcOperand(isImageOp, inst->getOperand(1));
+            if (!isImageOp)
+            {
+                m_writer->emit(", ");
+                emitMemoryOrderOperand(inst->getOperand(inst->getOperandCount() - 1));
+            }
+            if (isImageOp)
+                m_writer->emit(").x;\n");
+            else
+                m_writer->emit(");\n");
+        };
+    auto diagnoseFloatAtommic = [&]()
+        {
+            getSink()->diagnose(inst, Diagnostics::unsupportedTargetIntrinsic, "floating point atomic operation");
+        };
     switch (inst->getOp())
     {
     case kIROp_discard:
@@ -287,160 +397,216 @@ bool MetalSourceEmitter::tryEmitInstStmtImpl(IRInst* inst)
     }
     case kIROp_AtomicLoad:
     {
+        if (isFloatingType(inst->getDataType()))
+            diagnoseFloatAtommic();
+
         emitInstResultDecl(inst);
-        m_writer->emit("atomic_load_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(1));
-        m_writer->emit(");\n");
+        bool isImageOp = false;
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix));
+            m_writer->emit(".atomic_load(");
+            emitAtomicImageCoord(imageSubscript);
+            isImageOp = true;
+        }
+        else
+        {
+            m_writer->emit("atomic_load_explicit(");
+            emitAtomicDestOperand(inst->getOperand(0));
+        }
+        if (!isImageOp)
+        {
+            m_writer->emit(", ");
+            emitMemoryOrderOperand(inst->getOperand(1));
+        }
+        if (isImageOp)
+            m_writer->emit(").x;\n");
+        else
+            m_writer->emit(");\n");
         return true;
     }
     case kIROp_AtomicStore:
     {
-        m_writer->emit("atomic_store_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
+        bool isImageOp = false;
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix));
+            m_writer->emit(".atomic_store(");
+            emitAtomicImageCoord(imageSubscript);
+            isImageOp = true;
+        }
+        else
+        {
+            m_writer->emit("atomic_store_explicit(");
+            emitAtomicDestOperand(inst->getOperand(0));
+        }
         m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(2));
+        emitAtomicSrcOperand(isImageOp, inst->getOperand(1));
+        if (!isImageOp)
+        {
+            m_writer->emit(", ");
+            emitMemoryOrderOperand(inst->getOperand(2));
+        }
         m_writer->emit(");\n");
         return true;
     }
     case kIROp_AtomicExchange:
     {
-        emitInstResultDecl(inst);
-        m_writer->emit("atomic_exchange_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(2));
-        m_writer->emit(");\n");
+        if (isFloatingType(inst->getDataType()))
+            diagnoseFloatAtommic();
+
+        emitAtomicOp("atomic_exchange", "atomic_exchange_explicit");
         return true;
     }
     case kIROp_AtomicCompareExchange:
     {
+        if (isFloatingType(inst->getDataType()))
+            diagnoseFloatAtommic();
+
+        bool isImageOp = false;
+        auto imageSubscript = isTextureAccess(inst);
+        isImageOp = (imageSubscript != nullptr);
+
         emitType(inst->getDataType(), getName(inst));
         m_writer->emit(";\n{\n");
-        emitType(inst->getDataType(), "_metal_cas_comparand");
+        if (isImageOp)
+            m_writer->emit("vec<");
+        emitType(inst->getDataType());
+        if (isImageOp)
+            m_writer->emit(", 4>");
+        m_writer->emit(" _metal_cas_comparand");
         m_writer->emit(" = ");
         emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
         m_writer->emit(";\n");
-
-        m_writer->emit(getName(inst));
-        m_writer->emit(" = atomic_compare_exchange_weak_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
+        if (imageSubscript)
+        {
+            emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix));
+            m_writer->emit(".atomic_compare_exchange_weak(");
+            emitAtomicImageCoord(imageSubscript);
+        }
+        else
+        {
+            m_writer->emit("atomic_compare_exchange_weak_explicit(");
+            emitAtomicDestOperand(inst->getOperand(0));
+        }
         m_writer->emit(", &_metal_cas_comparand, ");
-        emitOperand(inst->getOperand(2), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(3));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(4));
-        m_writer->emit(");\n}\n");
+        emitAtomicSrcOperand(isImageOp, inst->getOperand(2));
+        if (!isImageOp)
+        {
+            m_writer->emit(", ");
+            emitMemoryOrderOperand(inst->getOperand(3));
+            m_writer->emit(", ");
+            emitMemoryOrderOperand(inst->getOperand(4));
+        }
+        m_writer->emit(");\n");
+        m_writer->emit(getName(inst));
+        m_writer->emit(" = _metal_cas_comparand");
+        if (isImageOp)
+            m_writer->emit(".x");
+        m_writer->emit(";\n}\n");
         return true;
     }
     case kIROp_AtomicAdd:
     {
-        emitInstResultDecl(inst);
-        m_writer->emit("atomic_fetch_add_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(2));
-        m_writer->emit(");\n");
+        if (isFloatingType(inst->getDataType()))
+            diagnoseFloatAtommic();
+
+        emitAtomicOp("atomic_fetch_add", "atomic_fetch_add_explicit");
         return true;
     }
     case kIROp_AtomicSub:
     {
-        emitInstResultDecl(inst);
-        m_writer->emit("atomic_fetch_sub_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(2));
-        m_writer->emit(");\n");
+        if (isFloatingType(inst->getDataType()))
+            diagnoseFloatAtommic();
+
+        emitAtomicOp("atomic_fetch_sub", "atomic_fetch_sub_explicit");
         return true;
     }
     case kIROp_AtomicAnd:
     {
-        emitInstResultDecl(inst);
-        m_writer->emit("atomic_fetch_and_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(2));
-        m_writer->emit(");\n");
+        emitAtomicOp("atomic_fetch_and", "atomic_fetch_and_explicit");
         return true;
     }
     case kIROp_AtomicOr:
     {
-        emitInstResultDecl(inst);
-        m_writer->emit("atomic_fetch_or_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(2));
-        m_writer->emit(");\n");
+        emitAtomicOp("atomic_fetch_or", "atomic_fetch_or_explicit");
         return true;
     }
     case kIROp_AtomicXor:
     {
-        emitInstResultDecl(inst);
-        m_writer->emit("atomic_fetch_xor_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(2));
-        m_writer->emit(");\n");
+        emitAtomicOp("atomic_fetch_xor", "atomic_fetch_xor_explicit");
         return true;
     }
     case kIROp_AtomicMin:
     {
-        emitInstResultDecl(inst);
-        m_writer->emit("atomic_fetch_min_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(2));
-        m_writer->emit(");\n");
+        if (isFloatingType(inst->getDataType()))
+            diagnoseFloatAtommic();
+
+        emitAtomicOp("atomic_fetch_min", "atomic_fetch_min_explicit");
         return true;
     }
     case kIROp_AtomicMax:
     {
-        emitInstResultDecl(inst);
-        m_writer->emit("atomic_fetch_max_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitOperand(inst->getOperand(1), getInfo(EmitOp::General));
-        m_writer->emit(", ");
-        emitMemoryOrderOperand(inst->getOperand(2));
-        m_writer->emit(");\n");
+        if (isFloatingType(inst->getDataType()))
+            diagnoseFloatAtommic();
+
+        emitAtomicOp("atomic_fetch_max", "atomic_fetch_max_explicit");
         return true;
     }
     case kIROp_AtomicInc:
     {
         emitInstResultDecl(inst);
-        m_writer->emit("atomic_fetch_add_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", 1, ");
-        emitMemoryOrderOperand(inst->getOperand(1));
-        m_writer->emit(");\n");
+        bool isImageOp = false;
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix));
+            m_writer->emit(".atomic_fetch_add(");
+            emitAtomicImageCoord(imageSubscript);
+            isImageOp = true;
+        }
+        else
+        {
+            m_writer->emit("atomic_fetch_add_explicit(");
+            emitAtomicDestOperand(inst->getOperand(0));
+        }
+        m_writer->emit(", 1");
+        if (!isImageOp)
+        {
+            m_writer->emit(", ");
+            emitMemoryOrderOperand(inst->getOperand(1));
+        }
+        if (isImageOp)
+            m_writer->emit(").x;\n");
+        else
+            m_writer->emit(");\n");
         return true;
     }
     case kIROp_AtomicDec:
     {
         emitInstResultDecl(inst);
-        m_writer->emit("atomic_fetch_sub_explicit(");
-        emitOperand(inst->getOperand(0), getInfo(EmitOp::General));
-        m_writer->emit(", 1, ");
-        emitMemoryOrderOperand(inst->getOperand(1));
-        m_writer->emit(");\n");
+        bool isImageOp = false;
+        if (auto imageSubscript = isTextureAccess(inst))
+        {
+            emitOperand(imageSubscript->getImage(), getInfo(EmitOp::Postfix));
+            m_writer->emit(".atomic_fetch_sub(");
+            emitAtomicImageCoord(imageSubscript);
+            isImageOp = true;
+        }
+        else
+        {
+            m_writer->emit("atomic_fetch_sub_explicit(");
+            emitAtomicDestOperand(inst->getOperand(0));
+        }
+        m_writer->emit(", 1");
+        if (!isImageOp)
+        {
+            m_writer->emit(", ");
+            emitMemoryOrderOperand(inst->getOperand(1));
+        }
+        if (isImageOp)
+            m_writer->emit(").x;\n");
+        else
+            m_writer->emit(");\n");
         return true;
     }
     }
diff --git a/source/slang/slang-emit-metal.h b/source/slang/slang-emit-metal.h
index 8e33eddef..e0fe1f1c8 100644
--- a/source/slang/slang-emit-metal.h
+++ b/source/slang/slang-emit-metal.h
@@ -79,6 +79,11 @@ protected:
     void _emitStageAccessSemantic(IRStageAccessDecoration* decoration, const char* name);
     bool _emitUserSemantic(UnownedStringSlice semanticName, IRIntegerValue semanticIndex);
     bool maybeEmitSystemSemantic(IRInst* inst);
+
+    void emitAtomicImageCoord(IRImageSubscript* subscript);
+    void emitAtomicDestOperand(IRInst* operand);
+    void emitAtomicSrcOperand(bool isImage, IRInst* operand);
+    void emitAtomicSemanticOperand(IRInst* inst);
 };
 
 }
diff --git a/source/slang/slang-emit-spirv.cpp b/source/slang/slang-emit-spirv.cpp
index 0f123b8fd..62819e6d5 100644
--- a/source/slang/slang-emit-spirv.cpp
+++ b/source/slang/slang-emit-spirv.cpp
@@ -2929,11 +2929,11 @@ struct SPIRVEmitContext
 
     void ensureAtomicCapability(IRInst* atomicInst, SpvOp op)
     {
+        auto typeOp = atomicInst->getDataType()->getOp();
         switch (op)
         {
         case SpvOpAtomicFAddEXT:
         {
-            auto typeOp = getVectorElementType(atomicInst->getDataType())->getOp();
             switch (typeOp)
             {
             case kIROp_FloatType:
@@ -2948,13 +2948,19 @@ struct SPIRVEmitContext
                 ensureExtensionDeclaration(toSlice("SPV_EXT_shader_atomic_float16_add"));
                 requireSPIRVCapability(SpvCapabilityAtomicFloat16AddEXT);
                 break;
+            case kIROp_VectorType:
+                if (as<IRVectorType>(atomicInst->getDataType())->getElementType()->getOp() == kIROp_HalfType)
+                {
+                    ensureExtensionDeclaration(toSlice("SPV_NV_shader_atomic_fp16_vector")); // SPIR-V extension name, not the Vulkan (VK_*) one
+                    requireSPIRVCapability(SpvCapabilityAtomicFloat16VectorNV);
+                }
+                break;
             }
         }
         break;
         case SpvOpAtomicFMinEXT:
         case SpvOpAtomicFMaxEXT:
         {
-            auto typeOp = getVectorElementType(atomicInst->getDataType())->getOp();
             switch (typeOp)
             {
             case kIROp_FloatType:
@@ -2969,10 +2975,24 @@ struct SPIRVEmitContext
                 ensureExtensionDeclaration(toSlice("SPV_EXT_shader_atomic_float_min_max"));
                 requireSPIRVCapability(SpvCapabilityAtomicFloat16MinMaxEXT);
                 break;
+            case kIROp_VectorType:
+                if (as<IRVectorType>(atomicInst->getDataType())->getElementType()->getOp() == kIROp_HalfType)
+                {
+                    ensureExtensionDeclaration(toSlice("SPV_NV_shader_atomic_fp16_vector")); // SPIR-V extension name, not the Vulkan (VK_*) one
+                    requireSPIRVCapability(SpvCapabilityAtomicFloat16VectorNV);
+                }
+                break;
             }
         }
         break;
         }
+        switch (typeOp)
+        {
+        case kIROp_UInt64Type:
+        case kIROp_Int64Type:
+            requireSPIRVCapability(SpvCapabilityInt64Atomics);
+            break;
+        }
     }
 
     // The instructions that appear inside the basic blocks of
@@ -3321,6 +3341,7 @@ struct SPIRVEmitContext
                 const auto memoryScope = emitIntConstant(IRIntegerValue{SpvScopeDevice}, builder.getUIntType());
                 const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(1));
                 result = emitOpAtomicIIncrement(parent, inst, inst->getFullType(), inst->getOperand(0), memoryScope, memorySemantics);
+                ensureAtomicCapability(inst, SpvOpAtomicIIncrement);
             }
             break;
         case kIROp_AtomicDec:
@@ -3329,6 +3350,7 @@ struct SPIRVEmitContext
                 const auto memoryScope = emitIntConstant(IRIntegerValue{ SpvScopeDevice }, builder.getUIntType());
                 const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(1));
                 result = emitOpAtomicIDecrement(parent, inst, inst->getFullType(), inst->getOperand(0), memoryScope, memorySemantics);
+                ensureAtomicCapability(inst, SpvOpAtomicIDecrement);
             }
             break;
         case kIROp_AtomicLoad:
@@ -3337,6 +3359,7 @@ struct SPIRVEmitContext
                 const auto memoryScope = emitIntConstant(IRIntegerValue{ SpvScopeDevice }, builder.getUIntType());
                 const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(1));
                 result = emitOpAtomicLoad(parent, inst, inst->getFullType(), inst->getOperand(0), memoryScope, memorySemantics);
+                ensureAtomicCapability(inst, SpvOpAtomicLoad);
             }
             break;
         case kIROp_AtomicStore:
@@ -3345,6 +3368,7 @@ struct SPIRVEmitContext
                 const auto memoryScope = emitIntConstant(IRIntegerValue{ SpvScopeDevice }, builder.getUIntType());
                 const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(2));
                 result = emitOpAtomicStore(parent, inst, inst->getOperand(0), memoryScope, memorySemantics, inst->getOperand(1));
+                ensureAtomicCapability(inst, SpvOpAtomicStore);
             }
             break;
         case kIROp_AtomicExchange:
@@ -3353,6 +3377,7 @@ struct SPIRVEmitContext
                 const auto memoryScope = emitIntConstant(IRIntegerValue{ SpvScopeDevice }, builder.getUIntType());
                 const auto memorySemantics = emitMemorySemanticMask(inst->getOperand(2));
                 result = emitOpAtomicExchange(parent, inst, inst->getFullType(), inst->getOperand(0), memoryScope, memorySemantics, inst->getOperand(1));
+                ensureAtomicCapability(inst, SpvOpAtomicExchange);
             }
             break;
         case kIROp_AtomicCompareExchange:
@@ -3365,6 +3390,7 @@ struct SPIRVEmitContext
                     parent, inst, inst->getFullType(), inst->getOperand(0),
                     memoryScope, memorySemanticsEqual, memorySemanticsUnequal,
                     inst->getOperand(2), inst->getOperand(1));
+                ensureAtomicCapability(inst, SpvOpAtomicCompareExchange);
             }
             break;
         case kIROp_AtomicAdd:
diff --git a/source/slang/slang-emit.cpp b/source/slang/slang-emit.cpp
index c9319a13b..2206d29cf 100644
--- a/source/slang/slang-emit.cpp
+++ b/source/slang/slang-emit.cpp
@@ -53,9 +53,7 @@
 #include "slang-ir-lower-l-value-cast.h"
 #include "slang-ir-lower-reinterpret.h"
 #include "slang-ir-loop-unroll.h"
-#include "slang-ir-legalize-extract-from-texture-access.h"
 #include "slang-ir-legalize-image-subscript.h"
-#include "slang-ir-legalize-is-texture-access.h"
 #include "slang-ir-legalize-vector-types.h"
 #include "slang-ir-metadata.h"
 #include "slang-ir-optix-entry-point-uniforms.h"
@@ -1058,9 +1056,6 @@ Result linkAndOptimizeIR(
 
     legalizeVectorTypes(irModule, sink);
 
-    // Legalize `__isTextureAccess` and related.
-    legalizeIsTextureAccess(irModule, sink);
-
     // Once specialization and type legalization have been performed,
     // we should perform some of our basic optimization steps again,
     // to see if we can clean up any temporaries created by legalization.
@@ -1335,8 +1330,6 @@ Result linkAndOptimizeIR(
     // Create aliases for all dynamic resource parameters.
     if(requiredLoweringPassSet.dynamicResource && isKhronosTarget(targetRequest))
         legalizeDynamicResourcesForGLSL(codeGenContext, irModule);
-    
-    legalizeExtractFromTextureAccess(irModule);
 
     // Legalize `ImageSubscript` loads.
     switch (target)
diff --git a/source/slang/slang-intrinsic-expand.cpp b/source/slang/slang-intrinsic-expand.cpp
index 7cde70777..aabc193dd 100644
--- a/source/slang/slang-intrinsic-expand.cpp
+++ b/source/slang/slang-intrinsic-expand.cpp
@@ -653,112 +653,6 @@ const char* IntrinsicExpandContext::_emitSpecial(const char* cursor)
             }
         }
         break;
-
-        case 'a':
-        {
-            // We have an operation that needs to lower to either
-            // `atomic*` or `imageAtomic*` for GLSL, depending on
-            // whether its first operand is a subscript into an
-            // array. This `$a` is the first `a` in `atomic`,
-            // so we will replace it accordingly.
-            //
-            // TODO: This distinction should be made earlier,
-            // with the front-end picking the right overload
-            // based on the "address space" of the argument.
-
-            Index argIndex = 0;
-            SLANG_RELEASE_ASSERT(m_argCount > argIndex);
-
-            auto arg = m_args[argIndex].get();
-            if (arg->getOp() == kIROp_ImageSubscript)
-            {
-                m_writer->emit("imageA");
-            }
-            else
-            {
-                m_writer->emit("a");
-            }
-        }
-        break;
-
-        case 'A':
-        {
-            // We have an operand that represents the destination
-            // of an atomic operation in GLSL, and it should
-            // be lowered based on whether it is an ordinary l-value,
-            // or an image subscript. In the image subscript case
-            // this operand will turn into multiple arguments
-            // to the `imageAtomic*` function.
-            //
-
-            Index argIndex = 0;
-            SLANG_RELEASE_ASSERT(m_argCount > argIndex);
-
-            auto arg = m_args[argIndex].get();
-            if (arg->getOp() == kIROp_ImageSubscript)
-            {
-                if (m_emitter->getSourceLanguage() == SourceLanguage::GLSL)
-                {
-                    // TODO: we don't handle the multisample
-                    // case correctly here, where the last
-                    // component of the image coordinate needs
-                    // to be broken out into its own argument.
-                    //
-                    m_writer->emit("(");
-                    m_emitter->emitOperand(arg->getOperand(0), getInfo(EmitOp::General));
-                    m_writer->emit("), ");
-
-                    // The coordinate argument will have been computed
-                    // as a `vector<uint, N>` because that is how the
-                    // HLSL image subscript operations are defined.
-                    // In contrast, the GLSL `imageAtomic*` operations
-                    // expect `vector<int, N>` coordinates, so we
-                    // will hackily insert the conversion here as
-                    // part of the intrinsic op.
-                    //
-                    auto coords = arg->getOperand(1);
-                    auto coordsType = coords->getDataType();
-
-                    auto coordsVecType = as<IRVectorType>(coordsType);
-                    IRIntegerValue elementCount = 1;
-                    if (coordsVecType)
-                    {
-                        coordsType = coordsVecType->getElementType();
-                        elementCount = getIntVal(coordsVecType->getElementCount());
-                    }
-
-                    SLANG_ASSERT(coordsType->getOp() == kIROp_UIntType);
-
-                    if (elementCount > 1)
-                    {
-                        m_writer->emit("ivec");
-                        m_writer->emit(elementCount);
-                    }
-                    else
-                    {
-                        m_writer->emit("int");
-                    }
-
-                    m_writer->emit("(");
-                    m_emitter->emitOperand(arg->getOperand(1), getInfo(EmitOp::General));
-                    m_writer->emit(")");
-                }
-                else
-                {
-                    m_writer->emit("(");
-                    m_emitter->emitOperand(arg, getInfo(EmitOp::General));
-                    m_writer->emit(")");
-                }
-            }
-            else
-            {
-                m_writer->emit("(");
-                m_emitter->emitOperand(arg, getInfo(EmitOp::General));
-                m_writer->emit(")");
-            }
-        }
-        break;
-
         case 'P':
             // Type-based prefix as used for CUDA and C++ targets
         {
diff --git a/source/slang/slang-ir-legalize-extract-from-texture-access.cpp b/source/slang/slang-ir-legalize-extract-from-texture-access.cpp
deleted file mode 100644
index de1e244a8..000000000
--- a/source/slang/slang-ir-legalize-extract-from-texture-access.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-#include "slang-ir-legalize-extract-from-texture-access.h"
-
-#include "slang-ir.h"
-#include "slang-ir-insts.h"
-#include "slang-ir-util.h"
-#include "slang-ir-clone.h"
-#include "slang-ir-specialize-address-space.h"
-#include "slang-parameter-binding.h"
-#include "slang-ir-legalize-image-subscript.h"
-#include "slang-ir-legalize-varying-params.h"
-#include "slang-ir-simplify-cfg.h"
-
-namespace Slang
-{
-    void legalizeExtractTextureFromTextureAccess(IRBuilder& builder, IRInst* inst)
-    {
-        SLANG_ASSERT(inst);
-
-        builder.setInsertBefore(inst);
-        IRImageSubscript* imageSubscript = as<IRImageSubscript>(getRootAddr(inst->getOperand(0)));
-        SLANG_ASSERT(imageSubscript);
-        SLANG_ASSERT(imageSubscript->getImage());
-        inst->replaceUsesWith(imageSubscript->getImage());
-        inst->removeAndDeallocate();
-        // Ensure we are done processing the imageSubscript before we remove it
-        if (!imageSubscript->hasUses())
-            imageSubscript->removeAndDeallocate();
-    }
-
-    void legalizeExtractArrayCoordFromTextureAccess(IRBuilder& builder, IRInst* inst)
-    {
-        SLANG_ASSERT(inst);
-
-        builder.setInsertBefore(inst);
-        IRImageSubscript* imageSubscript = as<IRImageSubscript>(getRootAddr(inst->getOperand(0)));
-        SLANG_ASSERT(imageSubscript);
-        SLANG_ASSERT(imageSubscript->getImage());
-        
-        auto image = as<IRTextureType>(imageSubscript->getImage()->getDataType());
-        IRInst* coord = imageSubscript->getCoord();
-        if(image->isArray())
-        {
-            // Extract final element which is 'ArrayCoord'
-            IRVectorType* coordType = as<IRVectorType>(imageSubscript->getCoord()->getDataType());
-            SLANG_ASSERT(coordType);
-            auto coordSize = getIRVectorElementSize(coordType);
-
-            IRType* newArrayCoordType = coordType->getElementType();
-            auto arrayCoordLocation = coordSize - 1;
-            List<UInt> swizzleIndicies = { (UInt)arrayCoordLocation };
-            
-            coord = builder.emitSwizzle(newArrayCoordType, coord, 1, swizzleIndicies.getBuffer());
-        }
-        else
-            coord = builder.getIntValue(builder.getUIntType(), 0);
-
-
-        inst->replaceUsesWith(coord);
-        inst->removeAndDeallocate();
-        // Ensure we are done processing the imageSubscript completly before we remove it
-        if (!imageSubscript->hasUses())
-            imageSubscript->removeAndDeallocate();
-    }
-
-    void legalizeExtractCoordFromTextureAccess(IRBuilder& builder, IRInst* inst)
-    {
-        SLANG_ASSERT(inst);
-
-        builder.setInsertBefore(inst);
-        IRImageSubscript* imageSubscript = as<IRImageSubscript>(getRootAddr(inst->getOperand(0)));
-        SLANG_ASSERT(imageSubscript);
-        SLANG_ASSERT(imageSubscript->getImage());
-        
-        auto image = as<IRTextureType>(imageSubscript->getImage()->getDataType());
-        IRInst* coord = imageSubscript->getCoord();
-        if(image->isArray())
-        {
-            // Extract all but final element which is 'ArrayCoord'
-            IRVectorType* coordType = as<IRVectorType>(imageSubscript->getCoord()->getDataType());
-            auto coordSize = getIRVectorElementSize(coordType);
-            SLANG_ASSERT(coordType);
-            
-            IRType* newCoordType = nullptr;
-            auto newCoordSize = coordSize - 1;
-            if(newCoordSize != 1)
-                newCoordType = builder.getVectorType(coordType->getElementType(), newCoordSize);
-            else
-                newCoordType = coordType->getElementType();
-            List<UInt> swizzleIndicies = {1, 2, 3, 4};
-            
-            coord = builder.emitSwizzle(newCoordType, coord, newCoordSize, swizzleIndicies.getBuffer());
-        }
-
-        inst->replaceUsesWith(coord);
-        inst->removeAndDeallocate();
-        // Ensure we are done processing the imageSubscript completly before we remove it
-        if (!imageSubscript->hasUses())
-            imageSubscript->removeAndDeallocate();
-    }
-
-    void legalizeExtractFromTextureAccess(IRModule* module)
-    {
-        IRBuilder builder(module);
-        for (auto globalInst : module->getModuleInst()->getChildren())
-        {
-            auto func = as<IRFunc>(globalInst);
-            if (!func)
-                continue;
-            for (auto block : func->getBlocks())
-            {
-                auto inst = block->getFirstInst();
-                IRInst* next;
-                for ( ; inst; inst = next)
-                {
-                    next = inst->getNextInst();
-                    switch (inst->getOp())
-                    {
-                    case kIROp_ExtractArrayCoordFromTextureAccess:
-                        if (as<IRImageSubscript>(getRootAddr(inst->getOperand(0))))
-                            legalizeExtractArrayCoordFromTextureAccess(builder, inst);
-                        continue;
-                    case kIROp_ExtractCoordFromTextureAccess:
-                        if (as<IRImageSubscript>(getRootAddr(inst->getOperand(0))))
-                            legalizeExtractCoordFromTextureAccess(builder, inst);
-                        continue;
-                    case kIROp_ExtractTextureFromTextureAccess:
-                        if (as<IRImageSubscript>(getRootAddr(inst->getOperand(0))))
-                            legalizeExtractTextureFromTextureAccess(builder, inst);
-                        continue;
-                    }
-                }   
-            }
-        }
-    }
-}
-
diff --git a/source/slang/slang-ir-legalize-extract-from-texture-access.h b/source/slang/slang-ir-legalize-extract-from-texture-access.h
deleted file mode 100644
index 016c86def..000000000
--- a/source/slang/slang-ir-legalize-extract-from-texture-access.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#pragma once
-
-#include "slang-ir.h"
-#include "slang-compiler.h"
-
-namespace Slang
-{
-    class DiagnosticSink;
-
-    void legalizeExtractFromTextureAccess(IRModule* module);
-}
diff --git a/source/slang/slang-ir-legalize-is-texture-access.cpp b/source/slang/slang-ir-legalize-is-texture-access.cpp
deleted file mode 100644
index b9a0a7772..000000000
--- a/source/slang/slang-ir-legalize-is-texture-access.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-#include "slang-ir-legalize-is-texture-access.h"
-
-#include "slang-ir.h"
-#include "slang-ir-insts.h"
-#include "slang-ir-util.h"
-#include "slang-ir-clone.h"
-#include "slang-ir-specialize-address-space.h"
-#include "slang-parameter-binding.h"
-#include "slang-ir-legalize-image-subscript.h"
-#include "slang-ir-legalize-varying-params.h"
-#include "slang-ir-sccp.h"
-
-namespace Slang
-{
-    IRImageSubscript* getTextureAccess(IRInst* inst)
-    {
-        return as<IRImageSubscript>(getRootAddr(inst->getOperand(0)));
-    }
-
-    void legalizeIsTextureAccess(IRModule* module, DiagnosticSink* sink)
-    {
-        HashSet<IRFunc*> functionsToSCCP;
-        IRBuilder builder(module);
-        for (auto globalInst : module->getModuleInst()->getChildren())
-        {
-            auto func = as<IRFunc>(globalInst);
-            if (!func)
-                continue;
-            for (auto block : func->getBlocks())
-            {
-                auto inst = block->getFirstInst();
-                IRInst* next;
-                for ( ; inst; inst = next)
-                {
-                    next = inst->getNextInst();
-                    switch (inst->getOp())
-                    {
-                    case kIROp_IsTextureAccess:
-                        if (getTextureAccess(inst))
-                            inst->replaceUsesWith(builder.getBoolValue(true));
-                        else
-                            inst->replaceUsesWith(builder.getBoolValue(false));
-                        inst->removeAndDeallocate();
-                        functionsToSCCP.add(func);
-                        continue;
-                    case kIROp_IsTextureArrayAccess:
-                    {
-                        auto textureAccess = getTextureAccess(inst);
-                        if (textureAccess && as<IRTextureType>(textureAccess->getImage()->getDataType())->isArray())
-                            inst->replaceUsesWith(builder.getBoolValue(true));
-                        else
-                            inst->replaceUsesWith(builder.getBoolValue(false));
-                        inst->removeAndDeallocate();
-                        functionsToSCCP.add(func);
-                        continue;
-                    }
-                    case kIROp_IsTextureScalarAccess:
-                    {
-                        auto textureAccess = getTextureAccess(inst);
-                        if (textureAccess && !as<IRVectorType>(as<IRTextureType>(textureAccess->getImage()->getDataType())->getElementType()))
-                            inst->replaceUsesWith(builder.getBoolValue(true));
-                        else
-                            inst->replaceUsesWith(builder.getBoolValue(false));
-                        inst->removeAndDeallocate();
-                        functionsToSCCP.add(func);
-                        continue;
-                    }
-                    }
-                }   
-            }
-        }
-        // Requires a SCCP to ensure Slang does not evaluate 'IRTextureType' code path
-        // and unresolved 'isTextureAccess' operations for when 'inst' is not a
-        // 'IRTextureType'/`TextureAccessor`
-        for (auto func : functionsToSCCP)
-            applySparseConditionalConstantPropagation(func, sink);
-    }
-}
-
diff --git a/source/slang/slang-ir-legalize-is-texture-access.h b/source/slang/slang-ir-legalize-is-texture-access.h
deleted file mode 100644
index 9b9e1cca0..000000000
--- a/source/slang/slang-ir-legalize-is-texture-access.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#pragma once
-
-#include "slang-ir.h"
-#include "slang-compiler.h"
-
-namespace Slang
-{
-    class DiagnosticSink;
-
-    void legalizeIsTextureAccess(IRModule* module, DiagnosticSink* sink);
-}
diff --git a/source/slang/slang-ir-use-uninitialized-values.cpp b/source/slang/slang-ir-use-uninitialized-values.cpp
index 98fd9841a..fea55de8d 100644
--- a/source/slang/slang-ir-use-uninitialized-values.cpp
+++ b/source/slang/slang-ir-use-uninitialized-values.cpp
@@ -315,8 +315,11 @@ namespace Slang
         case kIROp_Unmodified:
             return Store;
 
-        // ... and the rest will load/use them
         default:
+            // Default case is that if the instruction is a pointer, it
+            // is considered a store, otherwise a load.
+            if (as<IRPtrTypeBase>(user->getDataType()))
+                return Store;
             return Load;
         }
     }
diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp
index d0dcfd4fb..e0998779a 100644
--- a/source/slang/slang-ir.cpp
+++ b/source/slang/slang-ir.cpp
@@ -5092,7 +5092,7 @@ namespace Slang
         auto inst = createInst<IRAtomicStore>(
             this,
             kIROp_AtomicStore,
-            nullptr,
+            getVoidType(),
             dstPtr,
             srcVal,
             memoryOrder);
diff --git a/tests/bugs/gh-3997.slang b/tests/bugs/gh-3997.slang
index 8c75da426..d42e65e39 100644
--- a/tests/bugs/gh-3997.slang
+++ b/tests/bugs/gh-3997.slang
@@ -10,7 +10,7 @@ float atomicAdd(__ref float value, float amount)
         __requirePrelude("#include <atomic>");
         __intrinsic_asm "std::atomic_ref(*$0).fetch_add($1)";
     case spirv:
-        return __atomicAdd(value, amount);
+        return __atomic_add(value, amount);
     }
 }
 
diff --git a/tests/compute/atomics-invalid-dest-type.slang b/tests/compute/atomics-invalid-dest-type.slang
index 864debaee..5ae03a5c7 100644
--- a/tests/compute/atomics-invalid-dest-type.slang
+++ b/tests/compute/atomics-invalid-dest-type.slang
@@ -1,11 +1,8 @@
 // atomics-buffer.slang
 
-//TEST:SIMPLE(filecheck=CHECK): -target spirv -stage compute -entry computeMain
-//TEST:SIMPLE(filecheck=CHECK): -target hlsl -stage compute -entry computeMain
-//TEST:SIMPLE(filecheck=CHECK): -target glsl -stage compute -entry computeMain
 //TEST:SIMPLE(filecheck=CHECK): -target metal -stage compute -entry computeMain
 
-//CHECK: Atomic must be applied to a scalar texture or non-texture
+//CHECK: atomic operation on non-scalar texture
 
 RWBuffer<uint2> outputBuffer;
 
diff --git a/tests/compute/nonuniformres-atomic.slang b/tests/compute/nonuniformres-atomic.slang
index 95ae502dc..10dd30cb0 100644
--- a/tests/compute/nonuniformres-atomic.slang
+++ b/tests/compute/nonuniformres-atomic.slang
@@ -9,7 +9,7 @@ RWTexture2D<uint> texArray[2];
 void main( uint2 dispatchThreadID : SV_DispatchThreadID, uint2 groupThreadID : SV_GroupThreadID )
 {
 
-    // CHECK0: imageAtomicAdd((texArray_{{.*}}[nonuniformEXT({{.*}})]
+    // CHECK0: {{.*}}imageAtomicAdd(texArray_{{.*}}[nonuniformEXT({{.*}})]
 
     // CHECK1: InterlockedAdd(texArray_{{.*}}[NonUniformResourceIndex({{.*}})]
     
diff --git a/tests/hlsl-intrinsic/texture/float-atomics.slang b/tests/hlsl-intrinsic/texture/float-atomics.slang
index 02cb5570c..913380416 100644
--- a/tests/hlsl-intrinsic/texture/float-atomics.slang
+++ b/tests/hlsl-intrinsic/texture/float-atomics.slang
@@ -24,6 +24,6 @@ void computeMain(uint3 tid : SV_DispatchThreadID)
     AllMemoryBarrier();
 
     // CHECK: 4.0
-    outputBuffer[0] = t[uint2(1, 0)];
+    outputBuffer[0] = t[uint2(1, 0)] + originalValue;
 
 }
diff --git a/tests/metal/atomic-byteaddressbuffer.slang b/tests/metal/atomic-byteaddressbuffer.slang
new file mode 100644
index 000000000..677f80dbf
--- /dev/null
+++ b/tests/metal/atomic-byteaddressbuffer.slang
@@ -0,0 +1,57 @@
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -compute -shaderobj -output-using-type
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-cuda -compute -shaderobj -output-using-type
+//TEST:SIMPLE(filecheck=LIB):-target metallib -entry computeMain -stage compute -DMETAL
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0]):name=uintBuffer
+RWByteAddressBuffer uintBuffer;
+
+//TEST_INPUT: ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ], stride=4):out,name outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+[numthreads(1,1,1)]
+void computeMain()
+{
+    uintBuffer.InterlockedAdd(0, 1);
+    int oldValue;
+    //LIB: call {{.*}}.atomic.global.add.u.i32
+    uintBuffer.InterlockedAdd(0, 1, oldValue);
+    // CHK: 1
+    outputBuffer[0] = oldValue;
+
+    uintBuffer.InterlockedAdd(0, 1, oldValue);
+    // CHK: 2
+    outputBuffer[1] = (int)oldValue;
+
+    uintBuffer.InterlockedCompareExchange(0, 3, 4, oldValue);
+    // CHK: 3
+    outputBuffer[2] = (int)oldValue;
+
+    uintBuffer.InterlockedOr(0, 3, oldValue);
+    // CHK: 4
+    outputBuffer[3] = oldValue; // 4
+
+    uintBuffer.InterlockedExchange(0, 4, oldValue);
+    // CHK: 7
+    outputBuffer[4] = oldValue; // 7
+
+    uintBuffer.InterlockedMin(0, 3, oldValue);
+    // CHK: 4
+    outputBuffer[5] = oldValue; // 4
+
+    uintBuffer.InterlockedMax(0, 4, oldValue);
+    // CHK: 3
+    outputBuffer[6] = oldValue; // 3
+
+    uintBuffer.InterlockedAnd(0, 7, oldValue);
+    // CHK: 4
+    outputBuffer[7] = oldValue; // 4
+
+    uintBuffer.InterlockedXor(0, 7, oldValue);
+    // CHK: 4
+    outputBuffer[8] = oldValue; // 4
+
+    // CHK: 3
+    outputBuffer[9] = uintBuffer.Load(0);
+
+}
+\ No newline at end of file
diff --git a/tests/metal/atomic-intrinsics.slang b/tests/metal/atomic-intrinsics.slang
index 5d47db913..afa0e5365 100644
--- a/tests/metal/atomic-intrinsics.slang
+++ b/tests/metal/atomic-intrinsics.slang
@@ -1,8 +1,7 @@
 //TEST:SIMPLE(filecheck=MTL):-target metal -entry computeMain -stage compute -DMETAL
 //TEST:SIMPLE(filecheck=LIB):-target metallib -entry computeMain -stage compute -DMETAL
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -output-using-type
-//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -emit-spirv-directly -compute -shaderobj -output-using-type
-//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -emit-spirv-via-glsl -compute -shaderobj -output-using-type
+//TEST(compute, vulkan):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-vk -compute -shaderobj -output-using-type
 
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj -output-using-type
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj -output-using-type
@@ -36,22 +35,22 @@ void computeMain(uint groupIndex : SV_GroupIndex)
     float val = 0.0f;
 
     // InterlockedAdd
-    //MTL: atomic_uint threadgroup* {{.*}}shareMemUI
+    //MTL: atomic_uint threadgroup*{{.*}}shareMemUI
     //LIB: call {{.*}}.atomic.local.add.u.i32
     InterlockedAdd(shareMemUI[idx], uint(1));
     val += shareMemUI[idx];
 
-    //MTL: atomic_int threadgroup* {{.*}}shareMemI
+    //MTL: atomic_int threadgroup*{{.*}}shareMemI
     //LIB: call {{.*}}.atomic.local.add.s.i32
     InterlockedAdd(shareMemI[idx],  2);
     val += shareMemI[idx];
 
-    //MTL: atomic_uint device* {{.*}}uintBuffer
+    //MTL: atomic_uint device*{{.*}}uintBuffer
     //LIB: call {{.*}}.atomic.global.add.u.i32
     InterlockedAdd(uintBuffer[idx], 1);
     val += uintBuffer[idx];
 
-    //MTL: atomic_int device* {{.*}}intBuffer
+    //MTL: atomic_int device*{{.*}}intBuffer
     //LIB: call {{.*}}.atomic.global.add.s.i32
     InterlockedAdd(intBuffer[idx], 2);
     val += intBuffer[idx];
diff --git a/tests/metal/atomic-texture-buffer.slang b/tests/metal/atomic-texture-buffer.slang
index 3e4eda94b..1db156364 100644
--- a/tests/metal/atomic-texture-buffer.slang
+++ b/tests/metal/atomic-texture-buffer.slang
@@ -2,7 +2,7 @@
 //TEST:SIMPLE(filecheck=METAL_FLOAT): -target metal -stage compute -entry computeMain -DFLOAT
 //TEST:SIMPLE(filecheck=METALLIB): -target metallib -stage compute -entry computeMain
 
-// METAL_FLOAT: 'float' atomic texture operations are disallowed with Metal target's
+// METAL_FLOAT: floating point atomic operation
 
 //METALLIB: @computeMain
 
diff --git a/tests/slang-extension/atomic-int64-byte-address-buffer.slang b/tests/slang-extension/atomic-int64-byte-address-buffer.slang
index 9a7ae3b61..61e38069d 100644
--- a/tests/slang-extension/atomic-int64-byte-address-buffer.slang
+++ b/tests/slang-extension/atomic-int64-byte-address-buffer.slang
@@ -5,7 +5,7 @@
 // No support for int64_t on fxc - we need SM6.0 and dxil
 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -nvapi-slot u0 -shaderobj
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -nvapi-slot u0 -compile-arg -O2 -shaderobj
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -compile-arg -O2 -shaderobj
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -render-features atomic-int64 -shaderobj
 //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
diff --git a/tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang b/tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang
index 4ab67df8e..2fce9788a 100644
--- a/tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang
+++ b/tests/slang-extension/atomic-min-max-u64-byte-address-buffer.slang
@@ -5,7 +5,7 @@
 // No support for int64_t on fxc - we need SM6.0 and dxil
 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -nvapi-slot u0 -shaderobj
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -nvapi-slot u0 -compile-arg -O2 -shaderobj
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -compile-arg -O2 -shaderobj
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -render-features atomic-int64 -shaderobj
 // For some reason this doesn't work correctly on CUDA? That it behaves as if always does Min. Min and Max do appropriate 
 // things tho, because if I force the condition I do get the right answer
diff --git a/tests/slang-extension/cas-int64-byte-address-buffer.slang b/tests/slang-extension/cas-int64-byte-address-buffer.slang
index 873f6ab4b..2d3189215 100644
--- a/tests/slang-extension/cas-int64-byte-address-buffer.slang
+++ b/tests/slang-extension/cas-int64-byte-address-buffer.slang
@@ -5,7 +5,7 @@
 // No support for int64_t on fxc - we need SM6.0 and dxil
 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -nvapi-slot u0 -shaderobj
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -nvapi-slot u0 -compile-arg -O2 -shaderobj
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -compile-arg -O2 -shaderobj
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -render-features atomic-int64 -shaderobj
 //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
diff --git a/tests/slang-extension/exchange-int64-byte-address-buffer.slang b/tests/slang-extension/exchange-int64-byte-address-buffer.slang
index 84654ab80..a6c1277ac 100644
--- a/tests/slang-extension/exchange-int64-byte-address-buffer.slang
+++ b/tests/slang-extension/exchange-int64-byte-address-buffer.slang
@@ -2,10 +2,7 @@
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
 // No support for int64_t on DX11
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj
-// No support for int64_t on fxc - we need SM6.0 and dxil
-// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -nvapi-slot u0 -shaderobj
-//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -nvapi-slot u0 -compile-arg -O2 -shaderobj
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -render-features atomic-int64 -compile-arg -O2 -shaderobj
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -render-features atomic-int64 -shaderobj
 //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
 
diff --git a/tests/spirv/ref-this.slang b/tests/spirv/ref-this.slang
index 5eaa7f3a1..de4263975 100644
--- a/tests/spirv/ref-this.slang
+++ b/tests/spirv/ref-this.slang
@@ -1,7 +1,7 @@
 //TEST:SIMPLE(filecheck=CHECK): -target spirv
 
 // CHECK: %[[PTR:[0-9a-zA-Z_]+]] = OpAccessChain %_ptr_PhysicalStorageBuffer_uint %{{.*}} %int_0
-// CHECK: %original = OpAtomicIAdd %uint %[[PTR]] %uint_1 %uint_0 %uint_1
+// CHECK: %{{.*}} = OpAtomicIAdd %uint %[[PTR]] %uint_1 %uint_0 %uint_1
 
 struct Buf
 {
author	Yong He <yonghe@outlook.com>	2024-10-17 20:14:22 -0700
committer	GitHub <noreply@github.com>	2024-10-17 20:14:22 -0700
commit	a618b8c5e249b0f20e6c0c95f9da1b5cbfdbf08b (patch)
tree	d583c373d574a265fefe7f288a96c4b382e259b8
parent	11e1ecafa09396a3559fe245d729b40ce4f25d52 (diff)