22 files changed, 227 insertions, 299 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index c9f3fb533..a671a3dc4 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -17309,8 +17309,8 @@ uint dot4add_u8packed(uint x, uint y, uint acc)
             result:$$uint = OpIAdd %dotResult $acc;
         };
     default:
-        uint4 vecX = unpack_u8u32(uint8_t4_packed(x));
-        uint4 vecY = unpack_u8u32(uint8_t4_packed(y));
+        uint4 vecX = unpackUint4x8ToUint32(x);
+        uint4 vecY = unpackUint4x8ToUint32(y);
         return dot(vecX, vecY) + acc;
     }
 }
@@ -17337,8 +17337,8 @@ int dot4add_i8packed(uint x, uint y, int acc)
             result:$$int = OpIAdd %dotResult $acc;
         };
     default:
-        int4 vecX = unpack_s8s32(int8_t4_packed(x));
-        int4 vecY = unpack_s8s32(int8_t4_packed(y));
+        int4 vecX = unpackInt4x8ToInt32(x);
+        int4 vecY = unpackInt4x8ToInt32(y);
         return dot(vecX, vecY) + acc;
     }
 }
@@ -24035,383 +24035,426 @@ T workgroupUniformLoad<T>(__ref T v)
 }
 
 //
-// Pack/Unpack Math Intrinsics
+// HLSL Pack/Unpack Math Intrinsics
 //
 // These were introduced in SM 6.6 but requirements are dropped to SM 5.0 here
 // to expose these intrinsics on targets that do not have SM 6.6 features.
 //
 
-//@hidden:
+//@public:
+
+typealias uint8_t4_packed = uint;
+typealias int8_t4_packed = uint;
+
+/// Unpack 4 signed 8-bit values into a vector of 16 bit integers.
 [__readNone]
 [ForceInline]
-uint16_t __lsb_as_u16(uint32_t val)
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int16_t4 unpack_s8s16(int8_t4_packed packed)
 {
-    return uint16_t(val & 0xFFU);
+    return unpackInt4x8ToInt16(packed);
 }
 
-//@hidden:
+/// Unpack 4 unsigned 8-bit values into a vector of 16 bit integers.
 [__readNone]
 [ForceInline]
-uint32_t __lsb_as_u32(uint32_t val)
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint16_t4 unpack_u8u16(uint8_t4_packed packed)
 {
-    return (val & 0xFFU);
+    return unpackUint4x8ToUint16(packed);
 }
 
-//@hidden:
+/// Unpack 4 signed 8-bit values into a vector of 32 bit integers.
 [__readNone]
 [ForceInline]
-int8_t __lsb_as_s8(uint32_t val)
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int32_t4 unpack_s8s32(int8_t4_packed packed)
 {
-    return int8_t(val & 0xFFU);
+    return unpackInt4x8ToInt32(packed);
 }
 
-//@hidden:
+/// Unpack 4 unsigned 8-bit values into a vector of 32 bit integers.
 [__readNone]
 [ForceInline]
-int16_t __lsb_as_s16(uint32_t val)
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint32_t4 unpack_u8u32(uint8_t4_packed packed)
 {
-    return int16_t(__lsb_as_s8(val));
+    return unpackUint4x8ToUint32(packed);
 }
 
-//@hidden:
+/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
 [__readNone]
 [ForceInline]
-int32_t __lsb_as_s32(uint32_t val)
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint8_t4_packed pack_u8(uint32_t4 unpackedValue)
 {
-    return int32_t(__lsb_as_s8(val));
+    return packUint4x8(unpackedValue);
 }
 
-//@hidden:
+/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int8_t4_packed pack_s8(int32_t4 unpackedValue)
+{
+    return packInt4x8(unpackedValue);
+}
+
+/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint8_t4_packed pack_u8(uint16_t4 unpackedValue)
+{
+    return packUint4x8(unpackedValue);
+}
+
+/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int8_t4_packed pack_s8(int16_t4 unpackedValue)
+{
+    return packInt4x8(unpackedValue);
+}
+
+/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers,
+/// clamping each value to the range [0, 255] to ensure it fits within 8 bits.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint8_t4_packed pack_clamp_u8(int32_t4 unpackedValue)
+{
+    return packUint4x8Clamp(unpackedValue);
+}
+
+/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers,
+/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int8_t4_packed pack_clamp_s8(int32_t4 unpackedValue)
+{
+    return packInt4x8Clamp(unpackedValue);
+}
+
+/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers,
+/// clamping each value to the range [0, 255] to ensure it fits within 8 bits.
 [__readNone]
 [ForceInline]
-uint32_t __lsb_clamp_u8_as_u32(int32_t val)
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint8_t4_packed pack_clamp_u8(int16_t4 unpackedValue)
 {
-    return clamp(val, 0, 255);
+    return packUint4x8Clamp(unpackedValue);
 }
 
+/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers,
+/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int8_t4_packed pack_clamp_s8(int16_t4 unpackedValue)
+{
+    return packInt4x8Clamp(unpackedValue);
+}
+
+// Work-graphs
+
+//@public:
+/// read-only input to Broadcasting launch node.
+__generic<T>
+//TODO: DispatchNodeInputRecord should be available only for broadcasting node shader.
+//[require(broadcasting_node)]
+[require(spirv)]
+struct DispatchNodeInputRecord
+{
+    /// Provide an access to a record object that only holds a single record.
+    NodePayloadPtr<T> Get()
+    {
+        int index = 0;
+        __target_switch
+        {
+        case spirv:
+            return spirv_asm
+            {
+                %in_payload_t = OpTypeNodePayloadArrayAMDX $$T;
+                %in_payload_ptr_t = OpTypePointer NodePayloadAMDX %in_payload_t;
+                %var = OpVariable %in_payload_ptr_t NodePayloadAMDX;
+                result : $$NodePayloadPtr<T> = OpAccessChain %var $index;
+            };
+        }
+    }
+};
+
+//
+// Pack/Unpack Intrinsics
+//
+
 //@hidden:
+
+[__readNone]
+[ForceInline]
+uint16_t __lsbAsUint16(uint32_t val)
+{
+    return uint16_t(val & 0xFFU);
+}
+
+[__readNone]
+[ForceInline]
+uint32_t __lsbAsUint32(uint32_t val)
+{
+    return (val & 0xFFU);
+}
+
+[__readNone]
+[ForceInline]
+int8_t __lsbAsInt8(uint32_t val)
+{
+    return int8_t(val);
+}
+
 [__readNone]
 [ForceInline]
-uint32_t __lsb_clamp_s8_as_u32(int32_t val)
+int16_t __lsbAsInt16(uint32_t val)
 {
-    return (uint32_t(clamp(val, -128, 127)) & 0xFFU);
+    return int16_t(__lsbAsInt8(val));
+}
+
+[__readNone]
+[ForceInline]
+int32_t __lsbAsInt32(uint32_t val)
+{
+    return int32_t(__lsbAsInt8(val));
 }
 
 //@public:
-/// Unpack 4 signed 8-bit values into a vector of 16 bit integers.
+
+/// Unpack 4 unsigned 8-bit values into a vector of 32 bit integers.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-int16_t4 unpack_s8s16(int8_t4_packed packed)
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint32_t4 unpackUint4x8ToUint32(uint packedValue)
 {
     __target_switch
     {
-    case hlsl: __intrinsic_asm "unpack_s8s16";
-    case spirv: 
+    case hlsl: __intrinsic_asm "unpack_u8u32";
+    case wgsl: __intrinsic_asm "unpack4xU8";
+    case spirv:
         return spirv_asm
         {
-            %s8Vec = OpBitcast $$vector<int8_t, 4> $packed;
-            result:$$vector<int16_t, 4> = OpSConvert %s8Vec
+            %u8Vec = OpBitcast $$vector<uint8_t, 4> $packedValue;
+            result:$$vector<uint32_t, 4> = OpUConvert %u8Vec
         };
     default:
-        uint32_t packedValue = uint32_t(packed);
-        return int16_t4
+        return uint32_t4
         (
-            __lsb_as_s16(packedValue),
-            __lsb_as_s16(packedValue >> 8U),
-            __lsb_as_s16(packedValue >> 16U),
-            __lsb_as_s16(packedValue >> 24U),
+            __lsbAsUint32(packedValue),
+            __lsbAsUint32(packedValue >> 8U),
+            __lsbAsUint32(packedValue >> 16U),
+            uint32_t(packedValue >> 24U),
         );
     }
 }
 
-//@public:
 /// Unpack 4 unsigned 8-bit values into a vector of 16 bit integers.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-uint16_t4 unpack_u8u16(uint8_t4_packed packed)
+uint16_t4 unpackUint4x8ToUint16(uint packedValue)
 {
     __target_switch
     {
     case hlsl: __intrinsic_asm "unpack_u8u16";
-    case spirv: 
+    case spirv:
         return spirv_asm
         {
-            %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed;
+            %u8Vec = OpBitcast $$vector<uint8_t, 4> $packedValue;
             result:$$vector<uint16_t, 4> = OpUConvert %u8Vec
         };
     default:
-        uint32_t packedValue = uint32_t(packed);
         return uint16_t4
         (
-            __lsb_as_u16(packedValue),
-            __lsb_as_u16(packedValue >> 8U),
-            __lsb_as_u16(packedValue >> 16U),
-            __lsb_as_u16(packedValue >> 24U),
+            __lsbAsUint16(packedValue),
+            __lsbAsUint16(packedValue >> 8U),
+            __lsbAsUint16(packedValue >> 16U),
+            uint16_t(packedValue >> 24U),
         );
     }
 }
 
-//@public:
 /// Unpack 4 signed 8-bit values into a vector of 32 bit integers.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-int32_t4 unpack_s8s32(int8_t4_packed packed)
+int32_t4 unpackInt4x8ToInt32(uint packedValue)
 {
     __target_switch
     {
     case hlsl: __intrinsic_asm "unpack_s8s32";
     case wgsl: __intrinsic_asm "unpack4xI8";
-    case spirv: 
+    case spirv:
         return spirv_asm
         {
-            %s8Vec = OpBitcast $$vector<int8_t, 4> $packed;
+            %s8Vec = OpBitcast $$vector<int8_t, 4> $packedValue;
             result:$$vector<int32_t, 4> = OpSConvert %s8Vec
         };
     default:
-        uint32_t packedValue = uint32_t(packed);
         return int32_t4
         (
-            __lsb_as_s32(packedValue),
-            __lsb_as_s32(packedValue >> 8U),
-            __lsb_as_s32(packedValue >> 16U),
-            __lsb_as_s32(packedValue >> 24U),
+            __lsbAsInt32(packedValue),
+            __lsbAsInt32(packedValue >> 8U),
+            __lsbAsInt32(packedValue >> 16U),
+            int32_t(int8_t(packedValue >> 24U)),
         );
     }
 }
 
-//@public:
-/// Unpack 4 unsigned 8-bit values into a vector of 32 bit integers.
+/// Unpack 4 signed 8-bit values into a vector of 16 bit integers.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-uint32_t4 unpack_u8u32(uint8_t4_packed packed)
+int16_t4 unpackInt4x8ToInt16(uint packedValue)
 {
     __target_switch
     {
-    case hlsl: __intrinsic_asm "unpack_u8u32";
-    case wgsl: __intrinsic_asm "unpack4xU8";
-    case spirv: 
+    case hlsl: __intrinsic_asm "unpack_s8s16";
+    case spirv:
         return spirv_asm
         {
-            %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed;
-            result:$$vector<uint32_t, 4> = OpUConvert %u8Vec
+            %s8Vec = OpBitcast $$vector<int8_t, 4> $packedValue;
+            result:$$vector<int16_t, 4> = OpSConvert %s8Vec
         };
     default:
-        uint32_t packedValue = uint32_t(packed);
-        return uint32_t4
+        return int16_t4
         (
-            __lsb_as_u32(packedValue),
-            __lsb_as_u32(packedValue >> 8U),
-            __lsb_as_u32(packedValue >> 16U),
-            __lsb_as_u32(packedValue >> 24U),
+            __lsbAsInt16(packedValue),
+            __lsbAsInt16(packedValue >> 8U),
+            __lsbAsInt16(packedValue >> 16U),
+            int16_t(int8_t(packedValue >> 24U)),
         );
     }
 }
 
-//@public:
 /// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-uint8_t4_packed pack_u8(uint32_t4 unpackedValue)
+uint packUint4x8(uint32_t4 unpackedValue)
 {
     __target_switch
     {
     case hlsl: __intrinsic_asm "pack_u8";
     case wgsl: __intrinsic_asm "pack4xU8";
     default:
-        return uint8_t4_packed
-        (
-            __lsb_as_u32(unpackedValue.x)
-            | (__lsb_as_u32(unpackedValue.y) << 8U)
-            | (__lsb_as_u32(unpackedValue.z) << 16U)
-            | (__lsb_as_u32(unpackedValue.w) << 24U)
-        );
+        return __lsbAsUint32(unpackedValue.x)
+            | (__lsbAsUint32(unpackedValue.y) << 8U)
+            | (__lsbAsUint32(unpackedValue.z) << 16U)
+            | (unpackedValue.w << 24U);
     }
 }
 
-//@public:
-/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
+/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-int8_t4_packed pack_s8(int32_t4 unpackedValue)
+uint packUint4x8(uint16_t4 unpackedValue)
 {
     __target_switch
     {
-    case hlsl: __intrinsic_asm "pack_s8";
-    case wgsl: __intrinsic_asm "pack4xI8";
+    case hlsl: __intrinsic_asm "pack_u8";
     default:
-        return int8_t4_packed
-        (
-            __lsb_as_u32(unpackedValue.x)
-            | (__lsb_as_u32(unpackedValue.y) << 8U)
-            | (__lsb_as_u32(unpackedValue.z) << 16U)
-            | (__lsb_as_u32(unpackedValue.w) << 24U)
-        );
+        return packUint4x8(uint32_t4(unpackedValue));
     }
 }
 
-//@public:
-/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
+/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-uint8_t4_packed pack_u8(uint16_t4 unpackedValue)
+uint packInt4x8(int32_t4 unpackedValue)
 {
     __target_switch
     {
-    case hlsl: __intrinsic_asm "pack_u8";
+    case hlsl: __intrinsic_asm "pack_s8";
+    case wgsl: __intrinsic_asm "pack4xI8";
     default:
-        return uint8_t4_packed
-        (
-            __lsb_as_u32(unpackedValue.x)
-            | (__lsb_as_u32(unpackedValue.y) << 8U)
-            | (__lsb_as_u32(unpackedValue.z) << 16U)
-            | (__lsb_as_u32(unpackedValue.w) << 24U)
-        );
+        return packUint4x8(uint32_t4(unpackedValue));
     }
 }
 
-//@public:
 /// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-int8_t4_packed pack_s8(int16_t4 unpackedValue)
+uint packInt4x8(int16_t4 unpackedValue)
 {
     __target_switch
     {
     case hlsl: __intrinsic_asm "pack_s8";
     default:
-        return int8_t4_packed
-        (
-            __lsb_as_u32(unpackedValue.x)
-            | (__lsb_as_u32(unpackedValue.y) << 8U)
-            | (__lsb_as_u32(unpackedValue.z) << 16U)
-            | (__lsb_as_u32(unpackedValue.w) << 24U)
-        );
+        return packUint4x8(uint32_t4(unpackedValue));
     }
 }
 
-//@public:
-/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers,
-/// clamping each value to the range [0, 255] to ensure it fits within 8 bits.
+/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers,
+/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-uint8_t4_packed pack_clamp_u8(int32_t4 unpackedValue)
+uint packUint4x8Clamp(int32_t4 unpackedValue)
 {
     __target_switch
     {
     case hlsl: __intrinsic_asm "pack_clamp_u8";
     case wgsl: __intrinsic_asm "pack4xU8Clamp(vec4<u32>($0))";
     default:
-        return uint8_t4_packed
-        (
-            __lsb_clamp_u8_as_u32(unpackedValue.x)
-            | (__lsb_clamp_u8_as_u32(unpackedValue.y) << 8U)
-            | (__lsb_clamp_u8_as_u32(unpackedValue.z) << 16U)
-            | (__lsb_clamp_u8_as_u32(unpackedValue.w) << 24U)
-        );
+        return packInt4x8(clamp(unpackedValue, 0, 255));
     }
 }
 
-//@public:
-/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers,
-/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits.
+/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers,
+/// clamping each value to the range [0, 255] to ensure it fits within 8 bits.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-int8_t4_packed pack_clamp_s8(int32_t4 unpackedValue)
+uint packUint4x8Clamp(int16_t4 unpackedValue)
 {
     __target_switch
     {
-    case hlsl: __intrinsic_asm "pack_clamp_s8";
-    case wgsl: __intrinsic_asm "pack4xI8Clamp";
+    case hlsl: __intrinsic_asm "pack_clamp_u8";
     default:
-        return int8_t4_packed
-        (
-            __lsb_clamp_s8_as_u32(unpackedValue.x)
-            | (__lsb_clamp_s8_as_u32(unpackedValue.y) << 8U)
-            | (__lsb_clamp_s8_as_u32(unpackedValue.z) << 16U)
-            | (__lsb_clamp_s8_as_u32(unpackedValue.w) << 24U)
-        );
+        return packInt4x8(clamp(unpackedValue, 0, 255));
     }
 }
 
-//@public:
-/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers,
-/// clamping each value to the range [0, 255] to ensure it fits within 8 bits.
+/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers,
+/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-uint8_t4_packed pack_clamp_u8(int16_t4 unpackedValue)
+uint packInt4x8Clamp(int32_t4 unpackedValue)
 {
     __target_switch
     {
-    case hlsl: __intrinsic_asm "pack_clamp_u8";
+    case hlsl: __intrinsic_asm "pack_clamp_s8";
+    case wgsl: __intrinsic_asm "pack4xI8Clamp";
     default:
-        return uint8_t4_packed
-        (
-            __lsb_clamp_u8_as_u32(unpackedValue.x)
-            | (__lsb_clamp_u8_as_u32(unpackedValue.y) << 8U)
-            | (__lsb_clamp_u8_as_u32(unpackedValue.z) << 16U)
-            | (__lsb_clamp_u8_as_u32(unpackedValue.w) << 24U)
-        );
+        return packInt4x8(clamp(unpackedValue, -128, 127));
     }
 }
 
-//@public:
 /// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers,
 /// clamping each value to the range [-128, 127] to ensure it fits within 8 bits.
 [__readNone]
 [ForceInline]
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
-int8_t4_packed pack_clamp_s8(int16_t4 unpackedValue)
+uint packInt4x8Clamp(int16_t4 unpackedValue)
 {
     __target_switch
     {
     case hlsl: __intrinsic_asm "pack_clamp_s8";
     default:
-        return int8_t4_packed
-        (
-            __lsb_clamp_s8_as_u32(unpackedValue.x)
-            | (__lsb_clamp_s8_as_u32(unpackedValue.y) << 8U)
-            | (__lsb_clamp_s8_as_u32(unpackedValue.z) << 16U)
-            | (__lsb_clamp_s8_as_u32(unpackedValue.w) << 24U)
-        );
+        return packInt4x8(clamp(unpackedValue, -128, 127));
     }
 }
-
-// Work-graphs
-
-//@public:
-/// read-only input to Broadcasting launch node.
-__generic<T>
-//TODO: DispatchNodeInputRecord should be available only for broadcasting node shader.
-//[require(broadcasting_node)]
-[require(spirv)]
-struct DispatchNodeInputRecord
-{
-    /// Provide an access to a record object that only holds a single record.
-    NodePayloadPtr<T> Get()
-    {
-        int index = 0;
-        __target_switch
-        {
-        case spirv:
-            return spirv_asm
-            {
-                %in_payload_t = OpTypeNodePayloadArrayAMDX $$T;
-                %in_payload_ptr_t = OpTypePointer NodePayloadAMDX %in_payload_t;
-                %var = OpVariable %in_payload_ptr_t NodePayloadAMDX;
-                result : $$NodePayloadPtr<T> = OpAccessChain %var $index;
-            };
-        }
-    }
-};
-
diff --git a/source/slang/slang-check-conversion.cpp b/source/slang/slang-check-conversion.cpp
index a9785a585..c5cf192ee 100644
--- a/source/slang/slang-check-conversion.cpp
+++ b/source/slang/slang-check-conversion.cpp
@@ -1026,8 +1026,6 @@ int getTypeBitSize(Type* t)
         return 16;
     case BaseType::Int:
     case BaseType::UInt:
-    case BaseType::Int8x4Packed:
-    case BaseType::UInt8x4Packed:
         return 32;
     case BaseType::Int64:
     case BaseType::UInt64:
diff --git a/source/slang/slang-check-decl.cpp b/source/slang/slang-check-decl.cpp
index 486ac6e9c..5ee8ba3ef 100644
--- a/source/slang/slang-check-decl.cpp
+++ b/source/slang/slang-check-decl.cpp
@@ -1981,8 +1981,6 @@ void SemanticsDeclHeaderVisitor::checkVarDeclCommon(VarDeclBase* varDecl)
             case BaseType::UInt:
             case BaseType::UInt64:
             case BaseType::UIntPtr:
-            case BaseType::Int8x4Packed:
-            case BaseType::UInt8x4Packed:
                 break;
             default:
                 getSink()->diagnose(varDecl, Diagnostics::staticConstRequirementMustBeIntOrBool);
diff --git a/source/slang/slang-emit-c-like.cpp b/source/slang/slang-emit-c-like.cpp
index 1c48d98ef..ff40d5b28 100644
--- a/source/slang/slang-emit-c-like.cpp
+++ b/source/slang/slang-emit-c-like.cpp
@@ -266,11 +266,6 @@ void CLikeSourceEmitter::emitSimpleType(IRType* type)
     case kIROp_UIntPtrType:
         return UnownedStringSlice("uintptr_t");
 
-    case kIROp_Int8x4PackedType:
-        return UnownedStringSlice("int8_t4_packed");
-    case kIROp_UInt8x4PackedType:
-        return UnownedStringSlice("uint8_t4_packed");
-
     case kIROp_HalfType:
         return UnownedStringSlice("half");
 
@@ -1334,8 +1329,6 @@ void CLikeSourceEmitter::emitSimpleValueImpl(IRInst* inst)
                         return;
                     }
                 case BaseType::UInt:
-                case BaseType::Int8x4Packed:
-                case BaseType::UInt8x4Packed:
                     {
                         m_writer->emit(UInt(uint32_t(litInst->value.intVal)));
                         m_writer->emit("U");
@@ -4045,8 +4038,6 @@ void CLikeSourceEmitter::emitVecNOrScalar(
                 m_writer->emit("ushort");
                 break;
             case kIROp_UIntType:
-            case kIROp_Int8x4PackedType:
-            case kIROp_UInt8x4PackedType:
                 m_writer->emit("uint");
                 break;
             case kIROp_UInt64Type:
diff --git a/source/slang/slang-emit-cpp.cpp b/source/slang/slang-emit-cpp.cpp
index 13a85e8ab..bfc021677 100644
--- a/source/slang/slang-emit-cpp.cpp
+++ b/source/slang/slang-emit-cpp.cpp
@@ -101,13 +101,8 @@ static const char s_xyzwNames[] = "xyzw";
     case kIROp_UIntPtrType:
         return UnownedStringSlice("uintptr_t");
 
-    case kIROp_Int8x4PackedType:
-    case kIROp_UInt8x4PackedType:
-        return UnownedStringSlice("uint32_t");
-
         // Not clear just yet how we should handle half... we want all processing as float
         // probly, but when reading/writing to memory converting
-
     case kIROp_HalfType:
         return UnownedStringSlice("half");
 
diff --git a/source/slang/slang-emit-cuda.cpp b/source/slang/slang-emit-cuda.cpp
index 58ac377bf..8657b3707 100644
--- a/source/slang/slang-emit-cuda.cpp
+++ b/source/slang/slang-emit-cuda.cpp
@@ -84,9 +84,6 @@ UnownedStringSlice CUDASourceEmitter::getBuiltinTypeName(IROp op)
     case kIROp_UIntPtrType:
         return UnownedStringSlice("uint");
 #endif
-    case kIROp_Int8x4PackedType:
-    case kIROp_UInt8x4PackedType:
-        return UnownedStringSlice("uint");
 
     case kIROp_HalfType:
         return UnownedStringSlice("__half");
diff --git a/source/slang/slang-emit-glsl.cpp b/source/slang/slang-emit-glsl.cpp
index 696830bf2..fca5a8933 100644
--- a/source/slang/slang-emit-glsl.cpp
+++ b/source/slang/slang-emit-glsl.cpp
@@ -1330,8 +1330,6 @@ void GLSLSourceEmitter::emitSimpleValueImpl(IRInst* inst)
                         return;
                     }
                 case BaseType::UInt:
-                case BaseType::Int8x4Packed:
-                case BaseType::UInt8x4Packed:
                     {
                         m_writer->emit(UInt(uint32_t(litInst->value.intVal)));
                         m_writer->emit("U");
@@ -2175,8 +2173,6 @@ bool GLSLSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
                 break;
 
             case BaseType::UInt:
-            case BaseType::Int8x4Packed:
-            case BaseType::UInt8x4Packed:
                 if (fromType == BaseType::Float)
                 {
                     m_writer->emit("floatBitsToUint");
@@ -3186,18 +3182,6 @@ void GLSLSourceEmitter::emitSimpleTypeImpl(IRType* type)
 #endif
             return;
         }
-    case kIROp_Int8x4PackedType:
-        {
-            _requireBaseType(BaseType::Int8x4Packed);
-            m_writer->emit("uint");
-            return;
-        }
-    case kIROp_UInt8x4PackedType:
-        {
-            _requireBaseType(BaseType::UInt8x4Packed);
-            m_writer->emit("uint");
-            return;
-        }
     case kIROp_VoidType:
     case kIROp_BoolType:
     case kIROp_Int8Type:
diff --git a/source/slang/slang-emit-hlsl.cpp b/source/slang/slang-emit-hlsl.cpp
index 59d40d3a1..89300e13e 100644
--- a/source/slang/slang-emit-hlsl.cpp
+++ b/source/slang/slang-emit-hlsl.cpp
@@ -902,8 +902,6 @@ bool HLSLSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
             case BaseType::UInt64:
             case BaseType::UIntPtr:
             case BaseType::Bool:
-            case BaseType::Int8x4Packed:
-            case BaseType::UInt8x4Packed:
                 // Because the intermediate type will always
                 // be an integer type, we can convert to
                 // another integer type of the same size
@@ -943,8 +941,6 @@ bool HLSLSourceEmitter::tryEmitInstExprImpl(IRInst* inst, const EmitOpInfo& inOu
             case BaseType::UInt:
             case BaseType::Int:
             case BaseType::Bool:
-            case BaseType::Int8x4Packed:
-            case BaseType::UInt8x4Packed:
                 break;
             case BaseType::UInt16:
             case BaseType::Int16:
@@ -1330,8 +1326,6 @@ void HLSLSourceEmitter::emitSimpleTypeImpl(IRType* type)
     case kIROp_Int16Type:
     case kIROp_UInt16Type:
     case kIROp_HalfType:
-    case kIROp_Int8x4PackedType:
-    case kIROp_UInt8x4PackedType:
         {
             m_writer->emit(getDefaultBuiltinTypeName(type->getOp()));
             return;
diff --git a/source/slang/slang-emit-metal.cpp b/source/slang/slang-emit-metal.cpp
index 0a7db8b28..1bb738346 100644
--- a/source/slang/slang-emit-metal.cpp
+++ b/source/slang/slang-emit-metal.cpp
@@ -1099,10 +1099,6 @@ void MetalSourceEmitter::emitSimpleTypeImpl(IRType* type)
     case kIROp_UIntPtrType:
         m_writer->emit("ulong");
         return;
-    case kIROp_Int8x4PackedType:
-    case kIROp_UInt8x4PackedType:
-        m_writer->emit("uint");
-        return;
     case kIROp_StructType:
         m_writer->emit(getName(type));
         return;
diff --git a/source/slang/slang-emit-spirv.cpp b/source/slang/slang-emit-spirv.cpp
index 802df915e..c7e222247 100644
--- a/source/slang/slang-emit-spirv.cpp
+++ b/source/slang/slang-emit-spirv.cpp
@@ -1466,8 +1466,6 @@ struct SPIRVEmitContext : public SourceEmitterBase, public SPIRVEmitSharedContex
         case kIROp_Int8Type:
         case kIROp_IntType:
         case kIROp_Int64Type:
-        case kIROp_Int8x4PackedType:
-        case kIROp_UInt8x4PackedType:
             {
                 const IntInfo i = getIntTypeInfo(as<IRType>(inst));
                 if (i.width == 16)
@@ -7642,8 +7640,6 @@ struct SPIRVEmitContext : public SourceEmitterBase, public SPIRVEmitSharedContex
             case kIROp_UInt64Type:
             case kIROp_UInt8Type:
             case kIROp_UIntPtrType:
-            case kIROp_Int8x4PackedType:
-            case kIROp_UInt8x4PackedType:
                 spvEncoding = 6; // Unsigned
                 break;
             case kIROp_FloatType:
diff --git a/source/slang/slang-emit-wgsl.cpp b/source/slang/slang-emit-wgsl.cpp
index 7c83b194d..d87cd06de 100644
--- a/source/slang/slang-emit-wgsl.cpp
+++ b/source/slang/slang-emit-wgsl.cpp
@@ -511,10 +511,6 @@ void WGSLSourceEmitter::emitSimpleTypeImpl(IRType* type)
     case kIROp_UIntPtrType:
         m_writer->emit("u64");
         return;
-    case kIROp_Int8x4PackedType:
-    case kIROp_UInt8x4PackedType:
-        m_writer->emit("u32");
-        return;
     case kIROp_StructType:
         m_writer->emit(getName(type));
         return;
@@ -967,8 +963,6 @@ void WGSLSourceEmitter::emitSimpleValueImpl(IRInst* inst)
                         return;
                     }
                 case BaseType::UInt:
-                case BaseType::Int8x4Packed:
-                case BaseType::UInt8x4Packed:
                     {
                         m_writer->emit("u32(");
                         m_writer->emit(UInt(uint32_t(litInst->value.intVal)));
diff --git a/source/slang/slang-ir-any-value-marshalling.cpp b/source/slang/slang-ir-any-value-marshalling.cpp
index 2dc91d299..d124b293d 100644
--- a/source/slang/slang-ir-any-value-marshalling.cpp
+++ b/source/slang/slang-ir-any-value-marshalling.cpp
@@ -153,8 +153,6 @@ struct AnyValueMarshallingContext
         case kIROp_IntPtrType:
         case kIROp_UIntPtrType:
         case kIROp_PtrType:
-        case kIROp_Int8x4PackedType:
-        case kIROp_UInt8x4PackedType:
             context->marshalBasicType(builder, dataType, concreteTypedVar);
             break;
         case kIROp_VectorType:
@@ -311,8 +309,6 @@ struct AnyValueMarshallingContext
                     break;
                 }
             case kIROp_UIntType:
-            case kIROp_Int8x4PackedType:
-            case kIROp_UInt8x4PackedType:
 #if SLANG_PTR_IS_32
             case kIROp_UIntPtrType:
 #endif
@@ -560,8 +556,6 @@ struct AnyValueMarshallingContext
                     break;
                 }
             case kIROp_UIntType:
-            case kIROp_Int8x4PackedType:
-            case kIROp_UInt8x4PackedType:
                 {
                     ensureOffsetAt4ByteBoundary();
                     if (fieldOffset < static_cast<uint32_t>(anyValInfo->fieldKeys.getCount()))
@@ -861,8 +855,6 @@ SlangInt _getAnyValueSizeRaw(IRType* type, SlangInt offset)
     case kIROp_FloatType:
     case kIROp_UIntType:
     case kIROp_BoolType:
-    case kIROp_Int8x4PackedType:
-    case kIROp_UInt8x4PackedType:
         return alignUp(offset, 4) + 4;
     case kIROp_UInt64Type:
     case kIROp_Int64Type:
diff --git a/source/slang/slang-ir-byte-address-legalize.cpp b/source/slang/slang-ir-byte-address-legalize.cpp
index 9207e6a2f..617c8c7c4 100644
--- a/source/slang/slang-ir-byte-address-legalize.cpp
+++ b/source/slang/slang-ir-byte-address-legalize.cpp
@@ -840,8 +840,6 @@ struct ByteAddressBufferLegalizationContext
         case kIROp_IntType:
         case kIROp_FloatType:
         case kIROp_BoolType:
-        case kIROp_Int8x4PackedType:
-        case kIROp_UInt8x4PackedType:
             // The basic 32-bit types (and `bool`) can be handled by
             // loading `uint` values and then bit-casting.
             //
diff --git a/source/slang/slang-ir-layout.cpp b/source/slang/slang-ir-layout.cpp
index 1332c8a25..7ce19bf67 100644
--- a/source/slang/slang-ir-layout.cpp
+++ b/source/slang/slang-ir-layout.cpp
@@ -128,9 +128,6 @@ static Result _calcSizeAndAlignment(
         BASE(UIntPtr, kPointerSize);
         BASE(Double, 8);
 
-        BASE(Int8x4Packed, 4);
-        BASE(UInt8x4Packed, 4);
-
         // We are currently handling `bool` following the HLSL
         // precednet of storing it in 4 bytes.
         //
diff --git a/source/slang/slang-ir-lower-bit-cast.cpp b/source/slang/slang-ir-lower-bit-cast.cpp
index 0a7783639..450ad8ac9 100644
--- a/source/slang/slang-ir-lower-bit-cast.cpp
+++ b/source/slang/slang-ir-lower-bit-cast.cpp
@@ -178,8 +178,6 @@ struct BitCastLoweringContext
         case kIROp_UIntType:
         case kIROp_FloatType:
         case kIROp_BoolType:
-        case kIROp_Int8x4PackedType:
-        case kIROp_UInt8x4PackedType:
 #if SLANG_PTR_IS_32
         case kIROp_IntPtrType:
         case kIROp_UIntPtrType:
diff --git a/source/slang/slang-ir-util.cpp b/source/slang/slang-ir-util.cpp
index 39c1c5bb1..f75a24ac6 100644
--- a/source/slang/slang-ir-util.cpp
+++ b/source/slang/slang-ir-util.cpp
@@ -107,8 +107,6 @@ IROp getTypeStyle(IROp op)
     case kIROp_UInt64Type:
     case kIROp_IntPtrType:
     case kIROp_UIntPtrType:
-    case kIROp_Int8x4PackedType:
-    case kIROp_UInt8x4PackedType:
         {
             // All int like
             return kIROp_IntType;
@@ -144,8 +142,6 @@ IROp getTypeStyle(BaseType op)
     case BaseType::UInt:
     case BaseType::UInt64:
     case BaseType::UIntPtr:
-    case BaseType::Int8x4Packed:
-    case BaseType::UInt8x4Packed:
         return kIROp_IntType;
     case BaseType::Half:
     case BaseType::Float:
@@ -476,12 +472,6 @@ void getTypeNameHint(StringBuilder& sb, IRInst* type)
     case kIROp_UIntPtrType:
         sb << "uintptr";
         break;
-    case kIROp_Int8x4PackedType:
-        sb << "int8_t4_packed";
-        break;
-    case kIROp_UInt8x4PackedType:
-        sb << "uint8_t4_packed";
-        break;
     case kIROp_CharType:
         sb << "char";
         break;
@@ -1862,10 +1852,6 @@ UnownedStringSlice getBasicTypeNameHint(IRType* basicType)
         return UnownedStringSlice::fromLiteral("uint64");
     case kIROp_UIntPtrType:
         return UnownedStringSlice::fromLiteral("uintptr");
-    case kIROp_Int8x4PackedType:
-        return UnownedStringSlice::fromLiteral("int8_t4_packed");
-    case kIROp_UInt8x4PackedType:
-        return UnownedStringSlice::fromLiteral("uint8_t4_packed");
     case kIROp_FloatType:
         return UnownedStringSlice::fromLiteral("float");
     case kIROp_HalfType:
diff --git a/source/slang/slang-ir.cpp b/source/slang/slang-ir.cpp
index 1b0f99cd3..b982385fa 100644
--- a/source/slang/slang-ir.cpp
+++ b/source/slang/slang-ir.cpp
@@ -3814,8 +3814,6 @@ IRInst* IRBuilder::emitDefaultConstruct(IRType* type, bool fallback)
     case kIROp_UIntType:
     case kIROp_UIntPtrType:
     case kIROp_UInt64Type:
-    case kIROp_Int8x4PackedType:
-    case kIROp_UInt8x4PackedType:
     case kIROp_CharType:
         return getIntValue(type, 0);
     case kIROp_BoolType:
@@ -7543,8 +7541,6 @@ bool isIntegralType(IRType* t)
         case BaseType::UInt64:
         case BaseType::IntPtr:
         case BaseType::UIntPtr:
-        case BaseType::Int8x4Packed:
-        case BaseType::UInt8x4Packed:
             return true;
         default:
             return false;
@@ -7591,10 +7587,6 @@ IntInfo getIntTypeInfo(const IRType* intType)
     case kIROp_Int64Type:
         return {64, true};
 
-    case kIROp_Int8x4PackedType:
-    case kIROp_UInt8x4PackedType:
-        return {32, false};
-
     case kIROp_IntPtrType:  // target platform dependent
     case kIROp_UIntPtrType: // target platform dependent
     default:
diff --git a/source/slang/slang-lower-to-ir.cpp b/source/slang/slang-lower-to-ir.cpp
index e5037bf04..e5ca77634 100644
--- a/source/slang/slang-lower-to-ir.cpp
+++ b/source/slang/slang-lower-to-ir.cpp
@@ -4633,8 +4633,6 @@ struct ExprLoweringVisitorBase : public ExprVisitor<Derived, LoweredValInfo>
             case BaseType::UInt64:
             case BaseType::UIntPtr:
             case BaseType::IntPtr:
-            case BaseType::Int8x4Packed:
-            case BaseType::UInt8x4Packed:
                 return LoweredValInfo::simple(getBuilder()->getIntValue(type, 0));
 
             case BaseType::Half:
diff --git a/source/slang/slang-mangle.cpp b/source/slang/slang-mangle.cpp
index dedbb2d48..d51fafb6b 100644
--- a/source/slang/slang-mangle.cpp
+++ b/source/slang/slang-mangle.cpp
@@ -186,13 +186,6 @@ void emitBaseType(ManglingContext* context, BaseType baseType)
     case BaseType::IntPtr:
         emitRaw(context, "ip");
         break;
-    case BaseType::Int8x4Packed:
-        emitRaw(context, "c4p");
-        break;
-    case BaseType::UInt8x4Packed:
-        emitRaw(context, "C4p");
-        break;
-
     default:
         SLANG_UNEXPECTED("unimplemented case in base type mangling");
         break;
diff --git a/source/slang/slang-type-layout.cpp b/source/slang/slang-type-layout.cpp
index 7968450b4..e6729ca85 100644
--- a/source/slang/slang-type-layout.cpp
+++ b/source/slang/slang-type-layout.cpp
@@ -111,10 +111,6 @@ struct DefaultLayoutRulesImpl : SimpleLayoutRulesImpl
                 sizeof(intptr_t),
                 sizeof(intptr_t));
 
-        case BaseType::Int8x4Packed:
-        case BaseType::UInt8x4Packed:
-            return SimpleLayoutInfo(LayoutResourceKind::Uniform, 4, 4);
-
         case BaseType::Half:
             return SimpleLayoutInfo(LayoutResourceKind::Uniform, 2, 2);
         case BaseType::Float:
diff --git a/source/slang/slang-type-system-shared.h b/source/slang/slang-type-system-shared.h
index 583eb2216..d7bd43122 100644
--- a/source/slang/slang-type-system-shared.h
+++ b/source/slang/slang-type-system-shared.h
@@ -22,8 +22,6 @@ namespace Slang
     X(Char)                  \
     X(IntPtr)                \
     X(UIntPtr)               \
-    X(Int8x4Packed)          \
-    X(UInt8x4Packed)         \
     /* end */
 
 enum class BaseType
diff --git a/source/slang/slang.cpp b/source/slang/slang.cpp
index fe7ecc4e9..924a4371a 100644
--- a/source/slang/slang.cpp
+++ b/source/slang/slang.cpp
@@ -82,8 +82,6 @@ namespace Slang
      BaseTypeInfo::Flag::Signed | BaseTypeInfo::Flag::Integer,
      uint8_t(BaseType::IntPtr)},
     {uint8_t(sizeof(uintptr_t)), BaseTypeInfo::Flag::Integer, uint8_t(BaseType::UIntPtr)},
-    {uint8_t(sizeof(uint32_t)), BaseTypeInfo::Flag::Integer, uint8_t(BaseType::Int8x4Packed)},
-    {uint8_t(sizeof(uint32_t)), BaseTypeInfo::Flag::Integer, uint8_t(BaseType::UInt8x4Packed)},
 };
 
 /* static */ bool BaseTypeInfo::check()
@@ -135,10 +133,6 @@ namespace Slang
         return UnownedStringSlice::fromLiteral("intptr_t");
     case BaseType::UIntPtr:
         return UnownedStringSlice::fromLiteral("uintptr_t");
-    case BaseType::Int8x4Packed:
-        return UnownedStringSlice::fromLiteral("int8_t4_packed");
-    case BaseType::UInt8x4Packed:
-        return UnownedStringSlice::fromLiteral("uint8_t4_packed");
     default:
         {
             SLANG_ASSERT(!"Unknown basic type");