Implement HLSL pack/unpack math intrinsics (#5934)

author: Darren Wihandi <65404740+fairywreath@users.noreply.github.com> 2024-12-28 14:33:16 -0500
committer: GitHub <noreply@github.com> 2024-12-28 19:33:16 +0000
commit: 7a6de4aea1973b379d9f3b7db248ad260d3ee024 (patch)
tree: 081d4ef6ee9b91f7478ad55c3a6483e4a1004b37
parent: c4429bc33450be32ed82358c3974da58e5ec25ab (diff)
3 files changed, 536 insertions, 0 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 85a81eabf..774c6a247 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -20806,3 +20806,357 @@ T workgroupUniformLoad<T>(__ref T v)
         return v;
     }
 }
+
+//
+// Pack/Unpack Math Intrinsics
+//
+// These were introduced in SM 6.6 but requirements are dropped to SM 5.0 here
+// to expose these intrinsics on targets that do not have SM 6.6 features.
+//
+
+//@hidden:
+[__readNone]
+[ForceInline]
+uint16_t __lsb_as_u16(uint32_t val) // Returns the low byte of `val` as a 16-bit unsigned value.
+{
+    return uint16_t(val & 0xFFU); // Mask to bits 0-7 before narrowing.
+}
+
+//@hidden:
+[__readNone]
+[ForceInline]
+uint32_t __lsb_as_u32(uint32_t val) // Returns the low byte of `val`, with bits 8-31 cleared.
+{
+    return (val & 0xFFU); // Keep bits 0-7 only.
+}
+
+//@hidden:
+[__readNone]
+[ForceInline]
+int8_t __lsb_as_s8(uint32_t val) // Returns the low byte of `val` converted to a signed 8-bit value.
+{
+    return int8_t(val & 0xFFU); // Keep bits 0-7; the int8_t cast makes bit 7 the sign bit.
+}
+
+//@hidden:
+[__readNone]
+[ForceInline]
+int16_t __lsb_as_s16(uint32_t val) // Returns the low byte of `val` as a signed value widened to 16 bits.
+{
+    return int16_t(__lsb_as_s8(val)); // Go through int8_t first so the widening sign-extends.
+}
+
+//@hidden:
+[__readNone]
+[ForceInline]
+int32_t __lsb_as_s32(uint32_t val) // Returns the low byte of `val` as a signed value widened to 32 bits.
+{
+    return int32_t(__lsb_as_s8(val)); // Go through int8_t first so the widening sign-extends.
+}
+
+//@hidden:
+[__readNone]
+[ForceInline]
+uint32_t __lsb_clamp_u8_as_u32(int32_t val) // Clamps `val` to [0, 255]; the result occupies only the low byte.
+{
+    return uint32_t(clamp(val, 0, 255)); // Explicit cast of the non-negative result, matching __lsb_clamp_s8_as_u32.
+}
+
+//@hidden:
+[__readNone]
+[ForceInline]
+uint32_t __lsb_clamp_s8_as_u32(int32_t val) // Clamps `val` to [-128, 127] and returns its low byte in bits 0-7.
+{
+    return (uint32_t(clamp(val, -128, 127)) & 0xFFU); // The mask clears bits 8-31, which are set for negative results.
+}
+
+//@public:
+/// Unpack 4 signed 8-bit values into a vector of 4 sign-extended 16 bit integers (lowest byte becomes `x`).
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int16_t4 unpack_s8s16(int8_t4_packed packed)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "unpack_s8s16"; // HLSL: emit the intrinsic of the same name.
+    case spirv: // SPIR-V: bitcast to four signed bytes, then widen with a signed convert.
+        return spirv_asm
+        {
+            %s8Vec = OpBitcast $$vector<int8_t, 4> $packed;
+            result:$$vector<int16_t, 4> = OpSConvert %s8Vec
+        };
+    default: // Other targets: shift each byte down and sign-extend it with the helper.
+        uint32_t packedValue = uint32_t(packed);
+        return int16_t4
+        (
+            __lsb_as_s16(packedValue),
+            __lsb_as_s16(packedValue >> 8U),
+            __lsb_as_s16(packedValue >> 16U),
+            __lsb_as_s16(packedValue >> 24U),
+        );
+    }
+}
+
+//@public:
+/// Unpack 4 unsigned 8-bit values into a vector of 4 zero-extended 16 bit integers (lowest byte becomes `x`).
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint16_t4 unpack_u8u16(uint8_t4_packed packed)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "unpack_u8u16"; // HLSL: emit the intrinsic of the same name.
+    case spirv: // SPIR-V: bitcast to four unsigned bytes, then widen with an unsigned convert.
+        return spirv_asm
+        {
+            %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed;
+            result:$$vector<uint16_t, 4> = OpUConvert %u8Vec
+        };
+    default: // Other targets: shift each byte down and mask it with the helper.
+        uint32_t packedValue = uint32_t(packed);
+        return uint16_t4
+        (
+            __lsb_as_u16(packedValue),
+            __lsb_as_u16(packedValue >> 8U),
+            __lsb_as_u16(packedValue >> 16U),
+            __lsb_as_u16(packedValue >> 24U),
+        );
+    }
+}
+
+//@public:
+/// Unpack 4 signed 8-bit values into a vector of 4 sign-extended 32 bit integers (lowest byte becomes `x`).
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int32_t4 unpack_s8s32(int8_t4_packed packed)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "unpack_s8s32"; // HLSL: emit the intrinsic of the same name.
+    case wgsl: __intrinsic_asm "unpack4xI8"; // WGSL: emit the `unpack4xI8` builtin.
+    case spirv: // SPIR-V: bitcast to four signed bytes, then widen with a signed convert.
+        return spirv_asm
+        {
+            %s8Vec = OpBitcast $$vector<int8_t, 4> $packed;
+            result:$$vector<int32_t, 4> = OpSConvert %s8Vec
+        };
+    default: // Other targets: shift each byte down and sign-extend it with the helper.
+        uint32_t packedValue = uint32_t(packed);
+        return int32_t4
+        (
+            __lsb_as_s32(packedValue),
+            __lsb_as_s32(packedValue >> 8U),
+            __lsb_as_s32(packedValue >> 16U),
+            __lsb_as_s32(packedValue >> 24U),
+        );
+    }
+}
+
+//@public:
+/// Unpack 4 unsigned 8-bit values into a vector of 4 zero-extended 32 bit integers (lowest byte becomes `x`).
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint32_t4 unpack_u8u32(uint8_t4_packed packed)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "unpack_u8u32"; // HLSL: emit the intrinsic of the same name.
+    case wgsl: __intrinsic_asm "unpack4xU8"; // WGSL: emit the `unpack4xU8` builtin.
+    case spirv: // SPIR-V: bitcast to four unsigned bytes, then widen with an unsigned convert.
+        return spirv_asm
+        {
+            %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed;
+            result:$$vector<uint32_t, 4> = OpUConvert %u8Vec
+        };
+    default: // Other targets: shift each byte down and mask it with the helper.
+        uint32_t packedValue = uint32_t(packed);
+        return uint32_t4
+        (
+            __lsb_as_u32(packedValue),
+            __lsb_as_u32(packedValue >> 8U),
+            __lsb_as_u32(packedValue >> 16U),
+            __lsb_as_u32(packedValue >> 24U),
+        );
+    }
+}
+
+//@public:
+/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers, keeping only the low 8 bits of each (`x` lands in the lowest byte).
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint8_t4_packed pack_u8(uint32_t4 unpackedValue)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "pack_u8"; // HLSL: emit the intrinsic of the same name.
+    case wgsl: __intrinsic_asm "pack4xU8"; // WGSL: emit the `pack4xU8` builtin.
+    default: // Other targets: mask each component to 8 bits and OR it in at bit offset 0/8/16/24.
+        return uint8_t4_packed
+        (
+            __lsb_as_u32(unpackedValue.x)
+            | (__lsb_as_u32(unpackedValue.y) << 8U)
+            | (__lsb_as_u32(unpackedValue.z) << 16U)
+            | (__lsb_as_u32(unpackedValue.w) << 24U)
+        );
+    }
+}
+
+//@public:
+/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, keeping only the low 8 bits of each (`x` lands in the lowest byte).
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int8_t4_packed pack_s8(int32_t4 unpackedValue)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "pack_s8"; // HLSL: emit the intrinsic of the same name.
+    case wgsl: __intrinsic_asm "pack4xI8"; // WGSL: emit the `pack4xI8` builtin.
+    default: // Other targets: the helper takes uint32_t, so each signed component is converted, masked to 8 bits, and OR-ed in.
+        return int8_t4_packed
+        (
+            __lsb_as_u32(unpackedValue.x)
+            | (__lsb_as_u32(unpackedValue.y) << 8U)
+            | (__lsb_as_u32(unpackedValue.z) << 16U)
+            | (__lsb_as_u32(unpackedValue.w) << 24U)
+        );
+    }
+}
+
+//@public:
+/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, keeping only the low 8 bits of each (`x` lands in the lowest byte).
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint8_t4_packed pack_u8(uint16_t4 unpackedValue)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "pack_u8"; // HLSL: emit the intrinsic of the same name.
+    default: // All other targets (there is no WGSL case here): mask each component to 8 bits and OR it in at bit offset 0/8/16/24.
+        return uint8_t4_packed
+        (
+            __lsb_as_u32(unpackedValue.x)
+            | (__lsb_as_u32(unpackedValue.y) << 8U)
+            | (__lsb_as_u32(unpackedValue.z) << 16U)
+            | (__lsb_as_u32(unpackedValue.w) << 24U)
+        );
+    }
+}
+
+//@public:
+/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, keeping only the low 8 bits of each (`x` lands in the lowest byte).
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int8_t4_packed pack_s8(int16_t4 unpackedValue)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "pack_s8"; // HLSL: emit the intrinsic of the same name.
+    default: // All other targets (there is no WGSL case here): the helper takes uint32_t, so each component is converted, masked to 8 bits, and OR-ed in.
+        return int8_t4_packed
+        (
+            __lsb_as_u32(unpackedValue.x)
+            | (__lsb_as_u32(unpackedValue.y) << 8U)
+            | (__lsb_as_u32(unpackedValue.z) << 16U)
+            | (__lsb_as_u32(unpackedValue.w) << 24U)
+        );
+    }
+}
+
+//@public:
+/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 unsigned 8-bit integers,
+/// clamping each value to the range [0, 255] (negative inputs become 0); `x` lands in the lowest byte.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint8_t4_packed pack_clamp_u8(int32_t4 unpackedValue)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "pack_clamp_u8"; // HLSL: emit the intrinsic of the same name.
+    case wgsl: __intrinsic_asm "pack4xU8Clamp(vec4<u32>(max($0, vec4<i32>(0))))"; // max() first: a bare i32 -> u32 conversion reinterprets bits, so negatives would clamp to 255 instead of 0.
+    default: // Other targets: clamp each component as a signed value, then OR it in at bit offset 0/8/16/24.
+        return uint8_t4_packed
+        (
+            __lsb_clamp_u8_as_u32(unpackedValue.x)
+            | (__lsb_clamp_u8_as_u32(unpackedValue.y) << 8U)
+            | (__lsb_clamp_u8_as_u32(unpackedValue.z) << 16U)
+            | (__lsb_clamp_u8_as_u32(unpackedValue.w) << 24U)
+        );
+    }
+}
+
+//@public:
+/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 signed 8-bit integers,
+/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits; `x` lands in the lowest byte.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int8_t4_packed pack_clamp_s8(int32_t4 unpackedValue)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "pack_clamp_s8"; // HLSL: emit the intrinsic of the same name.
+    case wgsl: __intrinsic_asm "pack4xI8Clamp"; // WGSL: emit the `pack4xI8Clamp` builtin.
+    default: // Other targets: clamp each component, keep its low byte, and OR it in at bit offset 0/8/16/24.
+        return int8_t4_packed
+        (
+            __lsb_clamp_s8_as_u32(unpackedValue.x)
+            | (__lsb_clamp_s8_as_u32(unpackedValue.y) << 8U)
+            | (__lsb_clamp_s8_as_u32(unpackedValue.z) << 16U)
+            | (__lsb_clamp_s8_as_u32(unpackedValue.w) << 24U)
+        );
+    }
+}
+
+//@public:
+/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 unsigned 8-bit integers,
+/// clamping each value to the range [0, 255] to ensure it fits within 8 bits; `x` lands in the lowest byte.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+uint8_t4_packed pack_clamp_u8(int16_t4 unpackedValue)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "pack_clamp_u8"; // HLSL: emit the intrinsic of the same name.
+    default: // All other targets (there is no WGSL case here): the helper takes int32_t, so each component is widened, clamped, and OR-ed in.
+        return uint8_t4_packed
+        (
+            __lsb_clamp_u8_as_u32(unpackedValue.x)
+            | (__lsb_clamp_u8_as_u32(unpackedValue.y) << 8U)
+            | (__lsb_clamp_u8_as_u32(unpackedValue.z) << 16U)
+            | (__lsb_clamp_u8_as_u32(unpackedValue.w) << 24U)
+        );
+    }
+}
+
+//@public:
+/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 signed 8-bit integers,
+/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits; `x` lands in the lowest byte.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+int8_t4_packed pack_clamp_s8(int16_t4 unpackedValue)
+{
+    __target_switch
+    {
+    case hlsl: __intrinsic_asm "pack_clamp_s8"; // HLSL: emit the intrinsic of the same name.
+    default: // All other targets (there is no WGSL case here): the helper takes int32_t, so each component is widened, clamped, and OR-ed in.
+        return int8_t4_packed
+        (
+            __lsb_clamp_s8_as_u32(unpackedValue.x)
+            | (__lsb_clamp_s8_as_u32(unpackedValue.y) << 8U)
+            | (__lsb_clamp_s8_as_u32(unpackedValue.z) << 16U)
+            | (__lsb_clamp_s8_as_u32(unpackedValue.w) << 24U)
+        );
+    }
+}
+
diff --git a/tests/hlsl-intrinsic/packed/pack-unpack.slang b/tests/hlsl-intrinsic/packed/pack-unpack.slang
new file mode 100644
index 000000000..b20e69fa8
--- /dev/null
+++ b/tests/hlsl-intrinsic/packed/pack-unpack.slang
@@ -0,0 +1,158 @@
+//TEST(compute):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj -render-feature hardware-device
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -profile cs_6_6 -dx12 -use-dxil -shaderobj -render-feature hardware-device
+//TEST(compute):COMPARE_COMPUTE_EX:-metal -compute -shaderobj 
+//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
+
+// 16 bit variants are not supported by WGSL.
+//TEST(compute):COMPARE_COMPUTE_EX:-wgpu -compute -shaderobj -xslang -DWGSL
+// Debug info for inlining errors can be given out, so disable them for this test.
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -g0
+
+//TEST_INPUT:ubuffer(data=[0xD37A83FF], stride=4):name unpackTestBuffer
+StructuredBuffer<uint32_t> unpackTestBuffer;
+
+//TEST_INPUT:ubuffer(data=[0xB3F 0x6A 0x123 0xD4], stride=4): name packTestBuffer
+StructuredBuffer<uint32_t4> packTestBuffer;
+
+// These should clamp to (5, 255, 255, 254) or (0x5, 0xFF, 0xFF, 0xFE).
+//TEST_INPUT:ubuffer(data=[5 256 12345 254], stride=4): name packClampUTestBuffer
+StructuredBuffer<int32_t4> packClampUTestBuffer;
+
+// These should clamp to (-1, 127, -128, -127) or (0xFF, 0x7F, 0x80, 0x81).
+// Inputs are [-1 250 -32768 -127].
+//TEST_INPUT:ubuffer(data=[0xFFFFFFFF 0xFA 0xFFFF8000 0xFFFFFF81], stride=4): name packClampSTestBuffer
+StructuredBuffer<int32_t4> packClampSTestBuffer;
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    const uint8_t4_packed unpackTestValue = uint8_t4_packed(unpackTestBuffer[0]);
+
+    uint index = 0; // Next slot of outputBuffer to write; 24 values are written in total.
+
+    /*
+     * Unpack without sign extension.
+     */
+    uint32_t4 unpackedU32 = unpack_u8u32(unpackTestValue);
+    // 0xFF
+    outputBuffer[index++] = uint(unpackedU32.x);
+    // 0x83
+    outputBuffer[index++] = uint(unpackedU32.y);
+    // 0x7A
+    outputBuffer[index++] = uint(unpackedU32.z);
+    // 0xD3
+    outputBuffer[index++] = uint(unpackedU32.w);
+
+#if !defined(WGSL)
+    uint16_t4 unpackedU16 = unpack_u8u16(unpackTestValue);
+    // 0xFF
+    outputBuffer[index++] = uint(unpackedU16.x);
+    // 0x83
+    outputBuffer[index++] = uint(unpackedU16.y);
+    // 0x7A
+    outputBuffer[index++] = uint(unpackedU16.z);
+    // 0xD3
+    outputBuffer[index++] = uint(unpackedU16.w);
+#else // The 16 bit variants are skipped for WGSL; write the expected values directly.
+    outputBuffer[index++] = 0xFFU;
+    outputBuffer[index++] = 0x83U;
+    outputBuffer[index++] = 0x7AU;
+    outputBuffer[index++] = 0xD3U;
+#endif
+
+    /*
+     * Unpack with sign extension.
+     */
+    int32_t4 unpackedS32 = unpack_s8s32(int8_t4_packed(unpackTestValue));
+    // 0xFFFFFFFF
+    outputBuffer[index++] = uint(unpackedS32.x);
+    // 0xFFFFFF83
+    outputBuffer[index++] = uint(unpackedS32.y);
+    // 0x7A
+    outputBuffer[index++] = uint(unpackedS32.z);
+    // 0xFFFFFFD3
+    outputBuffer[index++] = uint(unpackedS32.w);
+
+#if !defined(WGSL)
+    int16_t4 unpackedS16 = unpack_s8s16(int8_t4_packed(unpackTestValue));
+    // 0xFFFFFFFF
+    outputBuffer[index++] = uint(unpackedS16.x);
+    // 0xFFFFFF83
+    outputBuffer[index++] = uint(unpackedS16.y);
+    // 0x7A
+    outputBuffer[index++] = uint(unpackedS16.z);
+    // 0xFFFFFFD3
+    outputBuffer[index++] = uint(unpackedS16.w);
+#else
+    outputBuffer[index++] = 0xFFFFFFFFU;
+    outputBuffer[index++] = 0xFFFFFF83U;
+    outputBuffer[index++] = 0x7AU;
+    outputBuffer[index++] = 0xFFFFFFD3U;
+#endif
+
+
+    /*
+     * Pack without clamping, dropping unused bits.
+     */
+    uint32_t4 packU32TestValues = packTestBuffer[0];
+    int32_t4 packS32TestValues = packU32TestValues;
+    uint8_t4_packed packU32Result = pack_u8(packU32TestValues);
+    int8_t4_packed packS32Result = pack_s8(packS32TestValues);
+
+    // 0xD4236A3F (expected from both the unsigned and the signed pack)
+    outputBuffer[index++] = uint(packU32Result);
+    outputBuffer[index++] = uint(packS32Result);
+
+#if !defined(WGSL)
+    uint16_t4 packU16TestValues = uint16_t4(uint16_t(packU32TestValues.x), uint16_t(packU32TestValues.y),
+                                            uint16_t(packU32TestValues.z), uint16_t(packU32TestValues.w));
+    int16_t4 packS16TestValues = packU16TestValues;
+    uint8_t4_packed packU16Result = pack_u8(packU16TestValues);
+    int8_t4_packed packS16Result = pack_s8(packS16TestValues);
+
+    outputBuffer[index++] = uint(packU16Result);
+    outputBuffer[index++] = uint(packS16Result);
+#else
+    outputBuffer[index++] = 0xD4236A3F;
+    outputBuffer[index++] = 0xD4236A3F;
+#endif
+
+    /*
+     * Pack with unsigned clamping.
+     */
+    int32_t4 packClampU32TestValues = packClampUTestBuffer[0];
+    uint8_t4_packed packClampU32Result = pack_clamp_u8(packClampU32TestValues);
+    // 0xFEFFFF05
+    outputBuffer[index++] = uint(packClampU32Result);
+
+#if !defined(WGSL)
+    int16_t4 packClampU16TestValues = int16_t4(int16_t(packClampU32TestValues.x), int16_t(packClampU32TestValues.y),
+                                               int16_t(packClampU32TestValues.z), int16_t(packClampU32TestValues.w));
+    uint8_t4_packed packClampU16Result = pack_clamp_u8(packClampU16TestValues);
+    outputBuffer[index++] = uint(packClampU16Result);
+#else
+    outputBuffer[index++] = 0xFEFFFF05;
+#endif
+
+    /*
+     * Pack with signed clamping
+     */
+    int32_t4 packClampS32TestValues = packClampSTestBuffer[0];
+    int8_t4_packed packClampS32Result = pack_clamp_s8(packClampS32TestValues);
+    // 0x81807FFF
+    outputBuffer[index++] = uint(packClampS32Result);
+
+#if !defined(WGSL)
+    int16_t4 packClampS16TestValues = int16_t4(int16_t(packClampS32TestValues.x), int16_t(packClampS32TestValues.y),
+                                               int16_t(packClampS32TestValues.z), int16_t(packClampS32TestValues.w));
+    int8_t4_packed packClampS16Result = pack_clamp_s8(packClampS16TestValues);
+    outputBuffer[index++] = uint(packClampS16Result);
+#else
+    outputBuffer[index++] = 0x81807FFF;
+#endif
+}
+
diff --git a/tests/hlsl-intrinsic/packed/pack-unpack.slang.expected.txt b/tests/hlsl-intrinsic/packed/pack-unpack.slang.expected.txt
new file mode 100644
index 000000000..0527a39a1
--- /dev/null
+++ b/tests/hlsl-intrinsic/packed/pack-unpack.slang.expected.txt
@@ -0,0 +1,24 @@
+FF
+83
+7A
+D3
+FF
+83
+7A
+D3
+FFFFFFFF
+FFFFFF83
+7A
+FFFFFFD3
+FFFFFFFF
+FFFFFF83
+7A
+FFFFFFD3
+D4236A3F
+D4236A3F
+D4236A3F
+D4236A3F
+FEFFFF05
+FEFFFF05
+81807FFF
+81807FFF
author	Darren Wihandi <65404740+fairywreath@users.noreply.github.com>	2024-12-28 14:33:16 -0500
committer	GitHub <noreply@github.com>	2024-12-28 19:33:16 +0000
commit	7a6de4aea1973b379d9f3b7db248ad260d3ee024 (patch)
tree	081d4ef6ee9b91f7478ad55c3a6483e4a1004b37
parent	c4429bc33450be32ed82358c3974da58e5ec25ab (diff)