diff options
| author | Darren Wihandi <65404740+fairywreath@users.noreply.github.com> | 2024-12-28 14:33:16 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-12-28 19:33:16 +0000 |
| commit | 7a6de4aea1973b379d9f3b7db248ad260d3ee024 (patch) | |
| tree | 081d4ef6ee9b91f7478ad55c3a6483e4a1004b37 | |
| parent | c4429bc33450be32ed82358c3974da58e5ec25ab (diff) | |
Implement HLSL pack/unpack math intrinsics (#5934)
| -rw-r--r-- | source/slang/hlsl.meta.slang | 354 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/packed/pack-unpack.slang | 158 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/packed/pack-unpack.slang.expected.txt | 24 |
3 files changed, 536 insertions, 0 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 85a81eabf..774c6a247 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -20806,3 +20806,357 @@ T workgroupUniformLoad<T>(__ref T v) return v; } } + +// +// Pack/Unpack Math Intrinsics +// +// These were introduced in SM 6.6 but requirements are dropped to SM 5.0 here +// to expose these intrinsics on targets that do not have SM 6.6 features. +// + +//@hidden: +[__readNone] +[ForceInline] +uint16_t __lsb_as_u16(uint32_t val) +{ + return uint16_t(val & 0xFFU); +} + +//@hidden: +[__readNone] +[ForceInline] +uint32_t __lsb_as_u32(uint32_t val) +{ + return (val & 0xFFU); +} + +//@hidden: +[__readNone] +[ForceInline] +int8_t __lsb_as_s8(uint32_t val) +{ + return int8_t(val & 0xFFU); +} + +//@hidden: +[__readNone] +[ForceInline] +int16_t __lsb_as_s16(uint32_t val) +{ + return int16_t(__lsb_as_s8(val)); +} + +//@hidden: +[__readNone] +[ForceInline] +int32_t __lsb_as_s32(uint32_t val) +{ + return int32_t(__lsb_as_s8(val)); +} + +//@hidden: +[__readNone] +[ForceInline] +uint32_t __lsb_clamp_u8_as_u32(int32_t val) +{ + return clamp(val, 0, 255); +} + +//@hidden: +[__readNone] +[ForceInline] +uint32_t __lsb_clamp_s8_as_u32(int32_t val) +{ + return (uint32_t(clamp(val, -128, 127)) & 0xFFU); +} + +//@public: +/// Unpack 4 signed 8-bit values into a vector of 16 bit integers. 
+[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int16_t4 unpack_s8s16(int8_t4_packed packed) +{ + __target_switch + { + case hlsl: __intrinsic_asm "unpack_s8s16"; + case spirv: + return spirv_asm + { + %s8Vec = OpBitcast $$vector<int8_t, 4> $packed; + result:$$vector<int16_t, 4> = OpSConvert %s8Vec + }; + default: + uint32_t packedValue = uint32_t(packed); + return int16_t4 + ( + __lsb_as_s16(packedValue), + __lsb_as_s16(packedValue >> 8U), + __lsb_as_s16(packedValue >> 16U), + __lsb_as_s16(packedValue >> 24U), + ); + } +} + +//@public: +/// Unpack 4 unsigned 8-bit values into a vector of 16 bit integers. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint16_t4 unpack_u8u16(uint8_t4_packed packed) +{ + __target_switch + { + case hlsl: __intrinsic_asm "unpack_u8u16"; + case spirv: + return spirv_asm + { + %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed; + result:$$vector<uint16_t, 4> = OpUConvert %u8Vec + }; + default: + uint32_t packedValue = uint32_t(packed); + return uint16_t4 + ( + __lsb_as_u16(packedValue), + __lsb_as_u16(packedValue >> 8U), + __lsb_as_u16(packedValue >> 16U), + __lsb_as_u16(packedValue >> 24U), + ); + } +} + +//@public: +/// Unpack 4 signed 8-bit values into a vector of 32 bit integers. 
+[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int32_t4 unpack_s8s32(int8_t4_packed packed) +{ + __target_switch + { + case hlsl: __intrinsic_asm "unpack_s8s32"; + case wgsl: __intrinsic_asm "unpack4xI8"; + case spirv: + return spirv_asm + { + %s8Vec = OpBitcast $$vector<int8_t, 4> $packed; + result:$$vector<int32_t, 4> = OpSConvert %s8Vec + }; + default: + uint32_t packedValue = uint32_t(packed); + return int32_t4 + ( + __lsb_as_s32(packedValue), + __lsb_as_s32(packedValue >> 8U), + __lsb_as_s32(packedValue >> 16U), + __lsb_as_s32(packedValue >> 24U), + ); + } +} + +//@public: +/// Unpack 4 unsigned 8-bit values into a vector of 32 bit integers. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint32_t4 unpack_u8u32(uint8_t4_packed packed) +{ + __target_switch + { + case hlsl: __intrinsic_asm "unpack_u8u32"; + case wgsl: __intrinsic_asm "unpack4xU8"; + case spirv: + return spirv_asm + { + %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed; + result:$$vector<uint32_t, 4> = OpUConvert %u8Vec + }; + default: + uint32_t packedValue = uint32_t(packed); + return uint32_t4 + ( + __lsb_as_u32(packedValue), + __lsb_as_u32(packedValue >> 8U), + __lsb_as_u32(packedValue >> 16U), + __lsb_as_u32(packedValue >> 24U), + ); + } +} + +//@public: +/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits. 
+[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint8_t4_packed pack_u8(uint32_t4 unpackedValue) +{ + __target_switch + { + case hlsl: __intrinsic_asm "pack_u8"; + case wgsl: __intrinsic_asm "pack4xU8"; + default: + return uint8_t4_packed + ( + __lsb_as_u32(unpackedValue.x) + | (__lsb_as_u32(unpackedValue.y) << 8U) + | (__lsb_as_u32(unpackedValue.z) << 16U) + | (__lsb_as_u32(unpackedValue.w) << 24U) + ); + } +} + +//@public: +/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int8_t4_packed pack_s8(int32_t4 unpackedValue) +{ + __target_switch + { + case hlsl: __intrinsic_asm "pack_s8"; + case wgsl: __intrinsic_asm "pack4xI8"; + default: + return int8_t4_packed + ( + __lsb_as_u32(unpackedValue.x) + | (__lsb_as_u32(unpackedValue.y) << 8U) + | (__lsb_as_u32(unpackedValue.z) << 16U) + | (__lsb_as_u32(unpackedValue.w) << 24U) + ); + } +} + +//@public: +/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint8_t4_packed pack_u8(uint16_t4 unpackedValue) +{ + __target_switch + { + case hlsl: __intrinsic_asm "pack_u8"; + default: + return uint8_t4_packed + ( + __lsb_as_u32(unpackedValue.x) + | (__lsb_as_u32(unpackedValue.y) << 8U) + | (__lsb_as_u32(unpackedValue.z) << 16U) + | (__lsb_as_u32(unpackedValue.w) << 24U) + ); + } +} + +//@public: +/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits. 
+[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int8_t4_packed pack_s8(int16_t4 unpackedValue) +{ + __target_switch + { + case hlsl: __intrinsic_asm "pack_s8"; + default: + return int8_t4_packed + ( + __lsb_as_u32(unpackedValue.x) + | (__lsb_as_u32(unpackedValue.y) << 8U) + | (__lsb_as_u32(unpackedValue.z) << 16U) + | (__lsb_as_u32(unpackedValue.w) << 24U) + ); + } +} + +//@public: +/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 unsigned 8-bit integers, +/// clamping each value to the range [0, 255] (negative inputs become 0) to ensure it fits within 8 bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint8_t4_packed pack_clamp_u8(int32_t4 unpackedValue) +{ + __target_switch + { + case hlsl: __intrinsic_asm "pack_clamp_u8"; + case wgsl: __intrinsic_asm "pack4xU8Clamp(vec4<u32>(max($0, vec4<i32>(0))))"; + default: + return uint8_t4_packed + ( + __lsb_clamp_u8_as_u32(unpackedValue.x) + | (__lsb_clamp_u8_as_u32(unpackedValue.y) << 8U) + | (__lsb_clamp_u8_as_u32(unpackedValue.z) << 16U) + | (__lsb_clamp_u8_as_u32(unpackedValue.w) << 24U) + ); + } +} + +//@public: +/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits. 
+[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int8_t4_packed pack_clamp_s8(int32_t4 unpackedValue) +{ + __target_switch + { + case hlsl: __intrinsic_asm "pack_clamp_s8"; + case wgsl: __intrinsic_asm "pack4xI8Clamp"; + default: + return int8_t4_packed + ( + __lsb_clamp_s8_as_u32(unpackedValue.x) + | (__lsb_clamp_s8_as_u32(unpackedValue.y) << 8U) + | (__lsb_clamp_s8_as_u32(unpackedValue.z) << 16U) + | (__lsb_clamp_s8_as_u32(unpackedValue.w) << 24U) + ); + } +} + +//@public: +/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [0, 255] to ensure it fits within 8 bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint8_t4_packed pack_clamp_u8(int16_t4 unpackedValue) +{ + __target_switch + { + case hlsl: __intrinsic_asm "pack_clamp_u8"; + default: + return uint8_t4_packed + ( + __lsb_clamp_u8_as_u32(unpackedValue.x) + | (__lsb_clamp_u8_as_u32(unpackedValue.y) << 8U) + | (__lsb_clamp_u8_as_u32(unpackedValue.z) << 16U) + | (__lsb_clamp_u8_as_u32(unpackedValue.w) << 24U) + ); + } +} + +//@public: +/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits. 
+[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int8_t4_packed pack_clamp_s8(int16_t4 unpackedValue) +{ + __target_switch + { + case hlsl: __intrinsic_asm "pack_clamp_s8"; + default: + return int8_t4_packed + ( + __lsb_clamp_s8_as_u32(unpackedValue.x) + | (__lsb_clamp_s8_as_u32(unpackedValue.y) << 8U) + | (__lsb_clamp_s8_as_u32(unpackedValue.z) << 16U) + | (__lsb_clamp_s8_as_u32(unpackedValue.w) << 24U) + ); + } +} + diff --git a/tests/hlsl-intrinsic/packed/pack-unpack.slang b/tests/hlsl-intrinsic/packed/pack-unpack.slang new file mode 100644 index 000000000..b20e69fa8 --- /dev/null +++ b/tests/hlsl-intrinsic/packed/pack-unpack.slang @@ -0,0 +1,158 @@ +//TEST(compute):COMPARE_COMPUTE_EX:-vk -compute -shaderobj +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj -render-feature hardware-device +//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -profile cs_6_6 -dx12 -use-dxil -shaderobj -render-feature hardware-device +//TEST(compute):COMPARE_COMPUTE_EX:-metal -compute -shaderobj +//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -shaderobj + +// 16 bit variants are not supported by WGSL. +//TEST(compute):COMPARE_COMPUTE_EX:-wgpu -compute -shaderobj -xslang -DWGSL +// Debug info for inlining errors can be given out, so disable them for this test. +//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -g0 + +//TEST_INPUT:ubuffer(data=[0xD37A83FF], stride=4):name unpackTestBuffer +StructuredBuffer<uint32_t> unpackTestBuffer; + +//TEST_INPUT:ubuffer(data=[0xB3F 0x6A 0x123 0xD4], stride=4): name packTestBuffer +StructuredBuffer<uint32_t4> packTestBuffer; + +// These should clamp to (5, 255, 255, 254) or (0x5, 0xFF, 0xFF, 0xFE). +//TEST_INPUT:ubuffer(data=[5 256 12345 254], stride=4): name packClampUTestBuffer +StructuredBuffer<int32_t4> packClampUTestBuffer; + +// These should clamp to (-1, 127, -128, -127) or (0xFF, 0x7F, 0x80, 0x81) +// Inputs are [-1 250 -32768 -127]. 
+//TEST_INPUT:ubuffer(data=[0xFFFFFFFF 0xFA 0xFFFF8000 0xFFFFFF81], stride=4): name packClampSTestBuffer +StructuredBuffer<int32_t4> packClampSTestBuffer; + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<uint> outputBuffer; + +[numthreads(1, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + const uint8_t4_packed unpackTestValue = uint8_t4_packed(unpackTestBuffer[0]); + + uint index = 0; + + /* + * Unpack without sign extension. + */ + uint32_t4 unpackedU32 = unpack_u8u32(unpackTestValue); + // 0xFF + outputBuffer[index++] = uint(unpackedU32.x); + // 0x83 + outputBuffer[index++] = uint(unpackedU32.y); + // 0x7A + outputBuffer[index++] = uint(unpackedU32.z); + // 0xD3 + outputBuffer[index++] = uint(unpackedU32.w); + +#if !defined(WGSL) + uint16_t4 unpackedU16 = unpack_u8u16(unpackTestValue); + // 0xFF + outputBuffer[index++] = uint(unpackedU16.x); + // 0x83 + outputBuffer[index++] = uint(unpackedU16.y); + // 0x7A + outputBuffer[index++] = uint(unpackedU16.z); + // 0xD3 + outputBuffer[index++] = uint(unpackedU16.w); +#else + outputBuffer[index++] = 0xFFU; + outputBuffer[index++] = 0x83U; + outputBuffer[index++] = 0x7AU; + outputBuffer[index++] = 0xD3U; +#endif + + /* + * Unpack with sign extension. 
+ */ + int32_t4 unpackedS32 = unpack_s8s32(int8_t4_packed(unpackTestValue)); + // 0xFFFFFFFF + outputBuffer[index++] = uint(unpackedS32.x); + // 0xFFFFFF83 + outputBuffer[index++] = uint(unpackedS32.y); + // 0x7A + outputBuffer[index++] = uint(unpackedS32.z); + // 0xFFFFFFD3 + outputBuffer[index++] = uint(unpackedS32.w); + +#if !defined(WGSL) + int16_t4 unpackedS16 = unpack_s8s16(int8_t4_packed(unpackTestValue)); + // 0xFFFFFFFF + outputBuffer[index++] = uint(unpackedS16.x); + // 0xFFFFFF83 + outputBuffer[index++] = uint(unpackedS16.y); + // 0x7A + outputBuffer[index++] = uint(unpackedS16.z); + // 0xFFFFFFD3 + outputBuffer[index++] = uint(unpackedS16.w); +#else + outputBuffer[index++] = 0xFFFFFFFFU; + outputBuffer[index++] = 0xFFFFFF83U; + outputBuffer[index++] = 0x7AU; + outputBuffer[index++] = 0xFFFFFFD3U; +#endif + + + /* + * Pack without clamping, dropping unused bits. + */ + uint32_t4 packU32TestValues = packTestBuffer[0]; + int32_t4 packS32TestValues = packU32TestValues; + uint8_t4_packed packU32Result = pack_u8(packU32TestValues); + int8_t4_packed packS32Result = pack_s8(packS32TestValues); + + // 0xD4236A3F + outputBuffer[index++] = uint(packU32Result); + outputBuffer[index++] = uint(packS32Result); + +#if !defined(WGSL) + uint16_t4 packU16TestValues = int16_t4(int16_t(packU32TestValues.x), int16_t(packU32TestValues.y), + int16_t(packU32TestValues.z), int16_t(packU32TestValues.w)); + int16_t4 packS16TestValues = packU16TestValues; + uint8_t4_packed packU16Result = pack_u8(packU16TestValues); + int8_t4_packed packS16Result = pack_s8(packS16TestValues); + + outputBuffer[index++] = uint(packU16Result); + outputBuffer[index++] = uint(packS16Result); +#else + outputBuffer[index++] = 0xD4236A3F; + outputBuffer[index++] = 0xD4236A3F; +#endif + + /* + * Pack with unsigned clamping. 
+ */ + int32_t4 packClampU32TestValues = packClampUTestBuffer[0]; + uint8_t4_packed packClampU32Result = pack_clamp_u8(packClampU32TestValues); + // 0xFEFFFF05 + outputBuffer[index++] = uint(packClampU32Result); + +#if !defined(WGSL) + int16_t4 packClampU16TestValues = int16_t4(int16_t(packClampU32TestValues.x), int16_t(packClampU32TestValues.y), + int16_t(packClampU32TestValues.z), int16_t(packClampU32TestValues.w)); + uint8_t4_packed packClampU16Result = pack_clamp_u8(packClampU16TestValues); + outputBuffer[index++] = uint(packClampU16Result); +#else + outputBuffer[index++] = 0xFEFFFF05; +#endif + + /* + * Pack with signed clamping + */ + int32_t4 packClampS32TestValues = packClampSTestBuffer[0]; + int8_t4_packed packClampS32Result = pack_clamp_s8(packClampS32TestValues); + // 0x81807FFF + outputBuffer[index++] = uint(packClampS32Result); + +#if !defined(WGSL) + int16_t4 packClampS16TestValues = int16_t4(int16_t(packClampS32TestValues.x), int16_t(packClampS32TestValues.y), + int16_t(packClampS32TestValues.z), int16_t(packClampS32TestValues.w)); + int8_t4_packed packClampS16Result = pack_clamp_s8(packClampS16TestValues); + outputBuffer[index++] = uint(packClampS16Result); +#else + outputBuffer[index++] = 0x81807FFF; +#endif +} + diff --git a/tests/hlsl-intrinsic/packed/pack-unpack.slang.expected.txt b/tests/hlsl-intrinsic/packed/pack-unpack.slang.expected.txt new file mode 100644 index 000000000..0527a39a1 --- /dev/null +++ b/tests/hlsl-intrinsic/packed/pack-unpack.slang.expected.txt @@ -0,0 +1,24 @@ +FF +83 +7A +D3 +FF +83 +7A +D3 +FFFFFFFF +FFFFFF83 +7A +FFFFFFD3 +FFFFFFFF +FFFFFF83 +7A +FFFFFFD3 +D4236A3F +D4236A3F +D4236A3F +D4236A3F +FEFFFF05 +FEFFFF05 +81807FFF +81807FFF |
