diff options
| author | Darren Wihandi <65404740+fairywreath@users.noreply.github.com> | 2025-02-28 16:23:29 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-02-28 13:23:29 -0800 |
| commit | efbfa7832afff7e6285713086259abda2456ed55 (patch) | |
| tree | d5a94eb66867d7f9dc01e4c1a25502443bf71040 /source/slang/hlsl.meta.slang | |
| parent | 618b4c7657f539e66f032cd40554798bc0d68f6d (diff) | |
Add Slang-specific intrinsics for integer pack/unpack (#6459)
* update hlsl meta
* update test
* use slang syntax in meta file
* improve meta file
* fix pack clamp u8
* remove builtin packed types, use typealias instead
* fix wgsl pack clamp
* fix formatting
---------
Co-authored-by: Yong He <yonghe@outlook.com>
Diffstat (limited to 'source/slang/hlsl.meta.slang')
| -rw-r--r-- | source/slang/hlsl.meta.slang | 411 |
1 files changed, 227 insertions, 184 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index c9f3fb533..a671a3dc4 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -17309,8 +17309,8 @@ uint dot4add_u8packed(uint x, uint y, uint acc) result:$$uint = OpIAdd %dotResult $acc; }; default: - uint4 vecX = unpack_u8u32(uint8_t4_packed(x)); - uint4 vecY = unpack_u8u32(uint8_t4_packed(y)); + uint4 vecX = unpackUint4x8ToUint32(x); + uint4 vecY = unpackUint4x8ToUint32(y); return dot(vecX, vecY) + acc; } } @@ -17337,8 +17337,8 @@ int dot4add_i8packed(uint x, uint y, int acc) result:$$int = OpIAdd %dotResult $acc; }; default: - int4 vecX = unpack_s8s32(int8_t4_packed(x)); - int4 vecY = unpack_s8s32(int8_t4_packed(y)); + int4 vecX = unpackInt4x8ToInt32(x); + int4 vecY = unpackInt4x8ToInt32(y); return dot(vecX, vecY) + acc; } } @@ -24035,383 +24035,426 @@ T workgroupUniformLoad<T>(__ref T v) } // -// Pack/Unpack Math Intrinsics +// HLSL Pack/Unpack Math Intrinsics // // These were introduced in SM 6.6 but requirements are dropped to SM 5.0 here // to expose these intrinsics on targets that do not have SM 6.6 features. // -//@hidden: +//@public: + +typealias uint8_t4_packed = uint; +typealias int8_t4_packed = uint; + +/// Unpack 4 signed 8-bit values into a vector of 16 bit integers. [__readNone] [ForceInline] -uint16_t __lsb_as_u16(uint32_t val) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int16_t4 unpack_s8s16(int8_t4_packed packed) { - return uint16_t(val & 0xFFU); + return unpackInt4x8ToInt16(packed); } -//@hidden: +/// Unpack 4 unsigned 8-bit values into a vector of 16 bit integers. [__readNone] [ForceInline] -uint32_t __lsb_as_u32(uint32_t val) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint16_t4 unpack_u8u16(uint8_t4_packed packed) { - return (val & 0xFFU); + return unpackUint4x8ToUint16(packed); } -//@hidden: +/// Unpack 4 signed 8-bit values into a vector of 32 bit integers. [__readNone] [ForceInline] -int8_t __lsb_as_s8(uint32_t val) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int32_t4 unpack_s8s32(int8_t4_packed packed) { - return int8_t(val & 0xFFU); + return unpackInt4x8ToInt32(packed); } -//@hidden: +/// Unpack 4 unsigned 8-bit values into a vector of 32 bit integers. [__readNone] [ForceInline] -int16_t __lsb_as_s16(uint32_t val) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint32_t4 unpack_u8u32(uint8_t4_packed packed) { - return int16_t(__lsb_as_s8(val)); + return unpackUint4x8ToUint32(packed); } -//@hidden: +/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits. [__readNone] [ForceInline] -int32_t __lsb_as_s32(uint32_t val) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint8_t4_packed pack_u8(uint32_t4 unpackedValue) { - return int32_t(__lsb_as_s8(val)); + return packUint4x8(unpackedValue); } -//@hidden: +/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int8_t4_packed pack_s8(int32_t4 unpackedValue) +{ + return packInt4x8(unpackedValue); +} + +/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint8_t4_packed pack_u8(uint16_t4 unpackedValue) +{ + return packUint4x8(unpackedValue); +} + +/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int8_t4_packed pack_s8(int16_t4 unpackedValue) +{ + return packInt4x8(unpackedValue); +} + +/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [0, 255] to ensure it fits within 8 bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint8_t4_packed pack_clamp_u8(int32_t4 unpackedValue) +{ + return packUint4x8Clamp(unpackedValue); +} + +/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int8_t4_packed pack_clamp_s8(int32_t4 unpackedValue) +{ + return packInt4x8Clamp(unpackedValue); +} + +/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [0, 255] to ensure it fits within 8 bits. [__readNone] [ForceInline] -uint32_t __lsb_clamp_u8_as_u32(int32_t val) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint8_t4_packed pack_clamp_u8(int16_t4 unpackedValue) { - return clamp(val, 0, 255); + return packUint4x8Clamp(unpackedValue); } +/// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits. +[__readNone] +[ForceInline] +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +int8_t4_packed pack_clamp_s8(int16_t4 unpackedValue) +{ + return packInt4x8Clamp(unpackedValue); +} + +// Work-graphs + +//@public: +/// read-only input to Broadcasting launch node. +__generic<T> +//TODO: DispatchNodeInputRecord should be available only for broadcasting node shader. +//[require(broadcasting_node)] +[require(spirv)] +struct DispatchNodeInputRecord +{ + /// Provide an access to a record object that only holds a single record. + NodePayloadPtr<T> Get() + { + int index = 0; + __target_switch + { + case spirv: + return spirv_asm + { + %in_payload_t = OpTypeNodePayloadArrayAMDX $$T; + %in_payload_ptr_t = OpTypePointer NodePayloadAMDX %in_payload_t; + %var = OpVariable %in_payload_ptr_t NodePayloadAMDX; + result : $$NodePayloadPtr<T> = OpAccessChain %var $index; + }; + } + } +}; + +// +// Pack/Unpack Intrinsics +// + //@hidden: + +[__readNone] +[ForceInline] +uint16_t __lsbAsUint16(uint32_t val) +{ + return uint16_t(val & 0xFFU); +} + +[__readNone] +[ForceInline] +uint32_t __lsbAsUint32(uint32_t val) +{ + return (val & 0xFFU); +} + +[__readNone] +[ForceInline] +int8_t __lsbAsInt8(uint32_t val) +{ + return int8_t(val); +} + [__readNone] [ForceInline] -uint32_t __lsb_clamp_s8_as_u32(int32_t val) +int16_t __lsbAsInt16(uint32_t val) { - return (uint32_t(clamp(val, -128, 127)) & 0xFFU); + return int16_t(__lsbAsInt8(val)); +} + +[__readNone] +[ForceInline] +int32_t __lsbAsInt32(uint32_t val) +{ + return int32_t(__lsbAsInt8(val)); } //@public: -/// Unpack 4 signed 8-bit values into a vector of 16 bit integers. + +/// Unpack 4 unsigned 8-bit values into a vector of 32 bit integers. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -int16_t4 unpack_s8s16(int8_t4_packed packed) +[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] +uint32_t4 unpackUint4x8ToUint32(uint packedValue) { __target_switch { - case hlsl: __intrinsic_asm "unpack_s8s16"; - case spirv: + case hlsl: __intrinsic_asm "unpack_u8u32"; + case wgsl: __intrinsic_asm "unpack4xU8"; + case spirv: return spirv_asm { - %s8Vec = OpBitcast $$vector<int8_t, 4> $packed; - result:$$vector<int16_t, 4> = OpSConvert %s8Vec + %u8Vec = OpBitcast $$vector<uint8_t, 4> $packedValue; + result:$$vector<uint32_t, 4> = OpUConvert %u8Vec }; default: - uint32_t packedValue = uint32_t(packed); - return int16_t4 + return uint32_t4 ( - __lsb_as_s16(packedValue), - __lsb_as_s16(packedValue >> 8U), - __lsb_as_s16(packedValue >> 16U), - __lsb_as_s16(packedValue >> 24U), + __lsbAsUint32(packedValue), + __lsbAsUint32(packedValue >> 8U), + __lsbAsUint32(packedValue >> 16U), + uint32_t(packedValue >> 24U), ); } } -//@public: /// Unpack 4 unsigned 8-bit values into a vector of 16 bit integers. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint16_t4 unpack_u8u16(uint8_t4_packed packed) +uint16_t4 unpackUint4x8ToUint16(uint packedValue) { __target_switch { case hlsl: __intrinsic_asm "unpack_u8u16"; - case spirv: + case spirv: return spirv_asm { - %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed; + %u8Vec = OpBitcast $$vector<uint8_t, 4> $packedValue; result:$$vector<uint16_t, 4> = OpUConvert %u8Vec }; default: - uint32_t packedValue = uint32_t(packed); return uint16_t4 ( - __lsb_as_u16(packedValue), - __lsb_as_u16(packedValue >> 8U), - __lsb_as_u16(packedValue >> 16U), - __lsb_as_u16(packedValue >> 24U), + __lsbAsUint16(packedValue), + __lsbAsUint16(packedValue >> 8U), + __lsbAsUint16(packedValue >> 16U), + uint16_t(packedValue >> 24U), ); } } -//@public: /// Unpack 4 signed 8-bit values into a vector of 32 bit integers. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -int32_t4 unpack_s8s32(int8_t4_packed packed) +int32_t4 unpackInt4x8ToInt32(uint packedValue) { __target_switch { case hlsl: __intrinsic_asm "unpack_s8s32"; case wgsl: __intrinsic_asm "unpack4xI8"; - case spirv: + case spirv: return spirv_asm { - %s8Vec = OpBitcast $$vector<int8_t, 4> $packed; + %s8Vec = OpBitcast $$vector<int8_t, 4> $packedValue; result:$$vector<int32_t, 4> = OpSConvert %s8Vec }; default: - uint32_t packedValue = uint32_t(packed); return int32_t4 ( - __lsb_as_s32(packedValue), - __lsb_as_s32(packedValue >> 8U), - __lsb_as_s32(packedValue >> 16U), - __lsb_as_s32(packedValue >> 24U), + __lsbAsInt32(packedValue), + __lsbAsInt32(packedValue >> 8U), + __lsbAsInt32(packedValue >> 16U), + int32_t(int8_t(packedValue >> 24U)), ); } } -//@public: -/// Unpack 4 unsigned 8-bit values into a vector of 32 bit integers. +/// Unpack 4 signed 8-bit values into a vector of 16 bit integers. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint32_t4 unpack_u8u32(uint8_t4_packed packed) +int16_t4 unpackInt4x8ToInt16(uint packedValue) { __target_switch { - case hlsl: __intrinsic_asm "unpack_u8u32"; - case wgsl: __intrinsic_asm "unpack4xU8"; - case spirv: + case hlsl: __intrinsic_asm "unpack_s8s16"; + case spirv: return spirv_asm { - %u8Vec = OpBitcast $$vector<uint8_t, 4> $packed; - result:$$vector<uint32_t, 4> = OpUConvert %u8Vec + %s8Vec = OpBitcast $$vector<int8_t, 4> $packedValue; + result:$$vector<int16_t, 4> = OpSConvert %s8Vec }; default: - uint32_t packedValue = uint32_t(packed); - return uint32_t4 + return int16_t4 ( - __lsb_as_u32(packedValue), - __lsb_as_u32(packedValue >> 8U), - __lsb_as_u32(packedValue >> 16U), - __lsb_as_u32(packedValue >> 24U), + __lsbAsInt16(packedValue), + __lsbAsInt16(packedValue >> 8U), + __lsbAsInt16(packedValue >> 16U), + int16_t(int8_t(packedValue >> 24U)), ); } } -//@public: /// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint8_t4_packed pack_u8(uint32_t4 unpackedValue) +uint packUint4x8(uint32_t4 unpackedValue) { __target_switch { case hlsl: __intrinsic_asm "pack_u8"; case wgsl: __intrinsic_asm "pack4xU8"; default: - return uint8_t4_packed - ( - __lsb_as_u32(unpackedValue.x) - | (__lsb_as_u32(unpackedValue.y) << 8U) - | (__lsb_as_u32(unpackedValue.z) << 16U) - | (__lsb_as_u32(unpackedValue.w) << 24U) - ); + return __lsbAsUint32(unpackedValue.x) + | (__lsbAsUint32(unpackedValue.y) << 8U) + | (__lsbAsUint32(unpackedValue.z) << 16U) + | (unpackedValue.w << 24U); } } -//@public: -/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits. +/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -int8_t4_packed pack_s8(int32_t4 unpackedValue) +uint packUint4x8(uint16_t4 unpackedValue) { __target_switch { - case hlsl: __intrinsic_asm "pack_s8"; - case wgsl: __intrinsic_asm "pack4xI8"; + case hlsl: __intrinsic_asm "pack_u8"; default: - return int8_t4_packed - ( - __lsb_as_u32(unpackedValue.x) - | (__lsb_as_u32(unpackedValue.y) << 8U) - | (__lsb_as_u32(unpackedValue.z) << 16U) - | (__lsb_as_u32(unpackedValue.w) << 24U) - ); + return packUint4x8(uint32_t4(unpackedValue)); } } -//@public: -/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits. +/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, dropping unused bits. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint8_t4_packed pack_u8(uint16_t4 unpackedValue) +uint packInt4x8(int32_t4 unpackedValue) { __target_switch { - case hlsl: __intrinsic_asm "pack_u8"; + case hlsl: __intrinsic_asm "pack_s8"; + case wgsl: __intrinsic_asm "pack4xI8"; default: - return uint8_t4_packed - ( - __lsb_as_u32(unpackedValue.x) - | (__lsb_as_u32(unpackedValue.y) << 8U) - | (__lsb_as_u32(unpackedValue.z) << 16U) - | (__lsb_as_u32(unpackedValue.w) << 24U) - ); + return packUint4x8(uint32_t4(unpackedValue)); } } -//@public: /// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, dropping unused bits. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -int8_t4_packed pack_s8(int16_t4 unpackedValue) +uint packInt4x8(int16_t4 unpackedValue) { __target_switch { case hlsl: __intrinsic_asm "pack_s8"; default: - return int8_t4_packed - ( - __lsb_as_u32(unpackedValue.x) - | (__lsb_as_u32(unpackedValue.y) << 8U) - | (__lsb_as_u32(unpackedValue.z) << 16U) - | (__lsb_as_u32(unpackedValue.w) << 24U) - ); + return packUint4x8(uint32_t4(unpackedValue)); } } -//@public: -/// Pack a vector of 4 unsigned 32 bit integers into a packed value of 4 8-bit integers, -/// clamping each value to the range [0, 255] to ensure it fits within 8 bits. +/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint8_t4_packed pack_clamp_u8(int32_t4 unpackedValue) +uint packUint4x8Clamp(int32_t4 unpackedValue) { __target_switch { case hlsl: __intrinsic_asm "pack_clamp_u8"; case wgsl: __intrinsic_asm "pack4xU8Clamp(vec4<u32>($0))"; default: - return uint8_t4_packed - ( - __lsb_clamp_u8_as_u32(unpackedValue.x) - | (__lsb_clamp_u8_as_u32(unpackedValue.y) << 8U) - | (__lsb_clamp_u8_as_u32(unpackedValue.z) << 16U) - | (__lsb_clamp_u8_as_u32(unpackedValue.w) << 24U) - ); + return packInt4x8(clamp(unpackedValue, 0, 255)); } } -//@public: -/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, -/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits. +/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [0, 255] to ensure it fits within 8 bits. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -int8_t4_packed pack_clamp_s8(int32_t4 unpackedValue) +uint packUint4x8Clamp(int16_t4 unpackedValue) { __target_switch { - case hlsl: __intrinsic_asm "pack_clamp_s8"; - case wgsl: __intrinsic_asm "pack4xI8Clamp"; + case hlsl: __intrinsic_asm "pack_clamp_u8"; default: - return int8_t4_packed - ( - __lsb_clamp_s8_as_u32(unpackedValue.x) - | (__lsb_clamp_s8_as_u32(unpackedValue.y) << 8U) - | (__lsb_clamp_s8_as_u32(unpackedValue.z) << 16U) - | (__lsb_clamp_s8_as_u32(unpackedValue.w) << 24U) - ); + return packInt4x8(clamp(unpackedValue, 0, 255)); } } -//@public: -/// Pack a vector of 4 unsigned 16 bit integers into a packed value of 4 8-bit integers, -/// clamping each value to the range [0, 255] to ensure it fits within 8 bits. +/// Pack a vector of 4 signed 32 bit integers into a packed value of 4 8-bit integers, +/// clamping each value to the range [-128, 127] to ensure it fits within 8 bits. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint8_t4_packed pack_clamp_u8(int16_t4 unpackedValue) +uint packInt4x8Clamp(int32_t4 unpackedValue) { __target_switch { - case hlsl: __intrinsic_asm "pack_clamp_u8"; + case hlsl: __intrinsic_asm "pack_clamp_s8"; + case wgsl: __intrinsic_asm "pack4xI8Clamp"; default: - return uint8_t4_packed - ( - __lsb_clamp_u8_as_u32(unpackedValue.x) - | (__lsb_clamp_u8_as_u32(unpackedValue.y) << 8U) - | (__lsb_clamp_u8_as_u32(unpackedValue.z) << 16U) - | (__lsb_clamp_u8_as_u32(unpackedValue.w) << 24U) - ); + return packInt4x8(clamp(unpackedValue, -128, 127)); } } -//@public: /// Pack a vector of 4 signed 16 bit integers into a packed value of 4 8-bit integers, /// clamping each value to the range [-128, 127] to ensure it fits within 8 bits. [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -int8_t4_packed pack_clamp_s8(int16_t4 unpackedValue) +uint packInt4x8Clamp(int16_t4 unpackedValue) { __target_switch { case hlsl: __intrinsic_asm "pack_clamp_s8"; default: - return int8_t4_packed - ( - __lsb_clamp_s8_as_u32(unpackedValue.x) - | (__lsb_clamp_s8_as_u32(unpackedValue.y) << 8U) - | (__lsb_clamp_s8_as_u32(unpackedValue.z) << 16U) - | (__lsb_clamp_s8_as_u32(unpackedValue.w) << 24U) - ); + return packInt4x8(clamp(unpackedValue, -128, 127)); } } - -// Work-graphs - -//@public: -/// read-only input to Broadcasting launch node. -__generic<T> -//TODO: DispatchNodeInputRecord should be available only for broadcasting node shader. -//[require(broadcasting_node)] -[require(spirv)] -struct DispatchNodeInputRecord -{ - /// Provide an access to a record object that only holds a single record. - NodePayloadPtr<T> Get() - { - int index = 0; - __target_switch - { - case spirv: - return spirv_asm - { - %in_payload_t = OpTypeNodePayloadArrayAMDX $$T; - %in_payload_ptr_t = OpTypePointer NodePayloadAMDX %in_payload_t; - %var = OpVariable %in_payload_ptr_t NodePayloadAMDX; - result : $$NodePayloadPtr<T> = OpAccessChain %var $index; - }; - } - } -}; - |
