diff options
| author | sricker-nvidia <115114531+sricker-nvidia@users.noreply.github.com> | 2025-04-19 04:33:27 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-04-19 11:33:27 +0000 |
| commit | 043278a527ab5744674417a08d924c67a60a486b (patch) | |
| tree | 19c3ead87def94f2d418926d5f15b9eab1ced440 | |
| parent | 6bfabfee317887e678eed9cd6768df2ffd3b9704 (diff) | |
Implement 64bit countbits intrinsic (#6433) (#6845)
This change makes the countbits intrinsic generic over the built-in integer
types so that 64-bit countbits is available on platforms that support it.
On platforms where 64-bit countbits is not natively supported (Metal and SPIR-V),
we emulate it by bit-casting the 64-bit value to a uint2 and summing the bit counts of the two halves.
This should align with the implementation of other uint64_t
intrinsics such as abs, min, max and clamp.
Added new countbits64 test to verify changes.
Updated documentation in docs/64bit-type-support.md
| -rw-r--r-- | docs/64bit-type-support.md | 2 | ||||
| -rw-r--r-- | prelude/slang-cpp-scalar-intrinsics.h | 25 | ||||
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 10 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 51 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/countbits64.slang | 44 |
5 files changed, 121 insertions, 11 deletions
diff --git a/docs/64bit-type-support.md b/docs/64bit-type-support.md index 15faccd86..5f54f6601 100644 --- a/docs/64bit-type-support.md +++ b/docs/64bit-type-support.md @@ -147,7 +147,7 @@ D3D12 | FXC/DXBC | No | No | 2 2) uint64_t support requires https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12, so DXBC is not a target. The intrinsics available on `uint64_t` type are `abs`, `min`, `max`, `clamp` and `countbits`. -The intrinsics available on `uint64_t` type are `abs`, `min`, `max` and `clamp`. +The intrinsics available on `int64_t` type are `abs`, `min`, `max`, `clamp` and `countbits`. GLSL ==== diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h index 22b5e12e4..0a19eb327 100644 --- a/prelude/slang-cpp-scalar-intrinsics.h +++ b/prelude/slang-cpp-scalar-intrinsics.h @@ -661,6 +661,23 @@ SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi) return u.d; } +SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v) +{ +#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM) + return __builtin_popcount(uint32_t(v)); +#elif SLANG_PROCESSOR_X86_64 && SLANG_VC + return __popcnt(uint32_t(v)); +#else + uint32_t c = 0; + while (v) + { + c++; + v &= v - 1; + } + return c; +#endif +} + // ----------------------------- U32 ----------------------------------------- SLANG_FORCE_INLINE uint32_t U32_abs(uint32_t f) @@ -729,9 +746,6 @@ SLANG_FORCE_INLINE uint64_t U64_max(uint64_t a, uint64_t b) return a > b ? a : b; } -// TODO(JS): We don't define countbits for 64bit in the core module currently. -// It's not clear from documentation if it should return 32 or 64 bits, if it exists. -// 32 bits can always hold the result, and will be implicitly promoted. SLANG_FORCE_INLINE uint32_t U64_countbits(uint64_t v) { #if SLANG_GCC_FAMILY && !defined(SLANG_LLVM) @@ -765,6 +779,11 @@ SLANG_FORCE_INLINE int64_t I64_max(int64_t a, int64_t b) return a > b ? 
a : b; } +SLANG_FORCE_INLINE uint32_t I64_countbits(int64_t v) +{ + return U64_countbits(uint64_t(v)); +} + // ----------------------------- UPTR ----------------------------------------- SLANG_FORCE_INLINE uintptr_t UPTR_abs(uintptr_t f) diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index 5585ad6e0..38e018e3e 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -1823,6 +1823,11 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi) return u.d; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_countbits(int32_t v) +{ + return __popc(uint32_t(v)); +} + // ----------------------------- U32 ----------------------------------------- // Unary @@ -1882,6 +1887,11 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b) return a > b ? a : b; } +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I64_countbits(int64_t v) +{ + return __popcll(uint64_t(v)); +} + // ----------------------------- U64 ----------------------------------------- SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_abs(uint64_t f) diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index e71997c6c..6b1a4579f 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -8028,7 +8028,8 @@ vector<T,N> cospi(vector<T,N> x) [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -uint countbits(uint value) +__generic<T : __BuiltinIntegerType> +uint countbits(T value) { __target_switch { @@ -8037,22 +8038,42 @@ uint countbits(uint value) case glsl: __intrinsic_asm "bitCount"; case metal: - __intrinsic_asm "popcount"; + if(T is int64_t || T is uint64_t) + { + // emulate 64-bit + uint2 value_uint2 = bit_cast<uint2>(value); + uint2 counted_bits_uint2 = countbits(value_uint2); + return counted_bits_uint2.x + counted_bits_uint2.y; + } + else + { + __intrinsic_asm "popcount"; + } case cuda: case cpp: __intrinsic_asm "$P_countbits($0)"; case spirv: - 
return spirv_asm {OpBitCount $$uint result $value}; + if(T is int64_t || T is uint64_t) + { + // emulate 64-bit + uint2 value_uint2 = bit_cast<uint2>(value); + uint2 counted_bits_uint2 = countbits(value_uint2); + return counted_bits_uint2.x + counted_bits_uint2.y; + } + else + { + return spirv_asm {OpBitCount $$uint result $value}; + } case wgsl: __intrinsic_asm "countOneBits"; } } -__generic <let N : int> +__generic<T : __BuiltinIntegerType, let N : int> [__readNone] [ForceInline] [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)] -vector<uint, N> countbits(vector<uint, N> value) +vector<uint, N> countbits(vector<T, N> value) { __target_switch { @@ -8061,9 +8082,25 @@ vector<uint, N> countbits(vector<uint, N> value) case glsl: __intrinsic_asm "bitCount"; case metal: - __intrinsic_asm "popcount"; + if(T is int64_t || T is uint64_t) + { + // emulate 64-bit + VECTOR_MAP_UNARY(uint, N, countbits, value); + } + else + { + __intrinsic_asm "popcount"; + } case spirv: - return spirv_asm {OpBitCount $$vector<uint, N> result $value}; + if(T is int64_t || T is uint64_t) + { + // emulate 64-bit + VECTOR_MAP_UNARY(uint, N, countbits, value); + } + else + { + return spirv_asm {OpBitCount $$vector<uint, N> result $value}; + } case wgsl: __intrinsic_asm "countOneBits"; default: diff --git a/tests/hlsl-intrinsic/countbits64.slang b/tests/hlsl-intrinsic/countbits64.slang new file mode 100644 index 000000000..a24b31477 --- /dev/null +++ b/tests/hlsl-intrinsic/countbits64.slang @@ -0,0 +1,44 @@ +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -cpu +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -vk -compute -render-feature int64 +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -mtl -compute +// No support for uint64_t on fxc - we need SM6.0 and dxil to use uint64_t with d3d12 +// 
https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12 +//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -render-feature hardware-device + +//CHK:1 + +//TEST_INPUT:ubuffer(data=[0], stride=4):out,name=outputBuffer +RWStructuredBuffer<uint> outputBuffer; + +[numthreads(1, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + uint r1 = countbits(0b1ULL << 32); + uint2 r2 = countbits(uint64_t2(0b0ULL << 32, 0b1ULL << 32)); + uint3 r3 = countbits(uint64_t3(0b0ULL << 32, 0b1ULL << 32, 0b11ULL << 32)); + uint4 r4 = countbits(uint64_t4(0b0ULL << 32, 0b1ULL << 32, 0b11ULL << 32, 0b111ULL << 32)); + + uint r5 = countbits(0b1LL << 32); + uint2 r6 = countbits(int64_t2(0b0LL << 32, 0b1LL << 32)); + uint3 r7 = countbits(int64_t3(0b0LL << 32, 0b1LL << 32, 0b11LL << 32)); + uint4 r8 = countbits(int64_t4(0b0LL << 32, 0b1LL << 32, 0b11LL << 32, 0b111LL << 32)); + + uint bigShiftU32 = 0b111U << 32; + int bigShiftI32 = 0b1111 << 32; + + uint bitCountBigShiftU32 = countbits(bigShiftU32); + uint bitCountBigShiftI32 = countbits(bigShiftI32); + + outputBuffer[0] = true + && (r1 == 1) + && (r2.x == 0 && r2.y == 1) + && (r3.x == 0 && r3.y == 1 && r3.z == 2) + && (r4.x == 0 && r4.y == 1 && r4.z == 2 && r4.w == 3) + && (r5 == 1) + && (r6.x == 0 && r6.y == 1) + && (r7.x == 0 && r7.y == 1 && r7.z == 2) + && (r8.x == 0 && r8.y == 1 && r8.z == 2 && r8.w == 3) + && (bitCountBigShiftU32 == 0 && bitCountBigShiftI32 == 0) + ; +} |
