diff options
| author | Copilot <198982749+Copilot@users.noreply.github.com> | 2025-07-28 22:35:14 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-07-29 05:35:14 +0000 |
| commit | e8797496bf7caeb08b5334d13d0d5aa039106210 (patch) | |
| tree | fdfd68a6d9122c43523c37b5b20893d11aad1c06 | |
| parent | a2d93ae0122aff62866e9266e5e51d4615a00783 (diff) | |
Fix CUDA backend missing U32_firstbitlow implementation (#7921)
* Initial plan
* Add U32_firstbitlow implementation for CUDA and CPP backends
Co-authored-by: bmillsNV <163073245+bmillsNV@users.noreply.github.com>
* Add I32_firstbitlow and comprehensive testing for signed/unsigned firstbitlow
Co-authored-by: bmillsNV <163073245+bmillsNV@users.noreply.github.com>
* Convert firstbitlow test to use inline filecheck syntax
Co-authored-by: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com>
* Add U32_firstbithigh and I32_firstbithigh implementations for CUDA and CPP backends
Co-authored-by: csyonghe <2652293+csyonghe@users.noreply.github.com>
* Update prelude/slang-cpp-scalar-intrinsics.h
* Update prelude/slang-cpp-scalar-intrinsics.h
* Update prelude/slang-cpp-scalar-intrinsics.h
* Refactor Metal bit intrinsics to handle zero case correctly
Co-authored-by: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com>
* Update slang-cuda-prelude.h
remove fake links
* Update hlsl.meta.slang
* if -1, return -1 due to implicit hlsl rule
* -1 or 0 is ~0u as per hlsl implicitly
* 0 or -1 as per hlsl
* fix the math to map to hlsl
* fix compile error
* forgot `31 - clz`
* format code (#7943)
Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com>
* Update source/slang/hlsl.meta.slang
* Update source/slang/hlsl.meta.slang
* Update source/slang/hlsl.meta.slang
* Update source/slang/hlsl.meta.slang
---------
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: bmillsNV <163073245+bmillsNV@users.noreply.github.com>
Co-authored-by: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com>
Co-authored-by: csyonghe <2652293+csyonghe@users.noreply.github.com>
Co-authored-by: ArielG-NV <aglasroth@nvidia.com>
Co-authored-by: slangbot <ellieh+slangbot@nvidia.com>
Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com>
| -rw-r--r-- | prelude/slang-cpp-scalar-intrinsics.h | 54 | ||||
| -rw-r--r-- | prelude/slang-cuda-prelude.h | 29 | ||||
| -rw-r--r-- | source/slang/hlsl.meta.slang | 59 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/firstbithigh.slang | 37 | ||||
| -rw-r--r-- | tests/hlsl-intrinsic/firstbitlow.slang | 39 |
5 files changed, 208 insertions, 10 deletions
diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h index 9b045941a..731fd02be 100644 --- a/prelude/slang-cpp-scalar-intrinsics.h +++ b/prelude/slang-cpp-scalar-intrinsics.h @@ -717,6 +717,50 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v) #endif } +SLANG_FORCE_INLINE uint32_t U32_firstbitlow(uint32_t v) +{ + if (v == 0) + return ~0u; + +#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM) + // __builtin_ctz returns number of trailing zeros, which is the 0-based index of first set bit + return __builtin_ctz(v); +#elif SLANG_PROCESSOR_X86_64 && SLANG_VC + // _BitScanForward returns 1 on success, 0 on failure, and sets index + unsigned long index; + return _BitScanForward(&index, v) ? index : ~0u; +#else + // Generic implementation - find first set bit + uint32_t result = 0; + while (result < 32 && !(v & (1u << result))) + result++; + return result; +#endif +} + +SLANG_FORCE_INLINE uint32_t U32_firstbithigh(uint32_t v) +{ + if ((int32_t)v < 0) + v = ~v; + if (v == 0) + return ~0u; +#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM) + // __builtin_clz returns number of leading zeros + // firstbithigh should return 0-based bit position of MSB + return 31 - __builtin_clz(v); +#elif SLANG_PROCESSOR_X86_64 && SLANG_VC + // _BitScanReverse returns 1 on success, 0 on failure, and sets index + unsigned long index; + return _BitScanReverse(&index, v) ? 
index : ~0u; +#else + // Generic implementation - find highest set bit + int result = 31; + while (result >= 0 && !(v & (1u << result))) + result--; + return result; +#endif +} + // ----------------------------- I32 ----------------------------------------- SLANG_FORCE_INLINE int32_t I32_abs(int32_t f) @@ -755,6 +799,16 @@ SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v) return U32_countbits(uint32_t(v)); } +SLANG_FORCE_INLINE uint32_t I32_firstbitlow(int32_t v) +{ + return U32_firstbitlow(uint32_t(v)); +} + +SLANG_FORCE_INLINE uint32_t I32_firstbithigh(int32_t v) +{ + return U32_firstbithigh(uint32_t(v)); +} + // ----------------------------- U64 ----------------------------------------- SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f) diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h index fd79b77aa..3ebdbe777 100644 --- a/prelude/slang-cuda-prelude.h +++ b/prelude/slang-cuda-prelude.h @@ -2081,10 +2081,26 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double U32_asdouble(uint32_t low, uint32_t hi SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v) { - // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html#group__CUDA__MATH__INTRINSIC__INT_1g43c9c7d2b9ebf202ff1ef5769989be46 return __popc(v); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_firstbitlow(uint32_t v) +{ + // __ffs returns 1-based bit position or 0 if no bits set + // firstbitlow should return 0-based bit position or ~0u if no bits set + return v == 0 ? 
~0u : (__ffs(v) - 1); +} + +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_firstbithigh(uint32_t v) +{ + // maps to hlsl firstbithigh + if ((int32_t)v < 0) + v = ~v; + if (v == 0) + return ~0u; + return 31 - __clz(v); +} + // ----------------------------- I32 ----------------------------------------- // Unary @@ -2125,6 +2141,16 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_countbits(int32_t v) return U32_countbits(uint32_t(v)); } +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_firstbitlow(int32_t v) +{ + return U32_firstbitlow(uint32_t(v)); +} + +SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_firstbithigh(int32_t v) +{ + return U32_firstbithigh(uint32_t(v)); +} + // ----------------------------- U64 ----------------------------------------- SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_abs(uint64_t f) @@ -2143,7 +2169,6 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_max(uint64_t a, uint64_t b) SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v) { - // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html#group__CUDA__MATH__INTRINSIC__INT_1g43c9c7d2b9ebf202ff1ef5769989be46 return __popcll(v); } diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index 2ac886f61..66d1cb5e6 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -9539,6 +9539,39 @@ vector<T,N> faceforward(vector<T,N> n, vector<T,N> i, vector<T,N> ng) } } +// Helper functions for Metal target +internal int __metal_clz(int value) +{ + __target_switch + { + case metal: __intrinsic_asm "clz"; + } +} + +internal uint __metal_clz(uint value) +{ + __target_switch + { + case metal: __intrinsic_asm "clz"; + } +} + +internal int __metal_ctz(int value) +{ + __target_switch + { + case metal: __intrinsic_asm "ctz"; + } +} + +internal uint __metal_ctz(uint value) +{ + __target_switch + { + case metal: __intrinsic_asm "ctz"; + } +} + /// Find first set bit starting at high bit and working down. 
/// @param value The value to find set bits in. /// @return The bit index number of the most significant bit, @@ -9559,7 +9592,14 @@ int firstbithigh(int value) case cuda: __intrinsic_asm "$P_firstbithigh($0)"; case glsl: __intrinsic_asm "findMSB"; case hlsl: __intrinsic_asm "firstbithigh"; - case metal: __intrinsic_asm "clz"; + case metal: + { + if ((int)value < 0) + value = ~value; + if (value == 0) + return ~0u; + return 31 - __metal_clz(value); + } case spirv: return spirv_asm { OpExtInst $$int result glsl450 FindSMsb $value }; @@ -9576,7 +9616,6 @@ vector<int, N> firstbithigh(vector<int, N> value) { case glsl: __intrinsic_asm "findMSB"; case hlsl: __intrinsic_asm "firstbithigh"; - case metal: __intrinsic_asm "clz"; case spirv: return spirv_asm { OpExtInst $$vector<int, N> result glsl450 FindSMsb $value }; @@ -9596,7 +9635,14 @@ uint firstbithigh(uint value) case cuda: __intrinsic_asm "$P_firstbithigh($0)"; case glsl: __intrinsic_asm "findMSB"; case hlsl: __intrinsic_asm "firstbithigh"; - case metal: __intrinsic_asm "clz"; + case metal: + { + if ((int)value < 0) + value = ~value; + if (value == 0) + return ~0u; + return 31 - __metal_clz(value); + } case spirv: return spirv_asm { OpExtInst $$uint result glsl450 FindUMsb $value }; @@ -9613,7 +9659,6 @@ vector<uint,N> firstbithigh(vector<uint,N> value) { case glsl: __intrinsic_asm "findMSB"; case hlsl: __intrinsic_asm "firstbithigh"; - case metal: __intrinsic_asm "clz"; case spirv: return spirv_asm { OpExtInst $$vector<uint,N> result glsl450 FindUMsb $value }; @@ -9639,7 +9684,7 @@ int firstbitlow(int value) case cuda: __intrinsic_asm "$P_firstbitlow($0)"; case glsl: __intrinsic_asm "findLSB"; case hlsl: __intrinsic_asm "firstbitlow"; - case metal: __intrinsic_asm "ctz"; + case metal: return (value==0) ? 
-1 : __metal_ctz(value); case spirv: return spirv_asm { OpExtInst $$int result glsl450 FindILsb $value }; @@ -9656,7 +9701,6 @@ vector<int,N> firstbitlow(vector<int,N> value) { case glsl: __intrinsic_asm "findLSB"; case hlsl: __intrinsic_asm "firstbitlow"; - case metal: __intrinsic_asm "ctz"; case spirv: return spirv_asm { OpExtInst $$vector<int,N> result glsl450 FindILsb $value }; @@ -9676,7 +9720,7 @@ uint firstbitlow(uint value) case cuda: __intrinsic_asm "$P_firstbitlow($0)"; case glsl: __intrinsic_asm "findLSB"; case hlsl: __intrinsic_asm "firstbitlow"; - case metal: __intrinsic_asm "ctz"; + case metal: return (value==0) ? -1 : __metal_ctz(value); case spirv: return spirv_asm { OpExtInst $$uint result glsl450 FindILsb $value }; @@ -9693,7 +9737,6 @@ vector<uint,N> firstbitlow(vector<uint,N> value) { case glsl: __intrinsic_asm "findLSB"; case hlsl: __intrinsic_asm "firstbitlow"; - case metal: __intrinsic_asm "ctz"; case spirv: return spirv_asm { OpExtInst $$vector<uint,N> result glsl450 FindILsb $value }; diff --git a/tests/hlsl-intrinsic/firstbithigh.slang b/tests/hlsl-intrinsic/firstbithigh.slang new file mode 100644 index 000000000..f5b0bc038 --- /dev/null +++ b/tests/hlsl-intrinsic/firstbithigh.slang @@ -0,0 +1,37 @@ +//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cpu -compute -shaderobj +//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-slang -compute -shaderobj +//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-slang -compute -dx12 -shaderobj +//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-vk -compute -shaderobj +//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cuda -compute -shaderobj + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<uint> outputBuffer; + +[numthreads(1, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + bool result = true; + result = result && firstbithigh(0) == -1; + result = result && firstbithigh(1) 
== 0; + result = result && firstbithigh(2) == 1; + result = result && firstbithigh(3) == 1; + result = result && firstbithigh(4) == 2; + result = result && firstbithigh(5) == 2; + result = result && firstbithigh(6) == 2; + result = result && firstbithigh(7) == 2; + result = result && firstbithigh(8) == 3; + + result = result && firstbithigh(-1) == -1; + result = result && firstbithigh(-2) == 0; + result = result && firstbithigh(-3) == 1; + result = result && firstbithigh(-4) == 1; + result = result && firstbithigh(-5) == 2; + result = result && firstbithigh(-6) == 2; + result = result && firstbithigh(-7) == 2; + result = result && firstbithigh(-8) == 2; + result = result && firstbithigh(-9) == 3; + + outputBuffer[0] = (result == true) ? 1 : 0; +} + +// CHECK: 1 diff --git a/tests/hlsl-intrinsic/firstbitlow.slang b/tests/hlsl-intrinsic/firstbitlow.slang new file mode 100644 index 000000000..418b8aa6f --- /dev/null +++ b/tests/hlsl-intrinsic/firstbitlow.slang @@ -0,0 +1,39 @@ +//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cpu -compute -shaderobj +//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-slang -compute -shaderobj +//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-slang -compute -dx12 -shaderobj +//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-vk -compute -shaderobj +//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cuda -compute -shaderobj + +//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer +RWStructuredBuffer<uint> outputBuffer; + +[numthreads(10, 1, 1)] +void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) +{ + uint idx = dispatchThreadID.x; + + if (idx < 5) { + // Test unsigned values + uint testValues[5] = {0, 1, 2, 4, 8}; + uint value = testValues[idx]; + uint result = firstbitlow(value); + outputBuffer[idx] = result; + } else { + // Test signed values + int testValues[5] = {-1, -2, -4, -8, 0}; // 0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFC, 0xFFFFFFF8, 0 + int 
value = testValues[idx - 5]; + uint result = firstbitlow(value); + outputBuffer[idx] = result; + } +} + +// CHECK: FFFFFFFF +// CHECK: 0 +// CHECK: 1 +// CHECK: 2 +// CHECK: 3 +// CHECK: 0 +// CHECK: 1 +// CHECK: 2 +// CHECK: 3 +// CHECK: FFFFFFFF
\ No newline at end of file |
