From e8797496bf7caeb08b5334d13d0d5aa039106210 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Mon, 28 Jul 2025 22:35:14 -0700 Subject: Fix CUDA backend missing U32_firstbitlow implementation (#7921) * Initial plan * Add U32_firstbitlow implementation for CUDA and CPP backends Co-authored-by: bmillsNV <163073245+bmillsNV@users.noreply.github.com> * Add I32_firstbitlow and comprehensive testing for signed/unsigned firstbitlow Co-authored-by: bmillsNV <163073245+bmillsNV@users.noreply.github.com> * Convert firstbitlow test to use inline filecheck syntax Co-authored-by: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com> * Add U32_firstbithigh and I32_firstbithigh implementations for CUDA and CPP backends Co-authored-by: csyonghe <2652293+csyonghe@users.noreply.github.com> * Update prelude/slang-cpp-scalar-intrinsics.h * Update prelude/slang-cpp-scalar-intrinsics.h * Update prelude/slang-cpp-scalar-intrinsics.h * Refactor Metal bit intrinsics to handle zero case correctly Co-authored-by: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com> * Update slang-cuda-prelude.h remove fake links * Update hlsl.meta.slang * if -1, return -1 due to implicit hlsl rule * -1 or 0 is ~0u as per hlsl implicitly * 0 or -1 as per hlsl * fix the math to map to hlsl * fix compile error * forgot `31 - clz` * format code (#7943) Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com> * Update source/slang/hlsl.meta.slang * Update source/slang/hlsl.meta.slang * Update source/slang/hlsl.meta.slang * Update source/slang/hlsl.meta.slang --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: bmillsNV <163073245+bmillsNV@users.noreply.github.com> Co-authored-by: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com> Co-authored-by: csyonghe <2652293+csyonghe@users.noreply.github.com> Co-authored-by: ArielG-NV Co-authored-by: slangbot Co-authored-by: slangbot 
<186143334+slangbot@users.noreply.github.com>
---
 prelude/slang-cpp-scalar-intrinsics.h | 62 +++++++++++++++++++++++++++++++++++
 prelude/slang-cuda-prelude.h          | 34 ++++++++++++++++++++--
 2 files changed, 94 insertions(+), 2 deletions(-)

(limited to 'prelude')

diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h
index 9b045941a..731fd02be 100644
--- a/prelude/slang-cpp-scalar-intrinsics.h
+++ b/prelude/slang-cpp-scalar-intrinsics.h
@@ -717,6 +717,53 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v)
 #endif
 }
 
+// Returns the 0-based index of the lowest set bit of `v`, or ~0u when `v` is 0
+// (HLSL `firstbitlow`).
+SLANG_FORCE_INLINE uint32_t U32_firstbitlow(uint32_t v)
+{
+    if (v == 0)
+        return ~0u;
+
+#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)
+    // __builtin_ctz returns number of trailing zeros, which is the 0-based index of first set bit
+    return __builtin_ctz(v);
+#elif SLANG_PROCESSOR_X86_64 && SLANG_VC
+    // _BitScanForward returns 1 on success, 0 on failure, and sets index
+    unsigned long index;
+    return _BitScanForward(&index, v) ? index : ~0u;
+#else
+    // Generic implementation - find first set bit
+    uint32_t result = 0;
+    while (result < 32 && !(v & (1u << result)))
+        result++;
+    return result;
+#endif
+}
+
+// Returns the 0-based index of the highest set bit of `v`, or ~0u when `v` is 0
+// (HLSL `firstbithigh` for unsigned operands). The sign-aware variant lives in
+// I32_firstbithigh; an unsigned value with bit 31 set must report 31.
+SLANG_FORCE_INLINE uint32_t U32_firstbithigh(uint32_t v)
+{
+    if (v == 0)
+        return ~0u;
+#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)
+    // __builtin_clz returns number of leading zeros
+    // firstbithigh should return 0-based bit position of MSB
+    return 31 - __builtin_clz(v);
+#elif SLANG_PROCESSOR_X86_64 && SLANG_VC
+    // _BitScanReverse returns 1 on success, 0 on failure, and sets index
+    unsigned long index;
+    return _BitScanReverse(&index, v) ? index : ~0u;
+#else
+    // Generic implementation - find highest set bit
+    int result = 31;
+    while (result >= 0 && !(v & (1u << result)))
+        result--;
+    return result;
+#endif
+}
+
 // ----------------------------- I32 -----------------------------------------
 
 SLANG_FORCE_INLINE int32_t I32_abs(int32_t f)
@@ -755,6 +802,21 @@ SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v)
     return U32_countbits(uint32_t(v));
 }
 
+// Signed wrapper: reinterprets the bits of `v` and defers to U32_firstbitlow.
+SLANG_FORCE_INLINE uint32_t I32_firstbitlow(int32_t v)
+{
+    return U32_firstbitlow(uint32_t(v));
+}
+
+// HLSL `firstbithigh` for signed operands: a negative `v` reports its highest
+// 0 bit, so 0 and -1 both yield ~0u.
+SLANG_FORCE_INLINE uint32_t I32_firstbithigh(int32_t v)
+{
+    if (v < 0)
+        v = ~v;
+    return U32_firstbithigh(uint32_t(v));
+}
+
 // ----------------------------- U64 -----------------------------------------
 
 SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f)
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index fd79b77aa..3ebdbe777 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -2081,10 +2081,26 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double U32_asdouble(uint32_t low, uint32_t hi
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v)
 {
-    // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html#group__CUDA__MATH__INTRINSIC__INT_1g43c9c7d2b9ebf202ff1ef5769989be46
     return __popc(v);
 }
 
+// Returns the 0-based index of the lowest set bit of `v`, or ~0u when `v` is 0.
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_firstbitlow(uint32_t v)
+{
+    // __ffs returns 1-based bit position or 0 if no bits set
+    // firstbitlow should return 0-based bit position or ~0u if no bits set
+    return v == 0 ? ~0u : (__ffs(v) - 1);
+}
+
+// Maps to HLSL firstbithigh for unsigned operands: 0-based index of the
+// highest set bit, or ~0u when `v` is 0. Sign handling is in I32_firstbithigh.
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_firstbithigh(uint32_t v)
+{
+    if (v == 0)
+        return ~0u;
+    return 31 - __clz(v);
+}
+
 // ----------------------------- I32 -----------------------------------------
 
 // Unary
@@ -2125,6 +2141,21 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_countbits(int32_t v)
     return U32_countbits(uint32_t(v));
 }
 
+// Signed wrapper: reinterprets the bits of `v` and defers to U32_firstbitlow.
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_firstbitlow(int32_t v)
+{
+    return U32_firstbitlow(uint32_t(v));
+}
+
+// Maps to HLSL firstbithigh for signed operands: a negative `v` reports its
+// highest 0 bit, so 0 and -1 both yield ~0u.
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_firstbithigh(int32_t v)
+{
+    if (v < 0)
+        v = ~v;
+    return U32_firstbithigh(uint32_t(v));
+}
+
 // ----------------------------- U64 -----------------------------------------
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_abs(uint64_t f)
@@ -2143,7 +2174,6 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_max(uint64_t a, uint64_t b)
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v)
 {
-    // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html#group__CUDA__MATH__INTRINSIC__INT_1g43c9c7d2b9ebf202ff1ef5769989be46
     return __popcll(v);
 }
 
-- 
cgit v1.2.3