summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCopilot <198982749+Copilot@users.noreply.github.com>2025-07-28 22:35:14 -0700
committerGitHub <noreply@github.com>2025-07-29 05:35:14 +0000
commite8797496bf7caeb08b5334d13d0d5aa039106210 (patch)
treefdfd68a6d9122c43523c37b5b20893d11aad1c06
parenta2d93ae0122aff62866e9266e5e51d4615a00783 (diff)
Fix CUDA backend missing U32_firstbitlow implementation (#7921)
* Initial plan * Add U32_firstbitlow implementation for CUDA and CPP backends Co-authored-by: bmillsNV <163073245+bmillsNV@users.noreply.github.com> * Add I32_firstbitlow and comprehensive testing for signed/unsigned firstbitlow Co-authored-by: bmillsNV <163073245+bmillsNV@users.noreply.github.com> * Convert firstbitlow test to use inline filecheck syntax Co-authored-by: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com> * Add U32_firstbithigh and I32_firstbithigh implementations for CUDA and CPP backends Co-authored-by: csyonghe <2652293+csyonghe@users.noreply.github.com> * Update prelude/slang-cpp-scalar-intrinsics.h * Update prelude/slang-cpp-scalar-intrinsics.h * Update prelude/slang-cpp-scalar-intrinsics.h * Refactor Metal bit intrinsics to handle zero case correctly Co-authored-by: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com> * Update slang-cuda-prelude.h remove fake links * Update hlsl.meta.slang * if -1, return -1 due to implicit hlsl rule * -1 or 0 is ~0u as per hlsl implictly * 0 or -1 as per hlsl * fix the math to map to hlsl * fix compile error * forgot `31 - clz` * format code (#7943) Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com> * Update source/slang/hlsl.meta.slang * Update source/slang/hlsl.meta.slang * Update source/slang/hlsl.meta.slang * Update source/slang/hlsl.meta.slang --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: bmillsNV <163073245+bmillsNV@users.noreply.github.com> Co-authored-by: ArielG-NV <159081215+ArielG-NV@users.noreply.github.com> Co-authored-by: csyonghe <2652293+csyonghe@users.noreply.github.com> Co-authored-by: ArielG-NV <aglasroth@nvidia.com> Co-authored-by: slangbot <ellieh+slangbot@nvidia.com> Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com>
-rw-r--r--prelude/slang-cpp-scalar-intrinsics.h54
-rw-r--r--prelude/slang-cuda-prelude.h29
-rw-r--r--source/slang/hlsl.meta.slang59
-rw-r--r--tests/hlsl-intrinsic/firstbithigh.slang37
-rw-r--r--tests/hlsl-intrinsic/firstbitlow.slang39
5 files changed, 208 insertions, 10 deletions
diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h
index 9b045941a..731fd02be 100644
--- a/prelude/slang-cpp-scalar-intrinsics.h
+++ b/prelude/slang-cpp-scalar-intrinsics.h
@@ -717,6 +717,50 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v)
#endif
}
+// Returns the 0-based index of the least significant set bit of `v`,
+// or ~0u when `v` has no bits set.
+SLANG_FORCE_INLINE uint32_t U32_firstbitlow(uint32_t v)
+{
+    // Zero has no set bit; handle it first so every path below sees v != 0.
+    if (!v)
+    {
+        return ~0u;
+    }
+
+#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)
+    // The trailing-zero count is exactly the index of the lowest set bit.
+    return __builtin_ctz(v);
+#elif SLANG_PROCESSOR_X86_64 && SLANG_VC
+    // _BitScanForward stores the bit index and returns nonzero on success.
+    unsigned long bitIndex;
+    if (_BitScanForward(&bitIndex, v))
+    {
+        return bitIndex;
+    }
+    return ~0u;
+#else
+    // Portable fallback: walk upward from bit 0 until a set bit is hit.
+    uint32_t bit = 0;
+    for (; bit < 32; ++bit)
+    {
+        if (v & (1u << bit))
+            break;
+    }
+    return bit;
+#endif
+}
+
+// Returns the 0-based index of the most significant set bit of `v`,
+// or ~0u when `v` is zero.
+//
+// `v` is treated as an unsigned value: a set bit 31 yields 31. The
+// "complement negative inputs" rule belongs to the signed variant only
+// and is applied in I32_firstbithigh before it forwards here.
+SLANG_FORCE_INLINE uint32_t U32_firstbithigh(uint32_t v)
+{
+    if (v == 0)
+        return ~0u;
+#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)
+    // __builtin_clz returns number of leading zeros
+    // firstbithigh should return 0-based bit position of MSB
+    return 31 - __builtin_clz(v);
+#elif SLANG_PROCESSOR_X86_64 && SLANG_VC
+    // _BitScanReverse returns 1 on success, 0 on failure, and sets index
+    unsigned long index;
+    return _BitScanReverse(&index, v) ? index : ~0u;
+#else
+    // Generic implementation - find highest set bit.
+    // v != 0 here, so the loop always stops on a set bit.
+    int result = 31;
+    while (result >= 0 && !(v & (1u << result)))
+        result--;
+    return result;
+#endif
+}
+
// ----------------------------- I32 -----------------------------------------
SLANG_FORCE_INLINE int32_t I32_abs(int32_t f)
@@ -755,6 +799,16 @@ SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v)
return U32_countbits(uint32_t(v));
}
+// Returns the 0-based index of the lowest set bit of `v` (viewed as a
+// 32-bit pattern), or ~0u when `v` is zero.
+SLANG_FORCE_INLINE uint32_t I32_firstbitlow(int32_t v)
+{
+    return U32_firstbitlow(uint32_t(v));
+}
+
+// Signed firstbithigh: for non-negative `v`, the index of the highest set
+// bit; for negative `v`, the index of the highest clear bit. Returns ~0u
+// for 0 and -1, which have no such bit.
+SLANG_FORCE_INLINE uint32_t I32_firstbithigh(int32_t v)
+{
+    // Complementing a negative value turns "highest clear bit" into
+    // "highest set bit", so the unsigned search can be reused.
+    if (v < 0)
+        v = ~v;
+    return U32_firstbithigh(uint32_t(v));
+}
+
// ----------------------------- U64 -----------------------------------------
SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f)
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index fd79b77aa..3ebdbe777 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -2081,10 +2081,26 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double U32_asdouble(uint32_t low, uint32_t hi
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v)
{
- // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html#group__CUDA__MATH__INTRINSIC__INT_1g43c9c7d2b9ebf202ff1ef5769989be46
return __popc(v);
}
+// Returns the 0-based index of the lowest set bit of `v`, or ~0u when
+// `v` has no bits set.
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_firstbitlow(uint32_t v)
+{
+    // __ffs reports a 1-based position (0 meaning "no bit set"), so a
+    // non-zero input only needs shifting down by one.
+    if (v != 0)
+    {
+        return __ffs(v) - 1;
+    }
+    return ~0u;
+}
+
+// Returns the 0-based index of the most significant set bit of `v`,
+// or ~0u when `v` is zero.
+//
+// `v` is treated as an unsigned value: a set bit 31 yields 31. The
+// "complement negative inputs" rule belongs to the signed variant only
+// and is applied in I32_firstbithigh before it forwards here.
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_firstbithigh(uint32_t v)
+{
+    if (v == 0)
+        return ~0u;
+    // __clz counts leading zeros; convert that to a bit index from the LSB.
+    return 31 - __clz(v);
+}
+
// ----------------------------- I32 -----------------------------------------
// Unary
@@ -2125,6 +2141,16 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_countbits(int32_t v)
return U32_countbits(uint32_t(v));
}
+// Returns the 0-based index of the lowest set bit of `v` (viewed as a
+// 32-bit pattern), or ~0u when `v` is zero.
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_firstbitlow(int32_t v)
+{
+    return U32_firstbitlow(uint32_t(v));
+}
+
+// Signed firstbithigh: for non-negative `v`, the index of the highest set
+// bit; for negative `v`, the index of the highest clear bit. Returns ~0u
+// for 0 and -1, which have no such bit.
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_firstbithigh(int32_t v)
+{
+    // Complementing a negative value turns "highest clear bit" into
+    // "highest set bit", so the unsigned search can be reused.
+    if (v < 0)
+        v = ~v;
+    return U32_firstbithigh(uint32_t(v));
+}
+
// ----------------------------- U64 -----------------------------------------
SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_abs(uint64_t f)
@@ -2143,7 +2169,6 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t U64_max(uint64_t a, uint64_t b)
SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v)
{
- // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__INTRINSIC__INT.html#group__CUDA__MATH__INTRINSIC__INT_1g43c9c7d2b9ebf202ff1ef5769989be46
return __popcll(v);
}
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 2ac886f61..66d1cb5e6 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -9539,6 +9539,39 @@ vector<T,N> faceforward(vector<T,N> n, vector<T,N> i, vector<T,N> ng)
}
}
+// Helper functions for Metal target
+/// Metal-only binding for the `clz` (count leading zeros) builtin, `int` overload.
+/// Only a `metal` case is provided; no other target has an implementation here.
+/// @param value The value passed through to `clz`.
+/// @return Whatever Metal's `clz` returns for `value`.
+internal int __metal_clz(int value)
+{
+ __target_switch
+ {
+ case metal: __intrinsic_asm "clz";
+ }
+}
+
+/// Metal-only binding for the `clz` (count leading zeros) builtin, `uint` overload.
+/// Only a `metal` case is provided; no other target has an implementation here.
+/// @param value The value passed through to `clz`.
+/// @return Whatever Metal's `clz` returns for `value`.
+internal uint __metal_clz(uint value)
+{
+ __target_switch
+ {
+ case metal: __intrinsic_asm "clz";
+ }
+}
+
+/// Metal-only binding for the `ctz` (count trailing zeros) builtin, `int` overload.
+/// Only a `metal` case is provided; no other target has an implementation here.
+/// @param value The value passed through to `ctz`.
+/// @return Whatever Metal's `ctz` returns for `value`.
+internal int __metal_ctz(int value)
+{
+ __target_switch
+ {
+ case metal: __intrinsic_asm "ctz";
+ }
+}
+
+/// Metal-only binding for the `ctz` (count trailing zeros) builtin, `uint` overload.
+/// Only a `metal` case is provided; no other target has an implementation here.
+/// @param value The value passed through to `ctz`.
+/// @return Whatever Metal's `ctz` returns for `value`.
+internal uint __metal_ctz(uint value)
+{
+ __target_switch
+ {
+ case metal: __intrinsic_asm "ctz";
+ }
+}
+
/// Find first set bit starting at high bit and working down.
/// @param value The value to find set bits in.
/// @return The bit index number of the most significant bit,
@@ -9559,7 +9592,14 @@ int firstbithigh(int value)
case cuda: __intrinsic_asm "$P_firstbithigh($0)";
case glsl: __intrinsic_asm "findMSB";
case hlsl: __intrinsic_asm "firstbithigh";
- case metal: __intrinsic_asm "clz";
+ case metal:
+ {
+ if ((int)value < 0)
+ value = ~value;
+ if (value == 0)
+ return ~0u;
+ return 31 - __metal_clz(value);
+ }
case spirv: return spirv_asm {
OpExtInst $$int result glsl450 FindSMsb $value
};
@@ -9576,7 +9616,6 @@ vector<int, N> firstbithigh(vector<int, N> value)
{
case glsl: __intrinsic_asm "findMSB";
case hlsl: __intrinsic_asm "firstbithigh";
- case metal: __intrinsic_asm "clz";
case spirv: return spirv_asm {
OpExtInst $$vector<int, N> result glsl450 FindSMsb $value
};
@@ -9596,7 +9635,14 @@ uint firstbithigh(uint value)
case cuda: __intrinsic_asm "$P_firstbithigh($0)";
case glsl: __intrinsic_asm "findMSB";
case hlsl: __intrinsic_asm "firstbithigh";
- case metal: __intrinsic_asm "clz";
+ case metal:
+ {
+ if ((int)value < 0)
+ value = ~value;
+ if (value == 0)
+ return ~0u;
+ return 31 - __metal_clz(value);
+ }
case spirv: return spirv_asm {
OpExtInst $$uint result glsl450 FindUMsb $value
};
@@ -9613,7 +9659,6 @@ vector<uint,N> firstbithigh(vector<uint,N> value)
{
case glsl: __intrinsic_asm "findMSB";
case hlsl: __intrinsic_asm "firstbithigh";
- case metal: __intrinsic_asm "clz";
case spirv: return spirv_asm {
OpExtInst $$vector<uint,N> result glsl450 FindUMsb $value
};
@@ -9639,7 +9684,7 @@ int firstbitlow(int value)
case cuda: __intrinsic_asm "$P_firstbitlow($0)";
case glsl: __intrinsic_asm "findLSB";
case hlsl: __intrinsic_asm "firstbitlow";
- case metal: __intrinsic_asm "ctz";
+ case metal: return (value==0) ? -1 : __metal_ctz(value);
case spirv: return spirv_asm {
OpExtInst $$int result glsl450 FindILsb $value
};
@@ -9656,7 +9701,6 @@ vector<int,N> firstbitlow(vector<int,N> value)
{
case glsl: __intrinsic_asm "findLSB";
case hlsl: __intrinsic_asm "firstbitlow";
- case metal: __intrinsic_asm "ctz";
case spirv: return spirv_asm {
OpExtInst $$vector<int,N> result glsl450 FindILsb $value
};
@@ -9676,7 +9720,7 @@ uint firstbitlow(uint value)
case cuda: __intrinsic_asm "$P_firstbitlow($0)";
case glsl: __intrinsic_asm "findLSB";
case hlsl: __intrinsic_asm "firstbitlow";
- case metal: __intrinsic_asm "ctz";
+ case metal: return (value==0) ? -1 : __metal_ctz(value);
case spirv: return spirv_asm {
OpExtInst $$uint result glsl450 FindILsb $value
};
@@ -9693,7 +9737,6 @@ vector<uint,N> firstbitlow(vector<uint,N> value)
{
case glsl: __intrinsic_asm "findLSB";
case hlsl: __intrinsic_asm "firstbitlow";
- case metal: __intrinsic_asm "ctz";
case spirv: return spirv_asm {
OpExtInst $$vector<uint,N> result glsl450 FindILsb $value
};
diff --git a/tests/hlsl-intrinsic/firstbithigh.slang b/tests/hlsl-intrinsic/firstbithigh.slang
new file mode 100644
index 000000000..f5b0bc038
--- /dev/null
+++ b/tests/hlsl-intrinsic/firstbithigh.slang
@@ -0,0 +1,37 @@
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cpu -compute -shaderobj
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-slang -compute -shaderobj
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-slang -compute -dx12 -shaderobj
+//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-vk -compute -shaderobj
+//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cuda -compute -shaderobj
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+// Single-thread entry point: evaluates firstbithigh on a fixed list of
+// literals and writes 1 to outputBuffer[0] only if every comparison holds,
+// otherwise 0.
+// NOTE(review): every argument is an unsuffixed literal, so presumably only
+// the int overload is exercised here - confirm whether uint coverage is wanted.
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ bool result = true;
+ // Zero has no set bit, so -1 is expected; for positive inputs the expected
+ // value is the 0-based index of the highest set bit.
+ result = result && firstbithigh(0) == -1;
+ result = result && firstbithigh(1) == 0;
+ result = result && firstbithigh(2) == 1;
+ result = result && firstbithigh(3) == 1;
+ result = result && firstbithigh(4) == 2;
+ result = result && firstbithigh(5) == 2;
+ result = result && firstbithigh(6) == 2;
+ result = result && firstbithigh(7) == 2;
+ result = result && firstbithigh(8) == 3;
+
+ // For negative inputs the expected value is the index of the highest bit
+ // that is 0 (e.g. -2 is ...FFFE, giving 0); -1 has no zero bit, so -1.
+ result = result && firstbithigh(-1) == -1;
+ result = result && firstbithigh(-2) == 0;
+ result = result && firstbithigh(-3) == 1;
+ result = result && firstbithigh(-4) == 1;
+ result = result && firstbithigh(-5) == 2;
+ result = result && firstbithigh(-6) == 2;
+ result = result && firstbithigh(-7) == 2;
+ result = result && firstbithigh(-8) == 2;
+ result = result && firstbithigh(-9) == 3;
+
+ // Collapse the accumulated pass/fail flag into the single output word.
+ outputBuffer[0] = (result == true) ? 1 : 0;
+}
+
+// CHECK: 1
diff --git a/tests/hlsl-intrinsic/firstbitlow.slang b/tests/hlsl-intrinsic/firstbitlow.slang
new file mode 100644
index 000000000..418b8aa6f
--- /dev/null
+++ b/tests/hlsl-intrinsic/firstbitlow.slang
@@ -0,0 +1,39 @@
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cpu -compute -shaderobj
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-slang -compute -shaderobj
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK):-slang -compute -dx12 -shaderobj
+//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-vk -compute -shaderobj
+//TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK):-cuda -compute -shaderobj
+
+//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+// Ten-thread entry point: each thread computes firstbitlow for one table
+// entry and stores the raw result in outputBuffer at its own thread index.
+// Threads 0-4 use the uint table, threads 5-9 use the int table.
+[numthreads(10, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ uint idx = dispatchThreadID.x;
+
+ if (idx < 5) {
+ // Test unsigned values
+ // (zero first, then single-bit values with the bit at index 0..3)
+ uint testValues[5] = {0, 1, 2, 4, 8};
+ uint value = testValues[idx];
+ uint result = firstbitlow(value);
+ outputBuffer[idx] = result;
+ } else {
+ // Test signed values
+ int testValues[5] = {-1, -2, -4, -8, 0}; // 0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFC, 0xFFFFFFF8, 0
+ // idx is 5..9 on this branch, so idx - 5 selects entries 0..4.
+ int value = testValues[idx - 5];
+ // The result is stored as uint, so a "no bit set" result of -1 reads back as 0xFFFFFFFF.
+ uint result = firstbitlow(value);
+ outputBuffer[idx] = result;
+ }
+}
+
+// CHECK: FFFFFFFF
+// CHECK: 0
+// CHECK: 1
+// CHECK: 2
+// CHECK: 3
+// CHECK: 0
+// CHECK: 1
+// CHECK: 2
+// CHECK: 3
+// CHECK: FFFFFFFF \ No newline at end of file