Add countbits 16-bit and 8-bit support (#6433) (#6897)

This change adds 16-bit and 8-bit support for the countbits intrinsic. Where a backend's native countbits lacks support for a given width, the operation is emulated. New tests are added for 16-bit and 8-bit support. Additional testing is added for 32-bit, and minor updates are made to 64-bit countbits.
author: sricker-nvidia <115114531+sricker-nvidia@users.noreply.github.com> 2025-05-05 15:30:33 -0700
committer: GitHub <noreply@github.com> 2025-05-05 22:30:33 +0000
commit: 50d9781b7387b0f7f56d19c72afcf390cca72b72 (patch)
tree: 7b6f1401f7a8257fa378930a052ca63f0fda91f4
parent: 698e43372cefe0fff13150925aeb7f389c21a938 (diff)
7 files changed, 290 insertions, 86 deletions
diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h
index 0a19eb327..9b045941a 100644
--- a/prelude/slang-cpp-scalar-intrinsics.h
+++ b/prelude/slang-cpp-scalar-intrinsics.h
@@ -628,45 +628,13 @@ SLANG_FORCE_INLINE double F64_calcSafeRadians(double radians)
     return (a * (SLANG_PRELUDE_PI * 2));
 }
 
-// ----------------------------- I32 -----------------------------------------
-
-SLANG_FORCE_INLINE int32_t I32_abs(int32_t f)
-{
-    return (f < 0) ? -f : f;
-}
-
-SLANG_FORCE_INLINE int32_t I32_min(int32_t a, int32_t b)
-{
-    return a < b ? a : b;
-}
-SLANG_FORCE_INLINE int32_t I32_max(int32_t a, int32_t b)
-{
-    return a > b ? a : b;
-}
-
-SLANG_FORCE_INLINE float I32_asfloat(int32_t x)
-{
-    Union32 u;
-    u.i = x;
-    return u.f;
-}
-SLANG_FORCE_INLINE uint32_t I32_asuint(int32_t x)
-{
-    return uint32_t(x);
-}
-SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi)
-{
-    Union64 u;
-    u.u = (uint64_t(hi) << 32) | uint32_t(low);
-    return u.d;
-}
-
-SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v)
+// ----------------------------- U16 -----------------------------------------
+SLANG_FORCE_INLINE uint32_t U16_countbits(uint16_t v)
 {
 #if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)
     return __builtin_popcount(uint32_t(v));
 #elif SLANG_PROCESSOR_X86_64 && SLANG_VC
-    return __popcnt(uint32_t(v));
+    return __popcnt16(v);
 #else
     uint32_t c = 0;
     while (v)
@@ -678,6 +646,25 @@ SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v)
 #endif
 }
 
+// ----------------------------- I16 -----------------------------------------
+SLANG_FORCE_INLINE uint32_t I16_countbits(int16_t v) // Set-bit count of a signed 16-bit value.
+{
+    return U16_countbits(uint16_t(v)); // convert to unsigned 16-bit and defer to the U16 implementation
+}
+
+// ----------------------------- U8 -----------------------------------------
+SLANG_FORCE_INLINE uint32_t U8_countbits(uint8_t v) // Set-bit count of an unsigned 8-bit value.
+{
+    // No native 8-bit __popcnt yet: widen to uint16_t (adds only zero bits) and use the 16-bit variant
+    return U16_countbits(uint16_t(v));
+}
+
+// ----------------------------- I8 -----------------------------------------
+SLANG_FORCE_INLINE uint32_t I8_countbits(int8_t v) // Set-bit count of a signed 8-bit value (int8_t, as in the CUDA prelude).
+{
+    return U8_countbits(uint8_t(v)); // convert to unsigned 8-bit so only the low 8 bits are counted
+}
+
 // ----------------------------- U32 -----------------------------------------
 
 SLANG_FORCE_INLINE uint32_t U32_abs(uint32_t f)
@@ -730,6 +717,44 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v)
 #endif
 }
 
+// ----------------------------- I32 -----------------------------------------
+
+SLANG_FORCE_INLINE int32_t I32_abs(int32_t f)
+{
+    return (f < 0) ? -f : f;
+}
+
+SLANG_FORCE_INLINE int32_t I32_min(int32_t a, int32_t b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE int32_t I32_max(int32_t a, int32_t b)
+{
+    return a > b ? a : b;
+}
+
+SLANG_FORCE_INLINE float I32_asfloat(int32_t x)
+{
+    Union32 u;
+    u.i = x;
+    return u.f;
+}
+SLANG_FORCE_INLINE uint32_t I32_asuint(int32_t x)
+{
+    return uint32_t(x);
+}
+SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi)
+{
+    Union64 u;
+    u.u = (uint64_t(hi) << 32) | uint32_t(low);
+    return u.d;
+}
+
+SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v) // Set-bit count of a signed 32-bit value.
+{
+    return U32_countbits(uint32_t(v)); // convert to unsigned and defer to the U32 implementation
+}
+
 // ----------------------------- U64 -----------------------------------------
 
 SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f)
@@ -749,7 +774,7 @@ SLANG_FORCE_INLINE uint64_t U64_max(uint64_t a, uint64_t b)
 SLANG_FORCE_INLINE uint32_t U64_countbits(uint64_t v)
 {
 #if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)
-    return uint32_t(__builtin_popcountl(v));
+    return uint32_t(__builtin_popcountll(v));
 #elif SLANG_PROCESSOR_X86_64 && SLANG_VC
     return uint32_t(__popcnt64(v));
 #else
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 738f2fa16..91ff98a17 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1788,44 +1788,34 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fma(double a, double b, double c)
     return ::fma(a, b, c);
 }
 
-// ----------------------------- I32 -----------------------------------------
+// ----------------------------- U8 -----------------------------------------
 
-// Unary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_abs(int32_t f)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U8_countbits(uint8_t v)
 {
-    return (f < 0) ? -f : f;
+    // No native 8bit popc yet, just cast and use 32bit variant
+    return __popc(uint32_t(v));
 }
 
-// Binary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b)
-{
-    return a < b ? a : b;
-}
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b)
-{
-    return a > b ? a : b;
-}
+// ----------------------------- I8 -----------------------------------------
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float I32_asfloat(int32_t x)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I8_countbits(int8_t v)
 {
-    Union32 u;
-    u.i = x;
-    return u.f;
+    return U8_countbits(uint8_t(v));
 }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x)
-{
-    return uint32_t(x);
-}
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi)
+
+// ----------------------------- U16 -----------------------------------------
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U16_countbits(uint16_t v)
 {
-    Union64 u;
-    u.u = (uint64_t(hi) << 32) | uint32_t(low);
-    return u.d;
+    // No native 16bit popc yet, just cast and use 32bit variant
+    return __popc(uint32_t(v));
 }
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_countbits(int32_t v)
+// ----------------------------- I16 -----------------------------------------
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I16_countbits(int16_t v)
 {
-    return __popc(uint32_t(v));
+    return U16_countbits(uint16_t(v));
 }
 
 // ----------------------------- U32 -----------------------------------------
@@ -1870,26 +1860,44 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v)
     return __popc(v);
 }
 
+// ----------------------------- I32 -----------------------------------------
 
-// ----------------------------- I64 -----------------------------------------
-
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_abs(int64_t f)
+// Unary
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_abs(int32_t f)
 {
     return (f < 0) ? -f : f;
 }
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b)
+// Binary
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b)
 {
     return a < b ? a : b;
 }
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b)
 {
     return a > b ? a : b;
 }
 
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I64_countbits(int64_t v)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float I32_asfloat(int32_t x)
+{
+    Union32 u;
+    u.i = x;
+    return u.f;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x)
+{
+    return uint32_t(x);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi)
+{
+    Union64 u;
+    u.u = (uint64_t(hi) << 32) | uint32_t(low);
+    return u.d;
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_countbits(int32_t v)
 {
-    return __popcll(uint64_t(v));
+    return U32_countbits(uint32_t(v));
 }
 
 // ----------------------------- U64 -----------------------------------------
@@ -1914,6 +1922,27 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v)
     return __popcll(v);
 }
 
+// ----------------------------- I64 -----------------------------------------
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_abs(int64_t f)
+{
+    return (f < 0) ? -f : f;
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b)
+{
+    return a < b ? a : b;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b)
+{
+    return a > b ? a : b;
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I64_countbits(int64_t v) // Set-bit count of a signed 64-bit value.
+{
+    return U64_countbits(uint64_t(v)); // convert to unsigned and defer to the U64 implementation
+}
+
 // ----------------------------- IPTR -----------------------------------------
 
 SLANG_FORCE_INLINE SLANG_CUDA_CALL intptr_t IPTR_abs(intptr_t f)
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 0f04006e5..07160ae9d 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -8047,6 +8047,16 @@ vector<T,N> cospi(vector<T,N> x)
     }
 }
 
+// Emulates 64-bit countbits where it is not natively supported: sums the counts of both 32-bit halves.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+internal uint __emulatedCountbits64(uint64_t value)
+{
+    // Reinterpret the 64-bit value as two 32-bit words and count each word.
+    uint2 halfCounts = countbits(bit_cast<uint2>(value));
+    return halfCounts.x + halfCounts.y;
+}
 
 /// Population count.
 /// Counts the number of set bits in the binary representation of a value.
@@ -8060,19 +8070,32 @@ vector<T,N> cospi(vector<T,N> x)
 __generic<T : __BuiltinIntegerType>
 uint countbits(T value)
 {
+    // Emulate 8-bit support (not currently supported anywhere natively) via the 32-bit path.
+    // Mask to the low 8 bits: widening a negative int8_t sign-extends and would add 24 set bits.
+    if (T is int8_t || T  is uint8_t)
+    {
+        return countbits(__intCast<uint32_t>(value) & 0xFFU);
+    }
+
     __target_switch
     {
     case hlsl:
+        // 64-bit support dependent on SM6.0 and dxil
+        // 16-bit support dependent on SM6.2 and dxil
         __intrinsic_asm "countbits";
     case glsl:
+        // 64-bit support dependent on GL_ARB_gpu_shader_int64
+        // 16-bit support dependent on GL_EXT_shader_16bit_storage
         __intrinsic_asm "bitCount";
     case metal:
-        if(T is int64_t || T  is uint64_t)
+        if (T is int64_t || T  is uint64_t)
+        {
+            return __emulatedCountbits64(__intCast<uint64_t>(value));
+        }
+        else if (T is int16_t || T  is uint16_t)
         {
-            // emulate 64-bit
-            uint2 value_uint2 = bit_cast<uint2>(value);
-            uint2 counted_bits_uint2 = countbits(value_uint2);
-            return counted_bits_uint2.x + counted_bits_uint2.y;
+            // emulate 16-bit; mask so a sign-extended negative int16_t counts only its low 16 bits
+            return countbits(__intCast<uint32_t>(value) & 0xFFFFU);
         }
         else
         {
@@ -8084,17 +8107,28 @@ uint countbits(T value)
     case spirv:
         if(T is int64_t || T  is uint64_t)
         {
-            // emulate 64-bit
-            uint2 value_uint2 = bit_cast<uint2>(value);
-            uint2 counted_bits_uint2 = countbits(value_uint2);
-            return counted_bits_uint2.x + counted_bits_uint2.y;
+            return __emulatedCountbits64(__intCast<uint64_t>(value));
+        }
+        else if (T is int16_t || T  is uint16_t)
+        {
+            // emulate 16-bit; mask so a sign-extended negative int16_t counts only its low 16 bits
+            return countbits(__intCast<uint32_t>(value) & 0xFFFFU);
         }
         else
         {
+            // OpBitCount only supports 32-bit
             return spirv_asm {OpBitCount $$uint result $value};
         }
     case wgsl:
-        __intrinsic_asm "countOneBits";
+        // WGSL provides only 32-bit integer types.
+        if (T is int32_t)
+        {
+            // countOneBits in WGSL yields a value of its operand's type.
+            // A signed operand is therefore cast to unsigned first, so the
+            // recursive call produces the uint result this function returns.
+            return countbits(__intCast<uint32_t>(value));
+        }
+        __intrinsic_asm "countOneBits";
     }
 }
 
@@ -8104,6 +8138,13 @@ __generic<T : __BuiltinIntegerType, let N : int>
 [require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
 vector<uint, N> countbits(vector<T, N> value)
 {
+    // Emulate 8-bit support
+    // 8-bit support is not currently supported anywhere natively
+    if (T is int8_t || T  is uint8_t)
+    {
+        VECTOR_MAP_UNARY(uint, N, countbits, value);
+    }
+
     __target_switch
     {
     case hlsl:
@@ -8111,9 +8152,9 @@ vector<uint, N> countbits(vector<T, N> value)
     case glsl:
         __intrinsic_asm "bitCount";
     case metal:
-        if(T is int64_t || T  is uint64_t)
+        if(T is int64_t || T  is uint64_t || T is int16_t || T  is uint16_t)
         {
-            // emulate 64-bit
+            // Emulate 64-bit and 16-bit
             VECTOR_MAP_UNARY(uint, N, countbits, value);
         }
         else
@@ -8121,9 +8162,9 @@ vector<uint, N> countbits(vector<T, N> value)
             __intrinsic_asm "popcount";
         }
     case spirv:
-        if(T is int64_t || T  is uint64_t)
+        if(T is int64_t || T  is uint64_t || T is int16_t || T  is uint16_t)
         {
-            // emulate 64-bit
+            // Emulate 64-bit and 16-bit
             VECTOR_MAP_UNARY(uint, N, countbits, value);
         }
         else
@@ -8131,7 +8172,17 @@ vector<uint, N> countbits(vector<T, N> value)
             return spirv_asm {OpBitCount $$vector<uint, N> result $value};
         }
     case wgsl:
-        __intrinsic_asm "countOneBits";
+        // WGSL provides only 32-bit integers; signed lanes are counted through the uint overload.
+        if (T is int32_t)
+        {
+            vector<uint32_t, N> counts;
+            for (int lane = 0; lane < N; lane++)
+            {
+                counts[lane] = countbits(__intCast<uint32_t>(value[lane]));
+            }
+            return counts;
+        }
+        __intrinsic_asm "countOneBits";
     default:
         VECTOR_MAP_UNARY(uint, N, countbits, value);
     }
diff --git a/tests/hlsl-intrinsic/countbits.slang b/tests/hlsl-intrinsic/countbits.slang
index da6828e87..060ad98f4 100644
--- a/tests/hlsl-intrinsic/countbits.slang
+++ b/tests/hlsl-intrinsic/countbits.slang
@@ -2,8 +2,10 @@
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx11
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -vk -compute
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -mtl -compute
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -wgpu -compute
+// TODO: test GLSL pathway once emit-spirv-via-glsl is fixed and shader output reading is fixed for GLSL
 
 //CHK:1
 
diff --git a/tests/hlsl-intrinsic/countbits16.slang b/tests/hlsl-intrinsic/countbits16.slang
new file mode 100644
index 000000000..dbfdc9217
--- /dev/null
+++ b/tests/hlsl-intrinsic/countbits16.slang
@@ -0,0 +1,47 @@
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -cpu
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
+//TODO: metal is currently failing even with emulation, investigate.
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -mtl -compute -profile metallib_2_4
+// No support for uint16_t on fxc - we need SM6.2 and dxil to use uint16_t with d3d12
+// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_2 -use-dxil -shaderobj -render-feature hardware-device
+// wgpu only has 32-bit support, so we do not try and test it here
+// TODO: test GLSL pathway once emit-spirv-via-glsl is fixed and shader output reading is fixed for GLSL
+
+//CHK:1
+
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) // Exercises countbits on 16-bit scalars and vectors.
+{
+    uint r1 = countbits(uint16_t(0b1U) << 8); // unsigned operands: scalar, then 2/3/4-wide vectors
+    uint2 r2 = countbits(uint16_t2(uint16_t(0b0U) << 8, uint16_t(0b1U) << 8));
+    uint3 r3 = countbits(uint16_t3(uint16_t(0b0U) << 8, uint16_t(0b1U) << 8, uint16_t(0b11U) << 8));
+    uint4 r4 = countbits(uint16_t4(uint16_t(0b0U) << 8, uint16_t(0b1U) << 8, uint16_t(0b11U) << 8, uint16_t(0b111U) << 8));
+    // Same bit patterns with signed 16-bit operands.
+    uint r5 = countbits(int16_t(0b1) << 8);
+    uint2 r6 = countbits(int16_t2(int16_t(0b0) << 8, int16_t(0b1) << 8));
+    uint3 r7 = countbits(int16_t3(int16_t(0b0) << 8, int16_t(0b1) << 8, int16_t(0b11) << 8));
+    uint4 r8 = countbits(int16_t4(int16_t(0b0) << 8, int16_t(0b1) << 8, int16_t(0b11) << 8, int16_t(0b111) << 8));
+    // Shift by the full 16-bit width; the final check expects a count of 0 for both values.
+    uint16_t smallShiftU16 = uint16_t(0b111) << 16;
+    int16_t smallShiftI16 = int16_t(0b1111) << 16;
+
+    uint bitCountBigShiftU16 = countbits(smallShiftU16);
+    uint bitCountBigShiftI16 = countbits(smallShiftI16);
+    // The stored value is the conjunction of every expected-count comparison below.
+    outputBuffer[0] = true
+        && (r1 == 1)
+        && (r2.x == 0 && r2.y == 1)
+        && (r3.x == 0 && r3.y == 1 && r3.z == 2)
+        && (r4.x == 0 && r4.y == 1 && r4.z == 2 && r4.w == 3)
+        && (r5 == 1)
+        && (r6.x == 0 && r6.y == 1)
+        && (r7.x == 0 && r7.y == 1 && r7.z == 2)
+        && (r8.x == 0 && r8.y == 1 && r8.z == 2 && r8.w == 3)
+        && (bitCountBigShiftU16 == 0 && bitCountBigShiftI16 == 0)
+        ;
+}
diff --git a/tests/hlsl-intrinsic/countbits64.slang b/tests/hlsl-intrinsic/countbits64.slang
index a24b31477..90799e411 100644
--- a/tests/hlsl-intrinsic/countbits64.slang
+++ b/tests/hlsl-intrinsic/countbits64.slang
@@ -1,10 +1,14 @@
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -cpu
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -vk -compute -render-feature int64
+// emit-spirv-via-glsl is currently ignored, but even working around this, output does not appear to be captured for GLSL
+// No support for uint64_t in GLSL without an extension like GL_EXT_shader_explicit_arithmetic_types_int64
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -vk -compute -render-feature int64 -emit-spirv-via-glsl -profile GLSL_400 -Xslang... -capability GL_EXT_shader_explicit_arithmetic_types_int64.
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -mtl -compute
 // No support for uint64_t on fxc - we need SM6.0 and dxil to use uint64_t with d3d12
 // https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
 //TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -render-feature hardware-device
+// wgpu only has 32-bit support, so we do not try and test it here
 
 //CHK:1
 
diff --git a/tests/hlsl-intrinsic/countbits8.slang b/tests/hlsl-intrinsic/countbits8.slang
new file mode 100644
index 000000000..1db8e805c
--- /dev/null
+++ b/tests/hlsl-intrinsic/countbits8.slang
@@ -0,0 +1,46 @@
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -cpu
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
+//TODO: metal is currently failing even with emulation, investigate.
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -mtl -compute -profile metallib_2_4
+// Not testing the following:
+// -dx12/hlsl, No support for uint8_t with hlsl
+// -wgpu, only has 32-bit support
+// -vk/glsl, No support for uint8_t with glsl
+
+//CHK:1
+
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID) // Exercises countbits on 8-bit scalars and vectors.
+{
+    uint r1 = countbits(uint8_t(0b1U) << 4); // unsigned operands: scalar, then 2/3/4-wide vectors
+    uint2 r2 = countbits(uint8_t2(uint8_t(0b0U) << 4, uint8_t(0b1U) << 4));
+    uint3 r3 = countbits(uint8_t3(uint8_t(0b0U) << 4, uint8_t(0b1U) << 4, uint8_t(0b11U) << 4));
+    uint4 r4 = countbits(uint8_t4(uint8_t(0b0U) << 4, uint8_t(0b1U) << 4, uint8_t(0b11U) << 4, uint8_t(0b111U) << 4));
+    // Same bit patterns with signed 8-bit operands.
+    uint r5 = countbits(int8_t(0b1) << 4);
+    uint2 r6 = countbits(int8_t2(int8_t(0b0) << 4, int8_t(0b1) << 4));
+    uint3 r7 = countbits(int8_t3(int8_t(0b0) << 4, int8_t(0b1) << 4, int8_t(0b11) << 4));
+    uint4 r8 = countbits(int8_t4(int8_t(0b0) << 4, int8_t(0b1) << 4, int8_t(0b11) << 4, int8_t(0b111) << 4));
+    // Shift by the full 8-bit width; the final check expects a count of 0 for both values.
+    uint8_t smallShiftU8 = uint8_t(0b111) << 8;
+    int8_t smallShiftI8 = int8_t(0b1111) << 8;
+
+    uint bitCountBigShiftU8 = countbits(smallShiftU8);
+    uint bitCountBigShiftI8 = countbits(smallShiftI8);
+    // The stored value is the conjunction of every expected-count comparison below.
+    outputBuffer[0] = true
+        && (r1 == 1)
+        && (r2.x == 0 && r2.y == 1)
+        && (r3.x == 0 && r3.y == 1 && r3.z == 2)
+        && (r4.x == 0 && r4.y == 1 && r4.z == 2 && r4.w == 3)
+        && (r5 == 1)
+        && (r6.x == 0 && r6.y == 1)
+        && (r7.x == 0 && r7.y == 1 && r7.z == 2)
+        && (r8.x == 0 && r8.y == 1 && r8.z == 2 && r8.w == 3)
+        && (bitCountBigShiftU8 == 0 && bitCountBigShiftI8 == 0)
+        ;
+}
author	sricker-nvidia <115114531+sricker-nvidia@users.noreply.github.com>	2025-05-05 15:30:33 -0700
committer	GitHub <noreply@github.com>	2025-05-05 22:30:33 +0000
commit	50d9781b7387b0f7f56d19c72afcf390cca72b72 (patch)
tree	7b6f1401f7a8257fa378930a052ca63f0fda91f4
parent	698e43372cefe0fff13150925aeb7f389c21a938 (diff)