summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorsricker-nvidia <115114531+sricker-nvidia@users.noreply.github.com>2025-05-05 15:30:33 -0700
committerGitHub <noreply@github.com>2025-05-05 22:30:33 +0000
commit50d9781b7387b0f7f56d19c72afcf390cca72b72 (patch)
tree7b6f1401f7a8257fa378930a052ca63f0fda91f4
parent698e43372cefe0fff13150925aeb7f389c21a938 (diff)
Add countbits 16-bit and 8-bit support (#6433) (#6897)
This change adds 16-bit and 8-bit support for the countbits intrinsic. Where a backend's native countbits lacks support for a given width, the operation is emulated. New tests are added for 16-bit and 8-bit support, additional testing is added for 32-bit, and minor updates are made to the 64-bit countbits tests.
-rw-r--r--prelude/slang-cpp-scalar-intrinsics.h97
-rw-r--r--prelude/slang-cuda-prelude.h97
-rw-r--r--source/slang/hlsl.meta.slang81
-rw-r--r--tests/hlsl-intrinsic/countbits.slang4
-rw-r--r--tests/hlsl-intrinsic/countbits16.slang47
-rw-r--r--tests/hlsl-intrinsic/countbits64.slang4
-rw-r--r--tests/hlsl-intrinsic/countbits8.slang46
7 files changed, 290 insertions, 86 deletions
diff --git a/prelude/slang-cpp-scalar-intrinsics.h b/prelude/slang-cpp-scalar-intrinsics.h
index 0a19eb327..9b045941a 100644
--- a/prelude/slang-cpp-scalar-intrinsics.h
+++ b/prelude/slang-cpp-scalar-intrinsics.h
@@ -628,45 +628,13 @@ SLANG_FORCE_INLINE double F64_calcSafeRadians(double radians)
return (a * (SLANG_PRELUDE_PI * 2));
}
-// ----------------------------- I32 -----------------------------------------
-
-SLANG_FORCE_INLINE int32_t I32_abs(int32_t f)
-{
- return (f < 0) ? -f : f;
-}
-
-SLANG_FORCE_INLINE int32_t I32_min(int32_t a, int32_t b)
-{
- return a < b ? a : b;
-}
-SLANG_FORCE_INLINE int32_t I32_max(int32_t a, int32_t b)
-{
- return a > b ? a : b;
-}
-
-SLANG_FORCE_INLINE float I32_asfloat(int32_t x)
-{
- Union32 u;
- u.i = x;
- return u.f;
-}
-SLANG_FORCE_INLINE uint32_t I32_asuint(int32_t x)
-{
- return uint32_t(x);
-}
-SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi)
-{
- Union64 u;
- u.u = (uint64_t(hi) << 32) | uint32_t(low);
- return u.d;
-}
-
-SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v)
+// ----------------------------- U16 -----------------------------------------
+SLANG_FORCE_INLINE uint32_t U16_countbits(uint16_t v)
{
#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)
return __builtin_popcount(uint32_t(v));
#elif SLANG_PROCESSOR_X86_64 && SLANG_VC
- return __popcnt(uint32_t(v));
+ return __popcnt16(v);
#else
uint32_t c = 0;
while (v)
@@ -678,6 +646,25 @@ SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v)
#endif
}
+// ----------------------------- I16 -----------------------------------------
+SLANG_FORCE_INLINE uint32_t I16_countbits(int16_t v)
+{
+ return U16_countbits(uint16_t(v));
+}
+
+// ----------------------------- U8 -----------------------------------------
+SLANG_FORCE_INLINE uint32_t U8_countbits(uint8_t v)
+{
+ // No native 8bit __popcnt yet, just cast and use 16bit variant
+ return U16_countbits(uint16_t(v));
+}
+
+// ----------------------------- I8 -----------------------------------------
+SLANG_FORCE_INLINE uint32_t I8_countbits(int8_t v)
+{
+ return U8_countbits(uint8_t(v)); // convert to uint8_t so only the 8 value bits are counted
+}
+
// ----------------------------- U32 -----------------------------------------
SLANG_FORCE_INLINE uint32_t U32_abs(uint32_t f)
@@ -730,6 +717,44 @@ SLANG_FORCE_INLINE uint32_t U32_countbits(uint32_t v)
#endif
}
+// ----------------------------- I32 -----------------------------------------
+
+SLANG_FORCE_INLINE int32_t I32_abs(int32_t f)
+{
+ return (f < 0) ? -f : f;
+}
+
+SLANG_FORCE_INLINE int32_t I32_min(int32_t a, int32_t b)
+{
+ return a < b ? a : b;
+}
+SLANG_FORCE_INLINE int32_t I32_max(int32_t a, int32_t b)
+{
+ return a > b ? a : b;
+}
+
+SLANG_FORCE_INLINE float I32_asfloat(int32_t x)
+{
+ Union32 u;
+ u.i = x;
+ return u.f;
+}
+SLANG_FORCE_INLINE uint32_t I32_asuint(int32_t x)
+{
+ return uint32_t(x);
+}
+SLANG_FORCE_INLINE double I32_asdouble(int32_t low, int32_t hi)
+{
+ Union64 u;
+ u.u = (uint64_t(hi) << 32) | uint32_t(low);
+ return u.d;
+}
+
+SLANG_FORCE_INLINE uint32_t I32_countbits(int32_t v)
+{
+ return U32_countbits(uint32_t(v));
+}
+
// ----------------------------- U64 -----------------------------------------
SLANG_FORCE_INLINE uint64_t U64_abs(uint64_t f)
@@ -749,7 +774,7 @@ SLANG_FORCE_INLINE uint64_t U64_max(uint64_t a, uint64_t b)
SLANG_FORCE_INLINE uint32_t U64_countbits(uint64_t v)
{
#if SLANG_GCC_FAMILY && !defined(SLANG_LLVM)
- return uint32_t(__builtin_popcountl(v));
+ return uint32_t(__builtin_popcountll(v));
#elif SLANG_PROCESSOR_X86_64 && SLANG_VC
return uint32_t(__popcnt64(v));
#else
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 738f2fa16..91ff98a17 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -1788,44 +1788,34 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL double F64_fma(double a, double b, double c)
return ::fma(a, b, c);
}
-// ----------------------------- I32 -----------------------------------------
+// ----------------------------- U8 -----------------------------------------
-// Unary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_abs(int32_t f)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U8_countbits(uint8_t v)
{
- return (f < 0) ? -f : f;
+ // No native 8bit popc yet, just cast and use 32bit variant
+ return __popc(uint32_t(v));
}
-// Binary
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b)
-{
- return a < b ? a : b;
-}
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b)
-{
- return a > b ? a : b;
-}
+// ----------------------------- I8 -----------------------------------------
-SLANG_FORCE_INLINE SLANG_CUDA_CALL float I32_asfloat(int32_t x)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I8_countbits(int8_t v)
{
- Union32 u;
- u.i = x;
- return u.f;
+ return U8_countbits(uint8_t(v));
}
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x)
-{
- return uint32_t(x);
-}
-SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi)
+
+// ----------------------------- U16 -----------------------------------------
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U16_countbits(uint16_t v)
{
- Union64 u;
- u.u = (uint64_t(hi) << 32) | uint32_t(low);
- return u.d;
+ // No native 16bit popc yet, just cast and use 32bit variant
+ return __popc(uint32_t(v));
}
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_countbits(int32_t v)
+// ----------------------------- I16 -----------------------------------------
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I16_countbits(int16_t v)
{
- return __popc(uint32_t(v));
+ return U16_countbits(uint16_t(v));
}
// ----------------------------- U32 -----------------------------------------
@@ -1870,26 +1860,44 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U32_countbits(uint32_t v)
return __popc(v);
}
+// ----------------------------- I32 -----------------------------------------
-// ----------------------------- I64 -----------------------------------------
-
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_abs(int64_t f)
+// Unary
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_abs(int32_t f)
{
return (f < 0) ? -f : f;
}
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b)
+// Binary
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_min(int32_t a, int32_t b)
{
return a < b ? a : b;
}
-SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int32_t I32_max(int32_t a, int32_t b)
{
return a > b ? a : b;
}
-SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I64_countbits(int64_t v)
+SLANG_FORCE_INLINE SLANG_CUDA_CALL float I32_asfloat(int32_t x)
+{
+ Union32 u;
+ u.i = x;
+ return u.f;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_asuint(int32_t x)
+{
+ return uint32_t(x);
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL double I32_asdouble(int32_t low, int32_t hi)
+{
+ Union64 u;
+ u.u = (uint64_t(hi) << 32) | uint32_t(low);
+ return u.d;
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I32_countbits(int32_t v)
{
- return __popcll(uint64_t(v));
+ return U32_countbits(uint32_t(v));
}
// ----------------------------- U64 -----------------------------------------
@@ -1914,6 +1922,27 @@ SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t U64_countbits(uint64_t v)
return __popcll(v);
}
+// ----------------------------- I64 -----------------------------------------
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_abs(int64_t f)
+{
+ return (f < 0) ? -f : f;
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_min(int64_t a, int64_t b)
+{
+ return a < b ? a : b;
+}
+SLANG_FORCE_INLINE SLANG_CUDA_CALL int64_t I64_max(int64_t a, int64_t b)
+{
+ return a > b ? a : b;
+}
+
+SLANG_FORCE_INLINE SLANG_CUDA_CALL uint32_t I64_countbits(int64_t v)
+{
+ return U64_countbits(uint64_t(v));
+}
+
// ----------------------------- IPTR -----------------------------------------
SLANG_FORCE_INLINE SLANG_CUDA_CALL intptr_t IPTR_abs(intptr_t f)
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 0f04006e5..07160ae9d 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -8047,6 +8047,16 @@ vector<T,N> cospi(vector<T,N> x)
}
}
+// emulate 64-bit countbits when not natively supported.
+[__readNone]
+[ForceInline]
+[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
+internal uint __emulatedCountbits64(uint64_t value)
+{
+ uint2 value_uint2 = bit_cast<uint2>(value);
+ uint2 counted_bits_uint2 = countbits(value_uint2);
+ return counted_bits_uint2.x + counted_bits_uint2.y;
+}
/// Population count.
/// Counts the number of set bits in the binary representation of a value.
@@ -8060,19 +8070,32 @@ vector<T,N> cospi(vector<T,N> x)
__generic<T : __BuiltinIntegerType>
uint countbits(T value)
{
+ // Emulate 8-bit support
+ // No target currently supports 8-bit countbits natively
+ if (T is int8_t || T is uint8_t)
+ {
+ return countbits(__intCast<uint32_t>(value));
+ }
+
__target_switch
{
case hlsl:
+ // 64-bit support dependent on SM6.0 and dxil
+ // 16-bit support dependent on SM6.2 and dxil
__intrinsic_asm "countbits";
case glsl:
+ // 64-bit support dependent on GL_ARB_gpu_shader_int64
+ // 16-bit support dependent on GL_EXT_shader_16bit_storage
__intrinsic_asm "bitCount";
case metal:
- if(T is int64_t || T is uint64_t)
+ if (T is int64_t || T is uint64_t)
+ {
+ return __emulatedCountbits64(__intCast<uint64_t>(value));
+ }
+ else if (T is int16_t || T is uint16_t)
{
- // emulate 64-bit
- uint2 value_uint2 = bit_cast<uint2>(value);
- uint2 counted_bits_uint2 = countbits(value_uint2);
- return counted_bits_uint2.x + counted_bits_uint2.y;
+ // emulate 16-bit
+ return countbits(__intCast<uint32_t>(value));
}
else
{
@@ -8084,17 +8107,28 @@ uint countbits(T value)
case spirv:
if(T is int64_t || T is uint64_t)
{
- // emulate 64-bit
- uint2 value_uint2 = bit_cast<uint2>(value);
- uint2 counted_bits_uint2 = countbits(value_uint2);
- return counted_bits_uint2.x + counted_bits_uint2.y;
+ return __emulatedCountbits64(__intCast<uint64_t>(value));
+ }
+ else if (T is int16_t || T is uint16_t)
+ {
+ // emulate 16-bit
+ return countbits(__intCast<uint32_t>(value));
}
else
{
+ // OpBitCount only supports 32-bit
return spirv_asm {OpBitCount $$uint result $value};
}
case wgsl:
- __intrinsic_asm "countOneBits";
+ // wgsl only supports 32-bit integers
+ if (T is int32_t)
+ {
+ // wgsl countOneBits returns the same type as the
+ // one it was given. Cast signed ints to unsigned
+ // so we can provide the correct return value.
+ return countbits(__intCast<uint32_t>(value));
+ }
+ __intrinsic_asm "countOneBits";
}
}
@@ -8104,6 +8138,13 @@ __generic<T : __BuiltinIntegerType, let N : int>
[require(cpp_cuda_glsl_hlsl_metal_spirv_wgsl, shader5_sm_5_0)]
vector<uint, N> countbits(vector<T, N> value)
{
+ // Emulate 8-bit support
+ // No target currently supports 8-bit countbits natively
+ if (T is int8_t || T is uint8_t)
+ {
+ VECTOR_MAP_UNARY(uint, N, countbits, value);
+ }
+
__target_switch
{
case hlsl:
@@ -8111,9 +8152,9 @@ vector<uint, N> countbits(vector<T, N> value)
case glsl:
__intrinsic_asm "bitCount";
case metal:
- if(T is int64_t || T is uint64_t)
+ if(T is int64_t || T is uint64_t || T is int16_t || T is uint16_t)
{
- // emulate 64-bit
+ // Emulate 64-bit and 16-bit
VECTOR_MAP_UNARY(uint, N, countbits, value);
}
else
@@ -8121,9 +8162,9 @@ vector<uint, N> countbits(vector<T, N> value)
__intrinsic_asm "popcount";
}
case spirv:
- if(T is int64_t || T is uint64_t)
+ if(T is int64_t || T is uint64_t || T is int16_t || T is uint16_t)
{
- // emulate 64-bit
+ // Emulate 64-bit and 16-bit
VECTOR_MAP_UNARY(uint, N, countbits, value);
}
else
@@ -8131,7 +8172,17 @@ vector<uint, N> countbits(vector<T, N> value)
return spirv_asm {OpBitCount $$vector<uint, N> result $value};
}
case wgsl:
- __intrinsic_asm "countOneBits";
+ // wgsl only supports 32-bit integers
+ if (T is int32_t)
+ {
+ vector<uint32_t, N> ret;
+ for (int i = 0; i < N; i++)
+ {
+ ret[i] = countbits(__intCast<uint32_t>(value[i]));
+ }
+ return ret;
+ }
+ __intrinsic_asm "countOneBits";
default:
VECTOR_MAP_UNARY(uint, N, countbits, value);
}
diff --git a/tests/hlsl-intrinsic/countbits.slang b/tests/hlsl-intrinsic/countbits.slang
index da6828e87..060ad98f4 100644
--- a/tests/hlsl-intrinsic/countbits.slang
+++ b/tests/hlsl-intrinsic/countbits.slang
@@ -2,8 +2,10 @@
//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx11
//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12
//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -vk -compute
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -mtl -compute
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -wgpu -compute
+// TODO: test GLSL pathway once emit-spirv-via-glsl is fixed and shader output reading is fixed for GLSL
//CHK:1
diff --git a/tests/hlsl-intrinsic/countbits16.slang b/tests/hlsl-intrinsic/countbits16.slang
new file mode 100644
index 000000000..dbfdc9217
--- /dev/null
+++ b/tests/hlsl-intrinsic/countbits16.slang
@@ -0,0 +1,47 @@
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -cpu
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
+//TODO: metal is currently failing even with emulation, investigate.
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -mtl -compute -profile metallib_2_4
+// No support for uint16_t on fxc - we need SM6.2 and dxil to use uint16_t with d3d12
+// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_2 -use-dxil -shaderobj -render-feature hardware-device
+// wgpu only has 32-bit support, so we do not try and test it here
+// TODO: test GLSL pathway once emit-spirv-via-glsl is fixed and shader output reading is fixed for GLSL
+
+//CHK:1
+
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ uint r1 = countbits(uint16_t(0b1U) << 8);
+ uint2 r2 = countbits(uint16_t2(uint16_t(0b0U) << 8, uint16_t(0b1U) << 8));
+ uint3 r3 = countbits(uint16_t3(uint16_t(0b0U) << 8, uint16_t(0b1U) << 8, uint16_t(0b11U) << 8));
+ uint4 r4 = countbits(uint16_t4(uint16_t(0b0U) << 8, uint16_t(0b1U) << 8, uint16_t(0b11U) << 8, uint16_t(0b111U) << 8));
+
+ uint r5 = countbits(int16_t(0b1) << 8);
+ uint2 r6 = countbits(int16_t2(int16_t(0b0) << 8, int16_t(0b1) << 8));
+ uint3 r7 = countbits(int16_t3(int16_t(0b0) << 8, int16_t(0b1) << 8, int16_t(0b11) << 8));
+ uint4 r8 = countbits(int16_t4(int16_t(0b0) << 8, int16_t(0b1) << 8, int16_t(0b11) << 8, int16_t(0b111) << 8));
+
+ uint16_t smallShiftU16 = uint16_t(0b111) << 16;
+ int16_t smallShiftI16 = int16_t(0b1111) << 16;
+
+ uint bitCountBigShiftU16 = countbits(smallShiftU16);
+ uint bitCountBigShiftI16 = countbits(smallShiftI16);
+
+ outputBuffer[0] = true
+ && (r1 == 1)
+ && (r2.x == 0 && r2.y == 1)
+ && (r3.x == 0 && r3.y == 1 && r3.z == 2)
+ && (r4.x == 0 && r4.y == 1 && r4.z == 2 && r4.w == 3)
+ && (r5 == 1)
+ && (r6.x == 0 && r6.y == 1)
+ && (r7.x == 0 && r7.y == 1 && r7.z == 2)
+ && (r8.x == 0 && r8.y == 1 && r8.z == 2 && r8.w == 3)
+ && (bitCountBigShiftU16 == 0 && bitCountBigShiftI16 == 0)
+ ;
+}
diff --git a/tests/hlsl-intrinsic/countbits64.slang b/tests/hlsl-intrinsic/countbits64.slang
index a24b31477..90799e411 100644
--- a/tests/hlsl-intrinsic/countbits64.slang
+++ b/tests/hlsl-intrinsic/countbits64.slang
@@ -1,10 +1,14 @@
//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -cpu
//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -vk -compute -render-feature int64
+// emit-spirv-via-glsl is currently ignored, but even working around this, output does not appear to be captured for GLSL
+// No support for uint64_t in GLSL without an extension like GL_EXT_shader_explicit_arithmetic_types_int64
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -vk -compute -render-feature int64 -emit-spirv-via-glsl -profile GLSL_400 -Xslang... -capability GL_EXT_shader_explicit_arithmetic_types_int64.
//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -mtl -compute
// No support for uint64_t on fxc - we need SM6.0 and dxil to use uint64_t with d3d12
// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/hlsl-shader-model-6-0-features-for-direct3d-12
//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -render-feature hardware-device
+// wgpu only has 32-bit support, so we do not try and test it here
//CHK:1
diff --git a/tests/hlsl-intrinsic/countbits8.slang b/tests/hlsl-intrinsic/countbits8.slang
new file mode 100644
index 000000000..1db8e805c
--- /dev/null
+++ b/tests/hlsl-intrinsic/countbits8.slang
@@ -0,0 +1,46 @@
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -cpu
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -vk -compute
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -cuda -compute
+//TODO: metal is currently failing even with emulation, investigate.
+//DISABLE_TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -mtl -compute -profile metallib_2_4
+// Not testing the following:
+// -dx12/hlsl, No support for uint8_t with hlsl
+// -wgpu, only has 32-bit support
+// -vk/glsl, No support for uint8_t with glsl
+
+//CHK:1
+
+//TEST_INPUT:ubuffer(data=[0], stride=4):out,name=outputBuffer
+RWStructuredBuffer<uint> outputBuffer;
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ uint r1 = countbits(uint8_t(0b1U) << 4);
+ uint2 r2 = countbits(uint8_t2(uint8_t(0b0U) << 4, uint8_t(0b1U) << 4));
+ uint3 r3 = countbits(uint8_t3(uint8_t(0b0U) << 4, uint8_t(0b1U) << 4, uint8_t(0b11U) << 4));
+ uint4 r4 = countbits(uint8_t4(uint8_t(0b0U) << 4, uint8_t(0b1U) << 4, uint8_t(0b11U) << 4, uint8_t(0b111U) << 4));
+
+ uint r5 = countbits(int8_t(0b1) << 4);
+ uint2 r6 = countbits(int8_t2(int8_t(0b0) << 4, int8_t(0b1) << 4));
+ uint3 r7 = countbits(int8_t3(int8_t(0b0) << 4, int8_t(0b1) << 4, int8_t(0b11) << 4));
+ uint4 r8 = countbits(int8_t4(int8_t(0b0) << 4, int8_t(0b1) << 4, int8_t(0b11) << 4, int8_t(0b111) << 4));
+
+ uint8_t smallShiftU8 = uint8_t(0b111) << 8;
+ int8_t smallShiftI8 = int8_t(0b1111) << 8;
+
+ uint bitCountBigShiftU8 = countbits(smallShiftU8);
+ uint bitCountBigShiftI8 = countbits(smallShiftI8);
+
+ outputBuffer[0] = true
+ && (r1 == 1)
+ && (r2.x == 0 && r2.y == 1)
+ && (r3.x == 0 && r3.y == 1 && r3.z == 2)
+ && (r4.x == 0 && r4.y == 1 && r4.z == 2 && r4.w == 3)
+ && (r5 == 1)
+ && (r6.x == 0 && r6.y == 1)
+ && (r7.x == 0 && r7.y == 1 && r7.z == 2)
+ && (r8.x == 0 && r8.y == 1 && r8.z == 2 && r8.w == 3)
+ && (bitCountBigShiftU8 == 0 && bitCountBigShiftI8 == 0)
+ ;
+}