From dcda42e7dcdb5e260013757763bf5dbf67d69568 Mon Sep 17 00:00:00 2001
From: "James Helferty (NVIDIA)" <jhelferty@nvidia.com>
Date: Fri, 15 Aug 2025 09:21:48 -0700
Subject: Use 64-bit int instead of emulation on Metal (#8180)

Metal's popcount prototype is `T popcount(T x)` but we want to use it to
implement `countbits` where the prototype always returns `uint`.

Using `popcount` directly works whenever its result implicitly converts
to the 32-bit return type, which is every case except when the argument
is a 64-bit type. Thus, this change always explicitly casts the result
to `$TR`, which should be one of the `uint[N]` types and should always
be able to hold the number of bits in the argument type.

Addresses #6877
---
 source/slang/hlsl.meta.slang             | 24 ++----------------------
 tests/hlsl-intrinsic/scalar-int64.slang  |  3 ++-
 tests/hlsl-intrinsic/scalar-uint64.slang |  3 ++-
 3 files changed, 6 insertions(+), 24 deletions(-)
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 78f6a4eb8..0d5b8cb1f 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -8190,19 +8190,7 @@ uint countbits(T value)
             __intrinsic_asm "bitCount";
         }
     case metal:
-        if (T is int64_t || T  is uint64_t)
-        {
-            return __emulatedCountbits64(__intCast<uint64_t>(value));
-        }
-        else if (T is int16_t || T  is uint16_t)
-        {
-            // emulate 16-bit
-            return countbits(__intCast<uint32_t>(value));
-        }
-        else
-        {
-            __intrinsic_asm "popcount";
-        }
+        __intrinsic_asm "($TR)popcount($0)";
     case cuda:
     case cpp:
         __intrinsic_asm "$P_countbits($0)";
@@ -8262,15 +8250,7 @@ vector<uint, N> countbits(vector<T, N> value)
             __intrinsic_asm "bitCount";
         }
     case metal:
-        if(T is int64_t || T  is uint64_t || T is int16_t || T  is uint16_t)
-        {
-            // Emulate 64-bit and 16-bit
-            VECTOR_MAP_UNARY(uint, N, countbits, value);
-        }
-        else
-        {
-            __intrinsic_asm "popcount";
-        }
+        __intrinsic_asm "($TR)popcount($0)";
     case spirv:
         if(T is int64_t || T  is uint64_t || T is int16_t || T  is uint16_t)
         {
diff --git a/tests/hlsl-intrinsic/scalar-int64.slang b/tests/hlsl-intrinsic/scalar-int64.slang
index f4518f198..f029d5da8 100644
--- a/tests/hlsl-intrinsic/scalar-int64.slang
+++ b/tests/hlsl-intrinsic/scalar-int64.slang
@@ -6,6 +6,7 @@
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -profile cs_6_0 -dx12 -use-dxil -shaderobj -render-feature hardware-device
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature int64
 //TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
+//TEST(compute, metal):COMPARE_COMPUTE_EX:-slang -compute -mtl
 
 //TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
 RWStructuredBuffer<int> outputBuffer;
@@ -38,4 +39,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     int64_t v = (ti * 0x400010035435435ll) / 3ll + 7ll - 9ll; 
     
     outputBuffer[uint(idx)] = int(v) ^ int(((v >> 32) & 0xffffffff)); 
-}
\ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/scalar-uint64.slang b/tests/hlsl-intrinsic/scalar-uint64.slang
index f75dd8acc..e790452a6 100644
--- a/tests/hlsl-intrinsic/scalar-uint64.slang
+++ b/tests/hlsl-intrinsic/scalar-uint64.slang
@@ -7,6 +7,7 @@
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -profile cs_6_0 -use-dxil -shaderobj -render-feature hardware-device
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature int64
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
+//TEST(compute, metal):COMPARE_COMPUTE_EX:-slang -compute -mtl
 
 //TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name outputBuffer
 RWStructuredBuffer<int> outputBuffer;
@@ -44,4 +45,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
     v = max(u, v);
       
     outputBuffer[dispatchThreadID.x] = int(v) ^ int(v >> 32); 
-}
\ No newline at end of file
+}
-- 
cgit v1.2.3