From 54acb11b1c8b6af2504ff3a3e0f56ca8baba4753 Mon Sep 17 00:00:00 2001
From: Mukund Keshava <mkeshava@nvidia.com>
Date: Wed, 30 Apr 2025 17:50:50 +0530
Subject: cuda: Improve entry handling for SV_DispatchThreadID (#6925)

* cuda: Improve entry handling for SV_DispatchThreadID

Fixes #6780

This commit improves CUDA entry point handling by extracting appropriate
components from DispatchThreadID based on parameter type. It now
properly handles uint scalar (x component only) and uint2 vector (x,y
components) instead of always using the full uint3 value.
Add a new test case to check for this.

* format code

* fix CI failure

* Handle review comments

---------

Co-authored-by: slangbot <186143334+slangbot@users.noreply.github.com>
Co-authored-by: Ellie Hermaszewska <ellieh@nvidia.com>
---
 tests/cuda/dispatch-thread-id-extraction.slang | 48 ++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 tests/cuda/dispatch-thread-id-extraction.slang

(limited to 'tests')
diff --git a/tests/cuda/dispatch-thread-id-extraction.slang b/tests/cuda/dispatch-thread-id-extraction.slang
new file mode 100644
index 000000000..5fc3c89a6
--- /dev/null
+++ b/tests/cuda/dispatch-thread-id-extraction.slang
@@ -0,0 +1,48 @@
+//TEST:SIMPLE(filecheck=CHECK): -target cuda -line-directive-mode none
+
+// This test verifies that an SV_DispatchThreadID parameter declared as uint, uint2, int, or int2
+// receives the matching components of the underlying uint3 value in CUDA (plus an int SV_GroupIndex case).
+
+//TEST_INPUT: ubuffer(data=[0 0 0 0 0 0 0], stride=4):out,name cudaOutputBuffer
+RWStructuredBuffer<float> cudaOutputBuffer; // Named by the TEST_INPUT line above; no entry point in this file references it.
+
+[shader("compute")] // Case 1: SV_DispatchThreadID declared as a scalar uint.
+[numthreads(1, 1, 1)] // Expected CUDA (pattern below): only `.x` of blockIdx * blockDim + threadIdx is taken.
+void computeMain(uint tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWStructuredBuffer<uint> dst)
+{
+    dst[tid.x] = src[tid.x]; // Same index for the read and the write; `.x` is applied to the scalar tid.
+}
+// CHECK: uint _S1 = (blockIdx * blockDim + threadIdx).x;
+
+[shader("compute")] // Case 2: SV_DispatchThreadID declared as uint2.
+[numthreads(1, 1, 1)] // Expected CUDA (pattern below): a uint2 built from the `.x` and `.y` components.
+void computeMain2(uint2 tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWStructuredBuffer<uint> dst)
+{
+    dst[tid.x] = src[tid.y]; // Reads at tid.y and writes at tid.x, so both components are used.
+}
+// CHECK: uint2  _S2 = uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y};
+
+[shader("compute")] // Case 3: SV_DispatchThreadID declared as a signed int2.
+[numthreads(1, 1, 1)] // Expected CUDA (pattern below): the (x, y) uint2 is bit-cast to int2, then `.x` is taken.
+void computeMain3(int2 tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWStructuredBuffer<uint> dst)
+{
+    dst[tid.x] = src[tid.x]; // Only tid.x is used here; tid.y is never read by this entry point.
+}
+// CHECK: int _S3 = (slang_bit_cast<int2 >(uint2 {(blockIdx * blockDim + threadIdx).x, (blockIdx * blockDim + threadIdx).y})).x;
+
+
+[shader("compute")] // Case 4: SV_DispatchThreadID declared as a signed scalar int.
+[numthreads(1, 1, 1)] // Expected CUDA (pattern below): the `.x` component bit-cast to int.
+void computeMain4(int tid: SV_DispatchThreadID, StructuredBuffer<uint> src, RWStructuredBuffer<uint> dst)
+{
+    dst[tid.x] = src[tid.x]; // Same index for the read and the write; `.x` is applied to the scalar tid.
+}
+// CHECK: int _S4 = (slang_bit_cast<int>((blockIdx * blockDim + threadIdx).x));
+
+[shader("compute")] // Case 5: a different semantic, SV_GroupIndex, declared as int.
+[numthreads(1, 1, 1)] // Expected CUDA (pattern below): the index flattened from threadIdx and blockDim, bit-cast to int.
+void computeMain5(int tid: SV_GroupIndex, StructuredBuffer<uint> src, RWStructuredBuffer<uint> dst)
+{
+    dst[tid.x] = src[tid.x]; // Same index for the read and the write; `.x` is applied to the scalar tid.
+}
+// CHECK: int _S5 = (slang_bit_cast<int>((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x));
\ No newline at end of file
-- 
cgit v1.2.3