4 files changed, 81 insertions, 1 deletions
diff --git a/docs/cuda-target.md b/docs/cuda-target.md
index 743e9830c..a9b35d735 100644
--- a/docs/cuda-target.md
+++ b/docs/cuda-target.md
@@ -18,6 +18,7 @@ These limitations apply to Slang transpiling to CUDA.
 * Samplers are not separate objects in CUDA - they are combined into a single 'TextureObject'. So samplers are effectively ignored on CUDA targets. 
 * Whilst there is tex1Dfetch there are no equivalents for higher dimensions - so such accesses are not currently supported
 * When using a TextureArray (layered texture in CUDA) - the index will be treated as an int, as this is all CUDA allows
+* Care must be used in using `WaveGetLaneIndex` wave intrinsic - it will only give the right results for appopriate launches
 
 The following are a work in progress or not implmented but are planned to be so in the future
 
@@ -143,6 +144,17 @@ void setDownstreamCompilerPrelude(SlangPassThrough passThrough, const char* prel
 
 The code that sets up the prelude for the test infrastucture and command line usage can be found in ```TestToolUtil::setSessionDefaultPrelude```. Essentially this determines what the absolute path is to `slang-cpp-prelude.h` is and then just makes the prelude `#include "the absolute path"`.
 
+Limitations
+===========
+
+## WaveGetLaneIndex
+
+This defaults to `threadIdx.x & SLANG_CUDA_WARP_MASK`. Depending on how the kernel is launched this could be incorrect. 
+
+There other ways to get lane index, for example using inline assembly. This mechanism though is apparently slower than the simple method used here. 
+
+There is potential to calculate the lane id using the [numthreads] markup in Slang/HLSL, but that also requires some assumptions of how that maps to a lane index. 
+
 Language aspects
 ================
 
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 768e33d13..b81acba1e 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -7,6 +7,12 @@
 // For now we'll disable any asserts in this prelude
 #define SLANG_PRELUDE_ASSERT(x) 
 
+#ifndef SLANG_CUDA_WARP_SIZE 
+#   define SLANG_CUDA_WARP_SIZE 32
+#endif
+
+#define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1)
+
 //
 #define SLANG_FORCE_INLINE inline
 
@@ -112,6 +118,36 @@ union Union64
     double d;
 };
 
+// ---------------------- Miscellaneous --------------------------------------
+
+// TODO(JS): It appears that cuda does not have a simple way to get a lane index. 
+// 
+// Another approach could be... 
+// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK
+// If that is really true another way to do this, would be for code generator to add this function 
+// with the [numthreads] baked in. 
+// 
+// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex()
+#ifndef SLANG_USE_ASM_LANE_ID
+ __forceinline__ __device__ uint32_t _getLaneId()
+{
+    // If the launch is (or I guess some multiple of the warp size) 
+    // we try this mechanism, which is apparently faster. 
+    return threadIdx.x & SLANG_CUDA_WARP_MASK;
+}
+#else
+__forceinline__ __device__ uint32_t _getLaneId()
+{
+    // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid#
+    // This mechanism is not the fastest way to do it, and that is why the other mechanism 
+    // is the default. But the other mechanism relies on a launch that makes the assumption 
+    // true.
+    unsigned ret; 
+    asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret));
+    return ret;
+}
+#endif
+
 // ----------------------------- F32 -----------------------------------------
 
 // Unary 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 8fd2a272a..988c6f69c 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1431,14 +1431,30 @@ __generic<T : __BuiltinType> uint4 WaveMatch(T value);
 __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
 __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);
 
+// TODO(JS): For CUDA the article claims mask has to be used carefully
+// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
+// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()
+// seems to be appropriate.
+
+__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)") 
 bool WaveActiveAllTrue(bool condition);
+__target_intrinsic(cuda, "(_any_sync(__activemask(), $0) != 0)")
 bool WaveActiveAnyTrue(bool condition);
 
+__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
 uint4 WaveActiveBallot(bool condition);
+
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
 uint WaveActiveCountBits(bool value);
 
+__target_intrinsic(cuda, "(warpSize)")
 uint WaveGetLaneCount();
+
+__target_intrinsic(cuda, "_getLaneId()")
 uint WaveGetLaneIndex();
+
+// If there are no *active* lanes less than this one, we must be the lowest lane
+__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)")
 bool WaveIsFirstLane();
 
 __generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index c72374786..b39c4bbcb 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -1507,14 +1507,30 @@ SLANG_RAW("__generic<T : __BuiltinType> uint4 WaveMatch(T value);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);\n")
 SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);\n")
 SLANG_RAW("\n")
+SLANG_RAW("// TODO(JS): For CUDA the article claims mask has to be used carefully\n")
+SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n")
+SLANG_RAW("// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()\n")
+SLANG_RAW("// seems to be appropriate.\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n")
 SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(_any_sync(__activemask(), $0) != 0)\")\n")
 SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n")
 SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n")
 SLANG_RAW("uint4 WaveActiveBallot(bool condition);\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0))\")\n")
 SLANG_RAW("uint WaveActiveCountBits(bool value);\n")
 SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(warpSize)\")\n")
 SLANG_RAW("uint WaveGetLaneCount();\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n")
 SLANG_RAW("uint WaveGetLaneIndex();\n")
+SLANG_RAW("\n")
+SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n")
+SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n")
 SLANG_RAW("bool WaveIsFirstLane();\n")
 SLANG_RAW("\n")
 SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n")
@@ -1642,7 +1658,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
 
     sb << "};\n";
 }
-SLANG_RAW("#line 1569 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1585 \"hlsl.meta.slang\"")
 SLANG_RAW("\n")
 SLANG_RAW("\n")
 SLANG_RAW("\n")