Initial partial support for WaveXXX intrinsics on CUDA (#1228)

* Start work on wave intrinsics for CUDA. * Add prelimary CUDA support for some Wave intrinsics. Document the issue around WaveGetLaneIndex
author: jsmall-nvidia <jsmall@nvidia.com> 2020-02-19 14:16:38 -0500
committer: GitHub <noreply@github.com> 2020-02-19 11:16:38 -0800
commit: 46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch)
tree: f6581768499187bf1572079d65b99c15a95e7743 /prelude
parent: 1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff)
1 files changed, 36 insertions, 0 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 768e33d13..b81acba1e 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -7,6 +7,12 @@
 // For now we'll disable any asserts in this prelude
 #define SLANG_PRELUDE_ASSERT(x) 
 
+#ifndef SLANG_CUDA_WARP_SIZE 
+#   define SLANG_CUDA_WARP_SIZE 32
+#endif
+
+#define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1)
+
 //
 #define SLANG_FORCE_INLINE inline
 
@@ -112,6 +118,36 @@ union Union64
     double d;
 };
 
+// ---------------------- Miscellaneous --------------------------------------
+
+// TODO(JS): It appears that cuda does not have a simple way to get a lane index. 
+// 
+// Another approach could be... 
+// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK
+// If that is really true another way to do this, would be for code generator to add this function 
+// with the [numthreads] baked in. 
+// 
+// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex()
+#ifndef SLANG_USE_ASM_LANE_ID
+ __forceinline__ __device__ uint32_t _getLaneId()
+{
+    // If the launch is (or I guess some multiple of the warp size) 
+    // we try this mechanism, which is apparently faster. 
+    return threadIdx.x & SLANG_CUDA_WARP_MASK;
+}
+#else
+__forceinline__ __device__ uint32_t _getLaneId()
+{
+    // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid#
+    // This mechanism is not the fastest way to do it, and that is why the other mechanism 
+    // is the default. But the other mechanism relies on a launch that makes the assumption 
+    // true.
+    unsigned ret; 
+    asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret));
+    return ret;
+}
+#endif
+
 // ----------------------------- F32 -----------------------------------------
 
 // Unary
author	jsmall-nvidia <jsmall@nvidia.com>	2020-02-19 14:16:38 -0500
committer	GitHub <noreply@github.com>	2020-02-19 11:16:38 -0800
commit	46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch)
tree	f6581768499187bf1572079d65b99c15a95e7743 /prelude
parent	1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff)