summaryrefslogtreecommitdiffstats
path: root/prelude
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2020-02-19 14:16:38 -0500
committerGitHub <noreply@github.com>2020-02-19 11:16:38 -0800
commit46a1b5f58a528bf1cd2fa2907234a9090cd7ba10 (patch)
treef6581768499187bf1572079d65b99c15a95e7743 /prelude
parent1d9152bd2d0b1234680ce6a9f7ef940d7f179e9a (diff)
Initial partial support for WaveXXX intrinsics on CUDA (#1228)
* Start work on wave intrinsics for CUDA. * Add prelimary CUDA support for some Wave intrinsics. Document the issue around WaveGetLaneIndex
Diffstat (limited to 'prelude')
-rw-r--r--prelude/slang-cuda-prelude.h36
1 files changed, 36 insertions, 0 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 768e33d13..b81acba1e 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -7,6 +7,12 @@
// For now we'll disable any asserts in this prelude
#define SLANG_PRELUDE_ASSERT(x)
+#ifndef SLANG_CUDA_WARP_SIZE
+# define SLANG_CUDA_WARP_SIZE 32
+#endif
+
+#define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1)
+
//
#define SLANG_FORCE_INLINE inline
@@ -112,6 +118,36 @@ union Union64
double d;
};
+// ---------------------- Miscellaneous --------------------------------------
+
+// TODO(JS): It appears that cuda does not have a simple way to get a lane index.
+//
+// Another approach could be...
+// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK
+// If that is really true another way to do this, would be for code generator to add this function
+// with the [numthreads] baked in.
+//
+// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex()
+#ifndef SLANG_USE_ASM_LANE_ID
+ __forceinline__ __device__ uint32_t _getLaneId()
+{
+ // If the launch is (or I guess some multiple of the warp size)
+ // we try this mechanism, which is apparently faster.
+ return threadIdx.x & SLANG_CUDA_WARP_MASK;
+}
+#else
+__forceinline__ __device__ uint32_t _getLaneId()
+{
+ // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid#
+ // This mechanism is not the fastest way to do it, and that is why the other mechanism
+ // is the default. But the other mechanism relies on a launch that makes the assumption
+ // true.
+ unsigned ret;
+ asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret));
+ return ret;
+}
+#endif
+
// ----------------------------- F32 -----------------------------------------
// Unary