summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--docs/cuda-target.md12
-rw-r--r--prelude/slang-cuda-prelude.h36
-rw-r--r--source/slang/hlsl.meta.slang16
-rw-r--r--source/slang/hlsl.meta.slang.h18
4 files changed, 81 insertions, 1 deletions
diff --git a/docs/cuda-target.md b/docs/cuda-target.md
index 743e9830c..a9b35d735 100644
--- a/docs/cuda-target.md
+++ b/docs/cuda-target.md
@@ -18,6 +18,7 @@ These limitations apply to Slang transpiling to CUDA.
* Samplers are not separate objects in CUDA - they are combined into a single 'TextureObject'. So samplers are effectively ignored on CUDA targets.
* Whilst there is tex1Dfetch there are no equivalents for higher dimensions - so such accesses are not currently supported
* When using a TextureArray (layered texture in CUDA) - the index will be treated as an int, as this is all CUDA allows
+* Care must be used in using `WaveGetLaneIndex` wave intrinsic - it will only give the right results for appopriate launches
The following are a work in progress or not implmented but are planned to be so in the future
@@ -143,6 +144,17 @@ void setDownstreamCompilerPrelude(SlangPassThrough passThrough, const char* prel
The code that sets up the prelude for the test infrastucture and command line usage can be found in ```TestToolUtil::setSessionDefaultPrelude```. Essentially this determines what the absolute path is to `slang-cpp-prelude.h` is and then just makes the prelude `#include "the absolute path"`.
+Limitations
+===========
+
+## WaveGetLaneIndex
+
+This defaults to `threadIdx.x & SLANG_CUDA_WARP_MASK`. Depending on how the kernel is launched this could be incorrect.
+
+There other ways to get lane index, for example using inline assembly. This mechanism though is apparently slower than the simple method used here.
+
+There is potential to calculate the lane id using the [numthreads] markup in Slang/HLSL, but that also requires some assumptions of how that maps to a lane index.
+
Language aspects
================
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 768e33d13..b81acba1e 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -7,6 +7,12 @@
// For now we'll disable any asserts in this prelude
#define SLANG_PRELUDE_ASSERT(x)
+#ifndef SLANG_CUDA_WARP_SIZE
+# define SLANG_CUDA_WARP_SIZE 32
+#endif
+
+#define SLANG_CUDA_WARP_MASK (SLANG_CUDA_WARP_SIZE - 1)
+
//
#define SLANG_FORCE_INLINE inline
@@ -112,6 +118,36 @@ union Union64
double d;
};
+// ---------------------- Miscellaneous --------------------------------------
+
+// TODO(JS): It appears that cuda does not have a simple way to get a lane index.
+//
+// Another approach could be...
+// laneId = ((threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x) & SLANG_CUDA_WARP_MASK
+// If that is really true another way to do this, would be for code generator to add this function
+// with the [numthreads] baked in.
+//
+// For now I'll just assume you have a launch that makes the following correct if the kernel uses WaveGetLaneIndex()
+#ifndef SLANG_USE_ASM_LANE_ID
+ __forceinline__ __device__ uint32_t _getLaneId()
+{
+ // If the launch is (or I guess some multiple of the warp size)
+ // we try this mechanism, which is apparently faster.
+ return threadIdx.x & SLANG_CUDA_WARP_MASK;
+}
+#else
+__forceinline__ __device__ uint32_t _getLaneId()
+{
+ // https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid#
+ // This mechanism is not the fastest way to do it, and that is why the other mechanism
+ // is the default. But the other mechanism relies on a launch that makes the assumption
+ // true.
+ unsigned ret;
+ asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret));
+ return ret;
+}
+#endif
+
// ----------------------------- F32 -----------------------------------------
// Unary
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 8fd2a272a..988c6f69c 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1431,14 +1431,30 @@ __generic<T : __BuiltinType> uint4 WaveMatch(T value);
__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);
+// TODO(JS): For CUDA the article claims mask has to be used carefully
+// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
+// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()
+// seems to be appropriate.
+
+__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)")
bool WaveActiveAllTrue(bool condition);
+__target_intrinsic(cuda, "(_any_sync(__activemask(), $0) != 0)")
bool WaveActiveAnyTrue(bool condition);
+__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
uint4 WaveActiveBallot(bool condition);
+
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0))")
uint WaveActiveCountBits(bool value);
+__target_intrinsic(cuda, "(warpSize)")
uint WaveGetLaneCount();
+
+__target_intrinsic(cuda, "_getLaneId()")
uint WaveGetLaneIndex();
+
+// If there are no *active* lanes less than this one, we must be the lowest lane
+__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)")
bool WaveIsFirstLane();
__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index c72374786..b39c4bbcb 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -1507,14 +1507,30 @@ SLANG_RAW("__generic<T : __BuiltinType> uint4 WaveMatch(T value);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);\n")
SLANG_RAW("\n")
+SLANG_RAW("// TODO(JS): For CUDA the article claims mask has to be used carefully\n")
+SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n")
+SLANG_RAW("// With the Warp intrinsics there is though mask, and it's just the 'active lanes'. So __activemask()\n")
+SLANG_RAW("// seems to be appropriate.\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n")
SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(_any_sync(__activemask(), $0) != 0)\")\n")
SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n")
SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n")
SLANG_RAW("uint4 WaveActiveBallot(bool condition);\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0))\")\n")
SLANG_RAW("uint WaveActiveCountBits(bool value);\n")
SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(warpSize)\")\n")
SLANG_RAW("uint WaveGetLaneCount();\n")
+SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n")
SLANG_RAW("uint WaveGetLaneIndex();\n")
+SLANG_RAW("\n")
+SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n")
+SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n")
SLANG_RAW("bool WaveIsFirstLane();\n")
SLANG_RAW("\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n")
@@ -1642,7 +1658,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
sb << "};\n";
}
-SLANG_RAW("#line 1569 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1585 \"hlsl.meta.slang\"")
SLANG_RAW("\n")
SLANG_RAW("\n")
SLANG_RAW("\n")