summaryrefslogtreecommitdiffstats
path: root/prelude
diff options
context:
space:
mode:
authorHarsh Aggarwal (NVIDIA) <haaggarwal@nvidia.com>2025-09-04 10:58:02 +0530
committerGitHub <noreply@github.com>2025-09-04 05:28:02 +0000
commit5ec41675d817f82a7ce3c4d79c68548db0bd4227 (patch)
tree57abff17713b5d9ea876be29e3b451c9abe8c49d /prelude
parentb45706b3f532f85525de5746f1f607ba2e57fc88 (diff)
Enable CUDA support for additional HLSL intrinsic tests (#8293)
Enable CUDA support for additional HLSL intrinsic tests by implementing missing functionality and fixing compiler bugs affecting CUDA targets. - Fix critical bug in InterlockedCompareStore64 where division used /4 instead of /8 for 64-bit types, causing incorrect memory addressing for all signed int 64_t atomics - Add signed int64_t atomic wrappers (atomicExch, atomicCAS) to CUDA prelu de that properly cast to/from unsigned types as required by CUDA's atomic API - Enable tests: atomic-intrinsics-64bit.slang - Implement CUDA support for QuadAny and QuadAll operations using warp shu ffle primitives (__shfl_sync with quad-level lane masking) - Add CUDA to quad_control capability definition in slang-capabilities.capdef - Add _slang_quadAny/_slang_quadAll helper functions to CUDA prelude - Enable tests: quad-control-comp-functionality.slang, subgroup-quad.slang --------- Co-authored-by: szihs <675653+szihs@users.noreply.github.com>
Diffstat (limited to 'prelude')
-rw-r--r--prelude/slang-cuda-prelude.h238
1 files changed, 237 insertions, 1 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 44afd71b9..062e0ca6c 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -2402,7 +2402,32 @@ struct ByteAddressBuffer
};
// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-rwbyteaddressbuffer
-// Missing support for Atomic operations
+// Atomic operations support
+
+// Signed 64-bit atomic wrappers
+// CUDA only supports unsigned long long atomics, so we cast signed to unsigned
+__device__ __forceinline__ long long atomicExch(long long* address, long long val)
+{
+ return (long long)atomicExch((unsigned long long*)address, (unsigned long long)val);
+}
+
+__device__ __forceinline__ long long atomicCAS(long long* address, long long compare, long long val)
+{
+ return (long long)atomicCAS(
+ (unsigned long long*)address,
+ (unsigned long long)compare,
+ (unsigned long long)val);
+}
+
+// Float bitwise atomic compare-and-swap
+// Uses integer atomics to preserve exact float bit patterns
+__device__ __forceinline__ float atomicCAS(float* address, float compare, float val)
+{
+ int* addr_as_int = (int*)address;
+ int old = atomicCAS(addr_as_int, __float_as_int(compare), __float_as_int(val));
+ return __int_as_float(old);
+}
+
// Missing support for Load with status
struct RWByteAddressBuffer
{
@@ -4715,3 +4740,214 @@ tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
: "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
return make_int4(result_x, result_y, result_z, result_w);
}
+
+// Wave rotate helper functions - templated approach
+#define SLANG_WARP_FULL_MASK 0xFFFFFFFF
+
+// Macro-based wave rotate implementation following codebase patterns
+#define SLANG_WAVE_ROTATE_IMPL(T) \
+ __device__ __forceinline__ T##2 _slang_waveRotate(T##2 value, unsigned int delta) \
+ { \
+ return make_##T##2( \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.x, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.y, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE)); \
+ } \
+ __device__ __forceinline__ T##3 _slang_waveRotate(T##3 value, unsigned int delta) \
+ { \
+ return make_##T##3( \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.x, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.y, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.z, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE)); \
+ } \
+ __device__ __forceinline__ T##4 _slang_waveRotate(T##4 value, unsigned int delta) \
+ { \
+ return make_##T##4( \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.x, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.y, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.z, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.w, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE)); \
+ }
+
+// Generate wave rotate functions for all standard vector types
+SLANG_WAVE_ROTATE_IMPL(uint)
+SLANG_WAVE_ROTATE_IMPL(int)
+SLANG_WAVE_ROTATE_IMPL(float)
+SLANG_WAVE_ROTATE_IMPL(short)
+SLANG_WAVE_ROTATE_IMPL(ushort)
+SLANG_WAVE_ROTATE_IMPL(char)
+SLANG_WAVE_ROTATE_IMPL(uchar)
+SLANG_WAVE_ROTATE_IMPL(longlong)
+SLANG_WAVE_ROTATE_IMPL(ulonglong)
+
+#ifdef SLANG_CUDA_ENABLE_HALF
+SLANG_WAVE_ROTATE_IMPL(__half)
+#endif
+
+// Special handling for boolean vectors (requires int conversion)
+__device__ __forceinline__ bool2 _slang_waveRotate(bool2 value, unsigned int delta)
+{
+ int2 intValue = make_int2((int)value.x, (int)value.y);
+ int2 result = _slang_waveRotate(intValue, delta);
+ return make_bool2((bool)result.x, (bool)result.y);
+}
+
+__device__ __forceinline__ bool3 _slang_waveRotate(bool3 value, unsigned int delta)
+{
+ int3 intValue = make_int3((int)value.x, (int)value.y, (int)value.z);
+ int3 result = _slang_waveRotate(intValue, delta);
+ return make_bool3((bool)result.x, (bool)result.y, (bool)result.z);
+}
+
+__device__ __forceinline__ bool4 _slang_waveRotate(bool4 value, unsigned int delta)
+{
+ int4 intValue = make_int4((int)value.x, (int)value.y, (int)value.z, (int)value.w);
+ int4 result = _slang_waveRotate(intValue, delta);
+ return make_bool4((bool)result.x, (bool)result.y, (bool)result.z, (bool)result.w);
+}
+
+#undef SLANG_WAVE_ROTATE_IMPL
+
+// Quad control operations for CUDA
+__device__ __forceinline__ bool _slang_quadAny(bool expr)
+{
+ // Get values from all 4 lanes in the quad
+ bool v0 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 0);
+ bool v1 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 1);
+ bool v2 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 2);
+ bool v3 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 3);
+ return v0 || v1 || v2 || v3;
+}
+
+__device__ __forceinline__ bool _slang_quadAll(bool expr)
+{
+ // Get values from all 4 lanes in the quad
+ bool v0 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 0);
+ bool v1 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 1);
+ bool v2 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 2);
+ bool v3 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 3);
+ return v0 && v1 && v2 && v3;
+}
+
+// Clustered wave rotate operations for CUDA
+// Clustered rotate rotates values within clusters of specified size
+#define SLANG_WAVE_CLUSTERED_ROTATE_IMPL(T) \
+ __device__ __forceinline__ T \
+ _slang_waveClusteredRotate(T value, unsigned int delta, unsigned int clusterSize) \
+ { \
+ unsigned int laneId = _getLaneId(); \
+ unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \
+ unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \
+ return __shfl_sync(SLANG_WARP_FULL_MASK, value, targetLane); \
+ } \
+ __device__ __forceinline__ \
+ T##2 _slang_waveClusteredRotate(T##2 value, unsigned int delta, unsigned int clusterSize) \
+ { \
+ unsigned int laneId = _getLaneId(); \
+ unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \
+ unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \
+ return make_##T##2( \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.x, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.y, targetLane)); \
+ } \
+ __device__ __forceinline__ \
+ T##3 _slang_waveClusteredRotate(T##3 value, unsigned int delta, unsigned int clusterSize) \
+ { \
+ unsigned int laneId = _getLaneId(); \
+ unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \
+ unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \
+ return make_##T##3( \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.x, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.y, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.z, targetLane)); \
+ } \
+ __device__ __forceinline__ \
+ T##4 _slang_waveClusteredRotate(T##4 value, unsigned int delta, unsigned int clusterSize) \
+ { \
+ unsigned int laneId = _getLaneId(); \
+ unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \
+ unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \
+ return make_##T##4( \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.x, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.y, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.z, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.w, targetLane)); \
+ }
+
+// Generate clustered wave rotate functions for all standard types
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(uint)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(int)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(float)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(short)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(ushort)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(char)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(uchar)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(longlong)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(ulonglong)
+
+#ifdef SLANG_CUDA_ENABLE_HALF
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(__half)
+#endif
+
+// Special handling for boolean clustered rotate
+__device__ __forceinline__ bool _slang_waveClusteredRotate(
+ bool value,
+ unsigned int delta,
+ unsigned int clusterSize)
+{
+ int intValue = (int)value;
+ int result = _slang_waveClusteredRotate(intValue, delta, clusterSize);
+ return (bool)result;
+}
+
+__device__ __forceinline__ bool2
+_slang_waveClusteredRotate(bool2 value, unsigned int delta, unsigned int clusterSize)
+{
+ int2 intValue = make_int2((int)value.x, (int)value.y);
+ int2 result = _slang_waveClusteredRotate(intValue, delta, clusterSize);
+ return make_bool2((bool)result.x, (bool)result.y);
+}
+
+__device__ __forceinline__ bool3
+_slang_waveClusteredRotate(bool3 value, unsigned int delta, unsigned int clusterSize)
+{
+ int3 intValue = make_int3((int)value.x, (int)value.y, (int)value.z);
+ int3 result = _slang_waveClusteredRotate(intValue, delta, clusterSize);
+ return make_bool3((bool)result.x, (bool)result.y, (bool)result.z);
+}
+
+__device__ __forceinline__ bool4
+_slang_waveClusteredRotate(bool4 value, unsigned int delta, unsigned int clusterSize)
+{
+ int4 intValue = make_int4((int)value.x, (int)value.y, (int)value.z, (int)value.w);
+ int4 result = _slang_waveClusteredRotate(intValue, delta, clusterSize);
+ return make_bool4((bool)result.x, (bool)result.y, (bool)result.z, (bool)result.w);
+}
+
+#undef SLANG_WAVE_CLUSTERED_ROTATE_IMPL