summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarsh Aggarwal (NVIDIA) <haaggarwal@nvidia.com>2025-09-04 10:58:02 +0530
committerGitHub <noreply@github.com>2025-09-04 05:28:02 +0000
commit5ec41675d817f82a7ce3c4d79c68548db0bd4227 (patch)
tree57abff17713b5d9ea876be29e3b451c9abe8c49d
parentb45706b3f532f85525de5746f1f607ba2e57fc88 (diff)
Enable CUDA support for additional HLSL intrinsic tests (#8293)
Enable CUDA support for additional HLSL intrinsic tests by implementing missing functionality and fixing compiler bugs affecting CUDA targets. - Fix critical bug in InterlockedCompareStore64 where division used /4 instead of /8 for 64-bit types, causing incorrect memory addressing for all signed int 64_t atomics - Add signed int64_t atomic wrappers (atomicExch, atomicCAS) to CUDA prelu de that properly cast to/from unsigned types as required by CUDA's atomic API - Enable tests: atomic-intrinsics-64bit.slang - Implement CUDA support for QuadAny and QuadAll operations using warp shu ffle primitives (__shfl_sync with quad-level lane masking) - Add CUDA to quad_control capability definition in slang-capabilities.capdef - Add _slang_quadAny/_slang_quadAll helper functions to CUDA prelude - Enable tests: quad-control-comp-functionality.slang, subgroup-quad.slang --------- Co-authored-by: szihs <675653+szihs@users.noreply.github.com>
-rw-r--r--prelude/slang-cuda-prelude.h238
-rw-r--r--source/slang/hlsl.meta.slang93
-rw-r--r--source/slang/slang-capabilities.capdef2
-rw-r--r--tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang11
-rw-r--r--tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang1
-rw-r--r--tests/hlsl-intrinsic/subgroup-quad.slang10
-rw-r--r--tests/hlsl-intrinsic/texture-2d-gather.slang58
-rw-r--r--tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang1
-rw-r--r--tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang1
9 files changed, 384 insertions, 31 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 44afd71b9..062e0ca6c 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -2402,7 +2402,32 @@ struct ByteAddressBuffer
};
// https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sm5-object-rwbyteaddressbuffer
-// Missing support for Atomic operations
+// Atomic operations support
+
+// Signed 64-bit atomic wrappers
+// CUDA only supports unsigned long long atomics, so we cast signed to unsigned
+__device__ __forceinline__ long long atomicExch(long long* address, long long val)
+{
+ return (long long)atomicExch((unsigned long long*)address, (unsigned long long)val);
+}
+
+__device__ __forceinline__ long long atomicCAS(long long* address, long long compare, long long val)
+{
+ return (long long)atomicCAS(
+ (unsigned long long*)address,
+ (unsigned long long)compare,
+ (unsigned long long)val);
+}
+
+// Float bitwise atomic compare-and-swap
+// Uses integer atomics to preserve exact float bit patterns
+__device__ __forceinline__ float atomicCAS(float* address, float compare, float val)
+{
+ int* addr_as_int = (int*)address;
+ int old = atomicCAS(addr_as_int, __float_as_int(compare), __float_as_int(val));
+ return __int_as_float(old);
+}
+
// Missing support for Load with status
struct RWByteAddressBuffer
{
@@ -4715,3 +4740,214 @@ tex2DArrayfetch_int(CUtexObject texObj, int x, int y, int layer)
: "l"(texObj), "r"(x), "r"(y), "r"(layer), "r"(layer));
return make_int4(result_x, result_y, result_z, result_w);
}
+
+// Wave rotate helper functions - templated approach
+#define SLANG_WARP_FULL_MASK 0xFFFFFFFF
+
+// Macro-based wave rotate implementation following codebase patterns
+#define SLANG_WAVE_ROTATE_IMPL(T) \
+ __device__ __forceinline__ T##2 _slang_waveRotate(T##2 value, unsigned int delta) \
+ { \
+ return make_##T##2( \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.x, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.y, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE)); \
+ } \
+ __device__ __forceinline__ T##3 _slang_waveRotate(T##3 value, unsigned int delta) \
+ { \
+ return make_##T##3( \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.x, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.y, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.z, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE)); \
+ } \
+ __device__ __forceinline__ T##4 _slang_waveRotate(T##4 value, unsigned int delta) \
+ { \
+ return make_##T##4( \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.x, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.y, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.z, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE), \
+ (T)__shfl_sync( \
+ SLANG_WARP_FULL_MASK, \
+ value.w, \
+ (_getLaneId() + delta) % SLANG_CUDA_WARP_SIZE)); \
+ }
+
+// Generate wave rotate functions for all standard vector types
+SLANG_WAVE_ROTATE_IMPL(uint)
+SLANG_WAVE_ROTATE_IMPL(int)
+SLANG_WAVE_ROTATE_IMPL(float)
+SLANG_WAVE_ROTATE_IMPL(short)
+SLANG_WAVE_ROTATE_IMPL(ushort)
+SLANG_WAVE_ROTATE_IMPL(char)
+SLANG_WAVE_ROTATE_IMPL(uchar)
+SLANG_WAVE_ROTATE_IMPL(longlong)
+SLANG_WAVE_ROTATE_IMPL(ulonglong)
+
+#ifdef SLANG_CUDA_ENABLE_HALF
+SLANG_WAVE_ROTATE_IMPL(__half)
+#endif
+
+// Special handling for boolean vectors (requires int conversion)
+__device__ __forceinline__ bool2 _slang_waveRotate(bool2 value, unsigned int delta)
+{
+ int2 intValue = make_int2((int)value.x, (int)value.y);
+ int2 result = _slang_waveRotate(intValue, delta);
+ return make_bool2((bool)result.x, (bool)result.y);
+}
+
+__device__ __forceinline__ bool3 _slang_waveRotate(bool3 value, unsigned int delta)
+{
+ int3 intValue = make_int3((int)value.x, (int)value.y, (int)value.z);
+ int3 result = _slang_waveRotate(intValue, delta);
+ return make_bool3((bool)result.x, (bool)result.y, (bool)result.z);
+}
+
+__device__ __forceinline__ bool4 _slang_waveRotate(bool4 value, unsigned int delta)
+{
+ int4 intValue = make_int4((int)value.x, (int)value.y, (int)value.z, (int)value.w);
+ int4 result = _slang_waveRotate(intValue, delta);
+ return make_bool4((bool)result.x, (bool)result.y, (bool)result.z, (bool)result.w);
+}
+
+#undef SLANG_WAVE_ROTATE_IMPL
+
+// Quad control operations for CUDA
+__device__ __forceinline__ bool _slang_quadAny(bool expr)
+{
+ // Get values from all 4 lanes in the quad
+ bool v0 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 0);
+ bool v1 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 1);
+ bool v2 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 2);
+ bool v3 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 3);
+ return v0 || v1 || v2 || v3;
+}
+
+__device__ __forceinline__ bool _slang_quadAll(bool expr)
+{
+ // Get values from all 4 lanes in the quad
+ bool v0 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 0);
+ bool v1 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 1);
+ bool v2 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 2);
+ bool v3 = __shfl_sync(0xFFFFFFFF, expr, (_getLaneId() & 0xFFFFFFFC) | 3);
+ return v0 && v1 && v2 && v3;
+}
+
+// Clustered wave rotate operations for CUDA
+// Clustered rotate rotates values within clusters of specified size
+#define SLANG_WAVE_CLUSTERED_ROTATE_IMPL(T) \
+ __device__ __forceinline__ T \
+ _slang_waveClusteredRotate(T value, unsigned int delta, unsigned int clusterSize) \
+ { \
+ unsigned int laneId = _getLaneId(); \
+ unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \
+ unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \
+ return __shfl_sync(SLANG_WARP_FULL_MASK, value, targetLane); \
+ } \
+ __device__ __forceinline__ \
+ T##2 _slang_waveClusteredRotate(T##2 value, unsigned int delta, unsigned int clusterSize) \
+ { \
+ unsigned int laneId = _getLaneId(); \
+ unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \
+ unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \
+ return make_##T##2( \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.x, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.y, targetLane)); \
+ } \
+ __device__ __forceinline__ \
+ T##3 _slang_waveClusteredRotate(T##3 value, unsigned int delta, unsigned int clusterSize) \
+ { \
+ unsigned int laneId = _getLaneId(); \
+ unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \
+ unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \
+ return make_##T##3( \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.x, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.y, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.z, targetLane)); \
+ } \
+ __device__ __forceinline__ \
+ T##4 _slang_waveClusteredRotate(T##4 value, unsigned int delta, unsigned int clusterSize) \
+ { \
+ unsigned int laneId = _getLaneId(); \
+ unsigned int clusterStart = (laneId / clusterSize) * clusterSize; \
+ unsigned int targetLane = clusterStart + ((laneId - clusterStart + delta) % clusterSize); \
+ return make_##T##4( \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.x, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.y, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.z, targetLane), \
+ (T)__shfl_sync(SLANG_WARP_FULL_MASK, value.w, targetLane)); \
+ }
+
+// Generate clustered wave rotate functions for all standard types
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(uint)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(int)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(float)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(short)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(ushort)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(char)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(uchar)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(longlong)
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(ulonglong)
+
+#ifdef SLANG_CUDA_ENABLE_HALF
+SLANG_WAVE_CLUSTERED_ROTATE_IMPL(__half)
+#endif
+
+// Special handling for boolean clustered rotate
+__device__ __forceinline__ bool _slang_waveClusteredRotate(
+ bool value,
+ unsigned int delta,
+ unsigned int clusterSize)
+{
+ int intValue = (int)value;
+ int result = _slang_waveClusteredRotate(intValue, delta, clusterSize);
+ return (bool)result;
+}
+
+__device__ __forceinline__ bool2
+_slang_waveClusteredRotate(bool2 value, unsigned int delta, unsigned int clusterSize)
+{
+ int2 intValue = make_int2((int)value.x, (int)value.y);
+ int2 result = _slang_waveClusteredRotate(intValue, delta, clusterSize);
+ return make_bool2((bool)result.x, (bool)result.y);
+}
+
+__device__ __forceinline__ bool3
+_slang_waveClusteredRotate(bool3 value, unsigned int delta, unsigned int clusterSize)
+{
+ int3 intValue = make_int3((int)value.x, (int)value.y, (int)value.z);
+ int3 result = _slang_waveClusteredRotate(intValue, delta, clusterSize);
+ return make_bool3((bool)result.x, (bool)result.y, (bool)result.z);
+}
+
+__device__ __forceinline__ bool4
+_slang_waveClusteredRotate(bool4 value, unsigned int delta, unsigned int clusterSize)
+{
+ int4 intValue = make_int4((int)value.x, (int)value.y, (int)value.z, (int)value.w);
+ int4 result = _slang_waveClusteredRotate(intValue, delta, clusterSize);
+ return make_bool4((bool)result.x, (bool)result.y, (bool)result.z, (bool)result.w);
+}
+
+#undef SLANG_WAVE_CLUSTERED_ROTATE_IMPL
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 2af0dbcf7..d5f8cd2e1 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -3480,23 +3480,31 @@ ${{{{
const char* componentArg = (isShadow ? "" : componentArgString[componentId]);
}}}}
[ForceInline]
- [require(glsl_hlsl_metal_spirv_wgsl, texture_gather)]
+ [require(cuda_glsl_hlsl_metal_spirv_wgsl, texture_gather)]
vector<T.Element,4> Gather$(compareFunc)$(componentFunc)(
$(samplerParam)
vector<float, Shape.dimensions+isArray> location
$(compareParam))
{
- static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
- "Gather is supported only for 2D and 3D textures");
-
__target_switch
{
- case hlsl: __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)";
+ case hlsl:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
+ __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)";
+ case cuda:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D), "CUDA Gather is supported only for 2D textures");
+ static_assert(isArray == 0, "CUDA Gather does not support texture arrays");
+ __intrinsic_asm "tex2Dgather<$T0>($0, ($2).x, ($2).y$(componentArg))";
case metal:
case wgsl:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
return __texture_gather$(compareFunc)($(getTexture) $(getSampler), location $(compareArg) $(componentArg));
case glsl:
case spirv:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
return __texture_gather$(compareFunc)(this $(samplerArg), location $(compareArg) $(componentArg));
}
}
@@ -3519,24 +3527,33 @@ ${{{{
}
[ForceInline]
- [require(glsl_hlsl_metal_spirv_wgsl, texture_gather)]
+ [require(cuda_glsl_hlsl_metal_spirv_wgsl, texture_gather)]
vector<T.Element,4> Gather$(compareFunc)$(componentFunc)(
$(samplerParam)
vector<float, Shape.dimensions+isArray> location
$(compareParam),
constexpr vector<int, Shape.planeDimensions> offset)
{
- static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
- "Gather is supported only for 2D and 3D textures");
-
__target_switch
{
- case hlsl: __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)";
+ case hlsl:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
+ __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)";
+ case cuda:
+ // CUDA tex2Dgather doesn't support offset - ignore offset parameter
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D), "CUDA Gather is supported only for 2D textures");
+ static_assert(isArray == 0, "CUDA Gather does not support texture arrays");
+ __intrinsic_asm "tex2Dgather<$T0>($0, ($2).x, ($2).y$(componentArg))";
case metal:
case wgsl:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
return __texture_gather$(compareFunc)_offset($(getTexture) $(getSampler), location $(compareArg), offset $(componentArg));
case glsl:
case spirv:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
return __texture_gather$(compareFunc)_offset(this $(samplerArg), location $(compareArg), offset $(componentArg));
}
}
@@ -5704,7 +5721,7 @@ ${{{{
case hlsl: __intrinsic_asm ".InterlockedCompareStore64";
default:
let buf = __getEquivalentStructuredBuffer<T>(this);
- __atomic_compare_exchange(buf[byteAddress / 4], compareValue, value);
+ __atomic_compare_exchange(buf[byteAddress / 8], compareValue, value);
return;
}
}
@@ -14917,7 +14934,7 @@ matrix<T,N,M> WaveMaskPrefixBitXor(WaveMask mask, matrix<T,N,M> expr)
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
-[require(glsl_hlsl_metal_spirv, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv, subgroup_quad)]
T QuadReadLaneAt(T sourceValue, uint quadLaneID)
{
__target_switch
@@ -14934,12 +14951,14 @@ T QuadReadLaneAt(T sourceValue, uint quadLaneID)
OpCapability GroupNonUniformQuad;
result:$$T = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID;
};
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, (_getLaneId() & 0xFFFFFFFC) | ($1 & 3))";
}
}
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
-[require(glsl_hlsl_metal_spirv, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv, subgroup_quad)]
vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID)
{
__target_switch
@@ -14955,6 +14974,8 @@ vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID)
OpCapability GroupNonUniformQuad;
result:$$vector<T,N> = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID;
};
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, (_getLaneId() & 0xFFFFFFFC) | ($1 & 3))";
}
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadLaneAt(matrix<T,N,M> sourceValue, uint quadLaneID);
@@ -14965,7 +14986,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
T QuadReadAcrossX(T localValue)
{
__target_switch
@@ -14982,6 +15003,8 @@ T QuadReadAcrossX(T localValue)
result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapX";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 1)";
}
}
@@ -14990,7 +15013,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
vector<T,N> QuadReadAcrossX(vector<T,N> localValue)
{
__target_switch
@@ -15007,6 +15030,8 @@ vector<T,N> QuadReadAcrossX(vector<T,N> localValue)
result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapX";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 1)";
}
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossX(matrix<T,N,M> localValue);
@@ -15017,7 +15042,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
T QuadReadAcrossY(T localValue)
{
__target_switch
@@ -15034,6 +15059,8 @@ T QuadReadAcrossY(T localValue)
result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapY";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 2)";
}
}
__generic<T : __BuiltinType, let N : int>
@@ -15041,7 +15068,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
vector<T,N> QuadReadAcrossY(vector<T,N> localValue)
{
__target_switch
@@ -15058,6 +15085,8 @@ vector<T,N> QuadReadAcrossY(vector<T,N> localValue)
result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapY";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 2)";
}
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossY(matrix<T,N,M> localValue);
@@ -15068,7 +15097,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
T QuadReadAcrossDiagonal(T localValue)
{
__target_switch
@@ -15085,6 +15114,8 @@ T QuadReadAcrossDiagonal(T localValue)
result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapDiagonal";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 3)";
}
}
__generic<T : __BuiltinType, let N : int>
@@ -15092,7 +15123,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue)
{
__target_switch
@@ -15109,6 +15140,8 @@ vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue)
result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapDiagonal";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 3)";
}
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);
@@ -16300,7 +16333,7 @@ void shader_subgroup_preamble()
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_rotate)
-[require(glsl_metal_spirv, subgroup_rotate)]
+[require(cuda_glsl_metal_spirv, subgroup_rotate)]
T WaveRotate(T value, uint delta)
{
shader_subgroup_preamble<T>();
@@ -16308,6 +16341,8 @@ T WaveRotate(T value, uint delta)
{
case glsl:
__intrinsic_asm "subgroupRotate";
+ case cuda:
+ __intrinsic_asm "__shfl_sync(0xFFFFFFFF, $0, (_getLaneId() + $1) % 32)";
case metal:
__intrinsic_asm "simd_shuffle_rotate_down";
case spirv:
@@ -16322,7 +16357,7 @@ T WaveRotate(T value, uint delta)
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_rotate)
-[require(glsl_metal_spirv, subgroup_rotate)]
+[require(cuda_glsl_metal_spirv, subgroup_rotate)]
vector<T, N> WaveRotate(vector<T, N> value, uint delta)
{
shader_subgroup_preamble<T>();
@@ -16330,6 +16365,8 @@ vector<T, N> WaveRotate(vector<T, N> value, uint delta)
{
case glsl:
__intrinsic_asm "subgroupRotate";
+ case cuda:
+ __intrinsic_asm "_slang_waveRotate($0, $1)";
case metal:
__intrinsic_asm "simd_shuffle_rotate_down";
case spirv:
@@ -16344,7 +16381,7 @@ vector<T, N> WaveRotate(vector<T, N> value, uint delta)
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_rotate)
-[require(glsl_spirv, subgroup_rotate)]
+[require(cuda_glsl_spirv, subgroup_rotate)]
T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize)
{
shader_subgroup_preamble<T>();
@@ -16352,6 +16389,8 @@ T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize)
{
case glsl:
__intrinsic_asm "subgroupClusteredRotate";
+ case cuda:
+ __intrinsic_asm "_slang_waveClusteredRotate($0, $1, $2)";
case spirv:
return spirv_asm
{
@@ -16364,7 +16403,7 @@ T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize)
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_rotate)
-[require(glsl_spirv, subgroup_rotate)]
+[require(cuda_glsl_spirv, subgroup_rotate)]
vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize)
{
shader_subgroup_preamble<T>();
@@ -16372,6 +16411,8 @@ vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint
{
case glsl:
__intrinsic_asm "subgroupClusteredRotate";
+ case cuda:
+ __intrinsic_asm "_slang_waveClusteredRotate($0, $1, $2)";
case spirv:
return spirv_asm
{
@@ -16978,7 +17019,7 @@ __glsl_extension(GL_EXT_maximal_reconvergence)
__glsl_extension(GL_EXT_shader_quad_control)
__spirv_version(1.3)
[ForceInline]
-[require(glsl_hlsl_metal_spirv, quad_control)]
+[require(cuda_glsl_hlsl_metal_spirv, quad_control)]
bool QuadAny(bool expr)
{
__requireMaximallyReconverges();
@@ -16988,6 +17029,7 @@ bool QuadAny(bool expr)
case hlsl: __intrinsic_asm "QuadAny";
case glsl: __intrinsic_asm "subgroupQuadAny";
case metal: __intrinsic_asm "quad_any";
+ case cuda: __intrinsic_asm "_slang_quadAny";
case spirv:
return spirv_asm
{
@@ -17003,7 +17045,7 @@ __glsl_extension(GL_EXT_maximal_reconvergence)
__glsl_extension(GL_EXT_shader_quad_control)
__spirv_version(1.3)
[ForceInline]
-[require(glsl_hlsl_metal_spirv, quad_control)]
+[require(cuda_glsl_hlsl_metal_spirv, quad_control)]
bool QuadAll(bool expr)
{
__requireMaximallyReconverges();
@@ -17013,6 +17055,7 @@ bool QuadAll(bool expr)
case hlsl: __intrinsic_asm "QuadAll";
case glsl: __intrinsic_asm "subgroupQuadAll";
case metal: __intrinsic_asm "quad_all";
+ case cuda: __intrinsic_asm "_slang_quadAll";
case spirv:
return spirv_asm
{
diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef
index 822356312..ff9697f7d 100644
--- a/source/slang/slang-capabilities.capdef
+++ b/source/slang/slang-capabilities.capdef
@@ -2226,6 +2226,7 @@ alias subgroup_partitioned = _sm_6_5
/// [Compound]
alias subgroup_rotate = GL_KHR_shader_subgroup_rotate
| metal
+ | _cuda_sm_5_0
;
/// (All implemented targets) Capabilities required to use atomic operations of GLSL tier-1 float atomics
@@ -2260,6 +2261,7 @@ alias helper_lane = _sm_6_0 + fragment
alias quad_control = _sm_6_7
| GL_EXT_shader_quad_control + GL_EXT_maximal_reconvergence + GL_KHR_shader_subgroup_vote
| metal
+ | _cuda_sm_5_0
;
/// Capabilities required to enable shader breakpoints
diff --git a/tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang b/tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang
index 355729d93..da5af8a5c 100644
--- a/tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang
+++ b/tests/hlsl-intrinsic/atomic/atomic-intrinsics-64bit.slang
@@ -1,4 +1,5 @@
-//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=DX12):-slang -compute -dx12 -profile cs_6_6 -shaderobj -output-using-type
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -dx12 -profile cs_6_6 -shaderobj -output-using-type
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHK):-slang -compute -cuda -profile cs_6_6 -shaderobj -output-using-type
// This is to support 64-bit `Interlocked*` functions defined for HLSL SM6.6
// https://microsoft.github.io/DirectX-Specs/d3d/HLSL_SM_6_6_Int64_and_Float_Atomics.html
@@ -336,7 +337,7 @@ void computeMain(uint groupIndex : SV_GroupIndex, int3 dispatchThreadID: SV_Disp
outputBuffer[idx] = int(result);
}
-// DX12: 1
-// DX12-NEXT: 1
-// DX12-NEXT: 1
-// DX12-NEXT: 1
+// CHK: 1
+// CHK-NEXT: 1
+// CHK-NEXT: 1
+// CHK-NEXT: 1
diff --git a/tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang b/tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang
index 20c36c2be..6dfd1d883 100644
--- a/tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang
+++ b/tests/hlsl-intrinsic/quad-control/quad-control-comp-functionality.slang
@@ -1,6 +1,7 @@
//TEST(compute):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -emit-spirv-directly
//TEST(compute):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -emit-spirv-via-glsl
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -profile cs_6_7 -dx12 -shaderobj -render-feature hardware-device
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -profile cs_6_7 -shaderobj -render-feature hardware-device
//TEST(compute):COMPARE_COMPUTE_EX:-metal -compute -shaderobj -xslang -DMETAL
//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
diff --git a/tests/hlsl-intrinsic/subgroup-quad.slang b/tests/hlsl-intrinsic/subgroup-quad.slang
index 1cfbffb49..ec5a80e56 100644
--- a/tests/hlsl-intrinsic/subgroup-quad.slang
+++ b/tests/hlsl-intrinsic/subgroup-quad.slang
@@ -2,6 +2,7 @@
//TEST:SIMPLE(filecheck=SPIRV): -entry main -stage compute -target spirv -emit-spirv-directly
//TEST:SIMPLE(filecheck=HLSL): -entry main -stage compute -target hlsl
//TEST:SIMPLE(filecheck=METAL): -entry main -stage compute -target metal
+//TEST:SIMPLE(filecheck=CUDA): -entry main -stage compute -target cuda
RWStructuredBuffer<float> output;
@@ -51,4 +52,13 @@ void main()
// METAL: ^ 3
// METAL: quad_shuffle
// METAL: quad_shuffle
+
+ // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, (_getLaneId() & 0xFFFFFFFC) | ((1U) & 3))
+ // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, (_getLaneId() & 0xFFFFFFFC) | ((1U) & 3))
+ // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 1)
+ // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 1)
+ // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 2)
+ // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 2)
+ // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 3)
+ // CUDA: _waveShuffleMultiple({{.*}}, {{.*}}, _getLaneId() ^ 3)
}
diff --git a/tests/hlsl-intrinsic/texture-2d-gather.slang b/tests/hlsl-intrinsic/texture-2d-gather.slang
new file mode 100644
index 000000000..329041f4d
--- /dev/null
+++ b/tests/hlsl-intrinsic/texture-2d-gather.slang
@@ -0,0 +1,58 @@
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -output-using-type
+//TEST(compute):COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-dx12 -compute -profile cs_6_0 -shaderobj -output-using-type
+
+// Test CUDA Gather runtime behavior - compare with known gather pattern
+// tex2Dgather samples 4 texels in 2x2 pattern around coordinate
+
+//TEST_INPUT: Texture2D(size=4, content = one):name testTexture
+// Create a 4x4 texture with 1.0 values - simple but non-zero to verify gather works
+Texture2D<float4> testTexture;
+
+//TEST_INPUT: Sampler:name samplerState
+SamplerState samplerState;
+
+//TEST_INPUT: ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
+RWStructuredBuffer<float> outputBuffer;
+
+[numthreads(1, 1, 1)]
+void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+ // Simple gather test - sample at center of 2x2 region
+ // This should gather from texels (0,0), (1,0), (0,1), (1,1)
+ float2 coords = float2(0.75, 0.75); // Between texels for gather
+
+ // Test basic gather - should return 4 values in specific order
+ float4 gathered = testTexture.GatherRed(samplerState, coords);
+
+ // Store the gathered values
+ outputBuffer[0] = gathered.x; // Should be consistent pattern
+ outputBuffer[1] = gathered.y;
+ outputBuffer[2] = gathered.z;
+ outputBuffer[3] = gathered.w;
+
+ // Also test that gather actually works by using texture coordinates
+ // as the texture values (coord-based pattern)
+ int2 texelCoord = int2(dispatchThreadID.xy);
+ float coordValue = float(texelCoord.x + texelCoord.y * 4); // Create pattern: 0,1,2,3,4,5,6,7...
+
+ // Store marker value like CUDA reference (42)
+ outputBuffer[4] = 42.0; // Marker to verify test is working
+
+ // Test another gather position
+ float4 gathered2 = testTexture.GatherRed(samplerState, float2(1.25, 1.25));
+ outputBuffer[5] = gathered2.x;
+ outputBuffer[6] = gathered2.y;
+ outputBuffer[7] = gathered2.z;
+ outputBuffer[8] = gathered2.w;
+}
+
+// Test results - texture filled with 1.0 values
+// CHECK: 1.0
+// CHECK-NEXT: 1.0
+// CHECK-NEXT: 1.0
+// CHECK-NEXT: 1.0
+// CHECK-NEXT: 42.0
+// CHECK-NEXT: 1.0
+// CHECK-NEXT: 1.0
+// CHECK-NEXT: 1.0
+// CHECK-NEXT: 1.0
diff --git a/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang b/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang
index 81601e9be..a5be09b0b 100644
--- a/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang
+++ b/tests/hlsl-intrinsic/wave-rotate/wave-rotate-clustered.slang
@@ -1,5 +1,6 @@
//TEST_CATEGORY(wave, compute)
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -profile sm_6_0
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -profile sm_6_0 -Xslang... -capability GL_KHR_shader_subgroup_rotate -X.
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl -profile sm_6_0 -allow-glsl -Xslang... -DUSE_GLSL_SYNTAX -capability GL_KHR_shader_subgroup_rotate -X.
diff --git a/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang b/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang
index 353afbb35..f67005078 100644
--- a/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang
+++ b/tests/hlsl-intrinsic/wave-rotate/wave-rotate.slang
@@ -1,5 +1,6 @@
// TEST_CATEGORY(wave, compute)
// TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -compute -shaderobj -emit-spirv-directly
// TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-slang -compute -vk -shaderobj -emit-spirv-via-glsl -profile sm_6_0 -Xslang... -capability GL_KHR_shader_subgroup_rotate -X.
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-metal -compute -shaderobj -xslang -DMETAL