summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHarsh Aggarwal (NVIDIA) <haaggarwal@nvidia.com>2025-08-20 14:41:06 +0530
committerGitHub <noreply@github.com>2025-08-20 09:11:06 +0000
commite0c20a076f2ec84586b6508664df4f59273c6aaf (patch)
treeae629eb56413f1ffd1d269ffe447471c07aa8137
parente4a7129b84692ddc3c586f0d0dde95e80e173ed8 (diff)
Updated support to enable batch3 (#8219)
Enable CUDA support for batch 3 tests - Enhanced wave operations with exclusive support - Added proper identity values for min/max operations - Fixed intrinsic name mapping issues - Updated test configurations Co-authored-by: Ellie Hermaszewska <ellieh@nvidia.com>
-rw-r--r--prelude/slang-cuda-prelude.h295
-rw-r--r--source/slang/hlsl.meta.slang16
-rw-r--r--tests/autodiff-dstdlib/determinant.slang1
-rw-r--r--tests/hlsl-intrinsic/byte-address-buffer/byte-address-half-atomics.slang1
-rw-r--r--tests/hlsl-intrinsic/debug-break.slang5
-rw-r--r--tests/hlsl-intrinsic/scalar-double-vk-intrinsic.slang5
-rw-r--r--tests/hlsl-intrinsic/wave-active-count-bits.slang3
-rw-r--r--tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang1
-rw-r--r--tests/hlsl-intrinsic/wave-is-first-lane.slang2
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang2
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang2
-rw-r--r--tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang2
-rw-r--r--tests/hlsl-intrinsic/wave-prefix-product.slang2
-rw-r--r--tests/hlsl-intrinsic/wave-prefix-sum.slang2
-rw-r--r--tests/hlsl-intrinsic/wave-vector.slang2
15 files changed, 321 insertions, 20 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 9df2727f6..44afd71b9 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -2661,17 +2661,67 @@ struct WaveOpMul
template<typename T>
struct WaveOpMax
{
- __inline__ __device__ static T getInitial(T a) { return a; }
+ __inline__ __device__ static T getInitial(T a, bool exclusive = false);
__inline__ __device__ static T doOp(T a, T b) { return a > b ? a : b; }
};
template<typename T>
struct WaveOpMin
{
- __inline__ __device__ static T getInitial(T a) { return a; }
+ __inline__ __device__ static T getInitial(T a, bool exclusive = false);
__inline__ __device__ static T doOp(T a, T b) { return a < b ? a : b; }
};
+// Compact specializations using macro for getInitial
+#define SLANG_WAVE_MIN_SPEC(T, EXCL_VAL) \
+ template<> \
+ __inline__ __device__ T WaveOpMin<T>::getInitial(T a, bool exclusive) \
+ { \
+ return exclusive ? (EXCL_VAL) : a; \
+ }
+
+#define SLANG_WAVE_MAX_SPEC(T, EXCL_VAL) \
+ template<> \
+ __inline__ __device__ T WaveOpMax<T>::getInitial(T a, bool exclusive) \
+ { \
+ return exclusive ? (EXCL_VAL) : a; \
+ }
+
+// Min specializations (exclusive identity = max value)
+SLANG_WAVE_MIN_SPEC(float, SLANG_INFINITY)
+SLANG_WAVE_MIN_SPEC(double, SLANG_INFINITY)
+SLANG_WAVE_MIN_SPEC(int, 0x7FFFFFFF)
+SLANG_WAVE_MIN_SPEC(uint, 0xFFFFFFFF)
+SLANG_WAVE_MIN_SPEC(char, (char)0x7F)
+SLANG_WAVE_MIN_SPEC(int8_t, (int8_t)0x7F)
+SLANG_WAVE_MIN_SPEC(uint8_t, (uint8_t)0xFF)
+SLANG_WAVE_MIN_SPEC(int16_t, (int16_t)0x7FFF)
+SLANG_WAVE_MIN_SPEC(uint16_t, (uint16_t)0xFFFF)
+SLANG_WAVE_MIN_SPEC(int64_t, 0x7FFFFFFFFFFFFFFFLL)
+SLANG_WAVE_MIN_SPEC(uint64_t, 0xFFFFFFFFFFFFFFFFULL)
+#if SLANG_CUDA_ENABLE_HALF
+SLANG_WAVE_MIN_SPEC(__half, __ushort_as_half(0x7BFF))
+#endif
+
+// Max specializations (exclusive identity = min value)
+SLANG_WAVE_MAX_SPEC(float, -SLANG_INFINITY)
+SLANG_WAVE_MAX_SPEC(double, -SLANG_INFINITY)
+SLANG_WAVE_MAX_SPEC(int, (int)0x80000000)
+SLANG_WAVE_MAX_SPEC(uint, 0)
+SLANG_WAVE_MAX_SPEC(char, (char)0x80)
+SLANG_WAVE_MAX_SPEC(int8_t, (int8_t)0x80)
+SLANG_WAVE_MAX_SPEC(uint8_t, 0)
+SLANG_WAVE_MAX_SPEC(int16_t, (int16_t)0x8000)
+SLANG_WAVE_MAX_SPEC(uint16_t, 0)
+SLANG_WAVE_MAX_SPEC(int64_t, (int64_t)0x8000000000000000LL)
+SLANG_WAVE_MAX_SPEC(uint64_t, 0)
+#if SLANG_CUDA_ENABLE_HALF
+SLANG_WAVE_MAX_SPEC(__half, __ushort_as_half(0xFBFF))
+#endif
+
+#undef SLANG_WAVE_MIN_SPEC
+#undef SLANG_WAVE_MAX_SPEC
+
template<typename T>
struct ElementTypeTrait;
@@ -2706,6 +2756,33 @@ struct ElementTypeTrait<int64_t>
{
typedef int64_t Type;
};
+template<>
+struct ElementTypeTrait<char>
+{
+ typedef char Type;
+};
+template<>
+struct ElementTypeTrait<uchar>
+{
+ typedef uchar Type;
+};
+template<>
+struct ElementTypeTrait<short>
+{
+ typedef short Type;
+};
+template<>
+struct ElementTypeTrait<ushort>
+{
+ typedef ushort Type;
+};
+#if SLANG_CUDA_ENABLE_HALF
+template<>
+struct ElementTypeTrait<__half>
+{
+ typedef __half Type;
+};
+#endif
// Vector
template<>
@@ -2792,6 +2869,115 @@ struct ElementTypeTrait<double4>
typedef double Type;
};
+// Additional vector types
+template<>
+struct ElementTypeTrait<char2>
+{
+ typedef char Type;
+};
+template<>
+struct ElementTypeTrait<char3>
+{
+ typedef char Type;
+};
+template<>
+struct ElementTypeTrait<char4>
+{
+ typedef char Type;
+};
+template<>
+struct ElementTypeTrait<uchar2>
+{
+ typedef uchar Type;
+};
+template<>
+struct ElementTypeTrait<uchar3>
+{
+ typedef uchar Type;
+};
+template<>
+struct ElementTypeTrait<uchar4>
+{
+ typedef uchar Type;
+};
+template<>
+struct ElementTypeTrait<short2>
+{
+ typedef short Type;
+};
+template<>
+struct ElementTypeTrait<short3>
+{
+ typedef short Type;
+};
+template<>
+struct ElementTypeTrait<short4>
+{
+ typedef short Type;
+};
+template<>
+struct ElementTypeTrait<ushort2>
+{
+ typedef ushort Type;
+};
+template<>
+struct ElementTypeTrait<ushort3>
+{
+ typedef ushort Type;
+};
+template<>
+struct ElementTypeTrait<ushort4>
+{
+ typedef ushort Type;
+};
+template<>
+struct ElementTypeTrait<longlong2>
+{
+ typedef int64_t Type;
+};
+template<>
+struct ElementTypeTrait<longlong3>
+{
+ typedef int64_t Type;
+};
+template<>
+struct ElementTypeTrait<longlong4>
+{
+ typedef int64_t Type;
+};
+template<>
+struct ElementTypeTrait<ulonglong2>
+{
+ typedef uint64_t Type;
+};
+template<>
+struct ElementTypeTrait<ulonglong3>
+{
+ typedef uint64_t Type;
+};
+template<>
+struct ElementTypeTrait<ulonglong4>
+{
+ typedef uint64_t Type;
+};
+#if SLANG_CUDA_ENABLE_HALF
+template<>
+struct ElementTypeTrait<__half2>
+{
+ typedef __half Type;
+};
+template<>
+struct ElementTypeTrait<__half3>
+{
+ typedef __half Type;
+};
+template<>
+struct ElementTypeTrait<__half4>
+{
+ typedef __half Type;
+};
+#endif
+
// Matrix
template<typename T, int ROWS, int COLS>
struct ElementTypeTrait<Matrix<T, ROWS, COLS>>
@@ -3431,6 +3617,111 @@ __inline__ __device__ T _wavePrefixAndMultiple(WarpMask mask, T val)
}
template<typename T>
+__inline__ __device__ T _wavePrefixMin(WarpMask mask, T val)
+{
+ return _wavePrefixScalar<WaveOpMin<T>, T>(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixMax(WarpMask mask, T val)
+{
+ return _wavePrefixScalar<WaveOpMax<T>, T>(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixMinMultiple(WarpMask mask, T val)
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ _wavePrefixMultiple<WaveOpMin<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(
+ mask,
+ (ElemType*)&val);
+ return val;
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixMaxMultiple(WarpMask mask, T val)
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ _wavePrefixMultiple<WaveOpMax<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(
+ mask,
+ (ElemType*)&val);
+ return val;
+}
+
+// Wrapper structures for exclusive operations that use the overloaded getInitial method
+template<typename T>
+struct WaveOpExclusiveMin
+{
+ __inline__ __device__ static T getInitial(T a) { return WaveOpMin<T>::getInitial(a, true); }
+ __inline__ __device__ static T doOp(T a, T b) { return WaveOpMin<T>::doOp(a, b); }
+};
+
+template<typename T>
+struct WaveOpExclusiveMax
+{
+ __inline__ __device__ static T getInitial(T a) { return WaveOpMax<T>::getInitial(a, true); }
+ __inline__ __device__ static T doOp(T a, T b) { return WaveOpMax<T>::doOp(a, b); }
+};
+
+// Inclusive prefix min/max functions (for WaveMultiPrefixInclusive*)
+template<typename T>
+__inline__ __device__ T _wavePrefixInclusiveMin(WarpMask mask, T val)
+{
+ return _wavePrefixMin(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixInclusiveMax(WarpMask mask, T val)
+{
+ return _wavePrefixMax(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixInclusiveMinMultiple(WarpMask mask, T val)
+{
+ return _wavePrefixMinMultiple(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixInclusiveMaxMultiple(WarpMask mask, T val)
+{
+ return _wavePrefixMaxMultiple(mask, val);
+}
+
+// Explicit exclusive prefix min/max functions (for WaveMultiPrefixExclusive*)
+template<typename T>
+__inline__ __device__ T _wavePrefixExclusiveMin(WarpMask mask, T val)
+{
+ return _wavePrefixScalar<WaveOpExclusiveMin<T>, T>(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixExclusiveMax(WarpMask mask, T val)
+{
+ return _wavePrefixScalar<WaveOpExclusiveMax<T>, T>(mask, val);
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixExclusiveMinMultiple(WarpMask mask, T val)
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ _wavePrefixMultiple<WaveOpExclusiveMin<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(
+ mask,
+ (ElemType*)&val);
+ return val;
+}
+
+template<typename T>
+__inline__ __device__ T _wavePrefixExclusiveMaxMultiple(WarpMask mask, T val)
+{
+ typedef typename ElementTypeTrait<T>::Type ElemType;
+ _wavePrefixMultiple<WaveOpExclusiveMax<ElemType>, ElemType, sizeof(T) / sizeof(ElemType)>(
+ mask,
+ (ElemType*)&val);
+ return val;
+}
+
+template<typename T>
__inline__ __device__ uint4 _waveMatchScalar(WarpMask mask, T val)
{
int pred;
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 0d5b8cb1f..d2e98529b 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -8578,7 +8578,7 @@ matrix<T, N, M> degrees(matrix<T, N, M> x)
__generic<T : __BuiltinFloatingPointType, let N : int>
[__readNone]
[PreferCheckpoint]
-[require(glsl_hlsl_metal_spirv_wgsl)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl)]
T determinant(matrix<T,N,N> m)
{
__target_switch
@@ -8590,6 +8590,7 @@ T determinant(matrix<T,N,N> m)
OpExtInst $$T result glsl450 Determinant $m
};
case wgsl: __intrinsic_asm "determinant";
+ case cuda:
default:
static_assert(N >= 1 && N <= 4, "determinant is only implemented up to 4x4 matrices");
if (N == 1)
@@ -8629,13 +8630,14 @@ T determinant(matrix<T,N,N> m)
/// @category math
__generic<T : __BuiltinIntegerType, let N : int>
[__readNone]
-[require(glsl_hlsl_metal_spirv_wgsl)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl)]
T determinant(matrix<T,N,N> m)
{
__target_switch
{
case hlsl: __intrinsic_asm "determinant";
- // GLSL, WGSL, and SPIR-V don't support integer determinants for lowered matrices, so we need to implement it manually
+ // GLSL, WGSL, SPIR-V, and CUDA don't support integer determinants for lowered matrices, so we need to implement it manually
+ case cuda:
default:
static_assert(N >= 1 && N <= 4, "determinant is only implemented up to 4x4 matrices");
if (N == 1)
@@ -16680,7 +16682,7 @@ for (auto opName : kWaveMultiPrefixMinMaxNames) {
__generic<T : __BuiltinArithmeticType>
__spirv_version(1.3)
[ForceInline]
-[require(glsl_spirv, subgroup_partitioned)]
+[require(cuda_glsl_spirv, subgroup_partitioned)]
T WaveMultiPrefix$(opName.name)(T value, uint4 mask)
{
__shaderSubgroupPartitionedPreamble<T>();
@@ -16688,6 +16690,8 @@ T WaveMultiPrefix$(opName.name)(T value, uint4 mask)
{
case glsl:
__intrinsic_asm "subgroupPartitioned$(opName.glslName)NV";
+ case cuda:
+ __intrinsic_asm "_wavePrefix$(opName.name)(($1).x, $0)";
case spirv:
{
if (__isFloat<T>())
@@ -16703,7 +16707,7 @@ T WaveMultiPrefix$(opName.name)(T value, uint4 mask)
__generic<T : __BuiltinArithmeticType, let N : int>
__spirv_version(1.3)
[ForceInline]
-[require(glsl_spirv, subgroup_partitioned)]
+[require(cuda_glsl_spirv, subgroup_partitioned)]
vector<T, N> WaveMultiPrefix$(opName.name)(vector<T, N> value, uint4 mask)
{
__shaderSubgroupPartitionedPreamble<T>();
@@ -16711,6 +16715,8 @@ vector<T, N> WaveMultiPrefix$(opName.name)(vector<T, N> value, uint4 mask)
{
case glsl:
__intrinsic_asm "subgroupPartitioned$(opName.glslName)NV";
+ case cuda:
+ __intrinsic_asm "_wavePrefix$(opName.name)Multiple(($1).x, $0)";
case spirv:
{
if (__isFloat<T>())
diff --git a/tests/autodiff-dstdlib/determinant.slang b/tests/autodiff-dstdlib/determinant.slang
index d2e699551..f73d3cdfa 100644
--- a/tests/autodiff-dstdlib/determinant.slang
+++ b/tests/autodiff-dstdlib/determinant.slang
@@ -1,5 +1,6 @@
//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -output-using-type
//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj -output-using-type
+//TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -shaderobj -output-using-type -cuda
//TEST_INPUT:ubuffer(data=[0 0 0 0], stride=4):out,name=outputBuffer
RWStructuredBuffer<float> outputBuffer;
diff --git a/tests/hlsl-intrinsic/byte-address-buffer/byte-address-half-atomics.slang b/tests/hlsl-intrinsic/byte-address-buffer/byte-address-half-atomics.slang
index f53d38d74..d23a675b2 100644
--- a/tests/hlsl-intrinsic/byte-address-buffer/byte-address-half-atomics.slang
+++ b/tests/hlsl-intrinsic/byte-address-buffer/byte-address-half-atomics.slang
@@ -3,6 +3,7 @@
// Disabled because validation layer doesn't like vector atomics, although nv driver does allow it.
//DISABLED_TEST(compute, vulkan):COMPARE_COMPUTE(filecheck-buffer=CHECK): -vk -compute -profile cs_6_2 -render-features half -shaderobj -emit-spirv-directly -output-using-type
+//TEST(compute):COMPARE_COMPUTE(filecheck-buffer=CHECK): -cuda -compute -profile cs_6_2 -render-features half -shaderobj -emit-spirv-directly -output-using-type
//TEST:SIMPLE(filecheck=SPIRV):-target spirv -entry computeMain -stage compute -emit-spirv-directly -skip-spirv-validation
//TEST:SIMPLE(filecheck=HLSL):-target hlsl -entry computeMain -profile cs_6_3
//TEST_INPUT:set tmpBuffer = ubuffer(data=[0 0 0 0], stride=4)
diff --git a/tests/hlsl-intrinsic/debug-break.slang b/tests/hlsl-intrinsic/debug-break.slang
index c16139f72..8fd4567dc 100644
--- a/tests/hlsl-intrinsic/debug-break.slang
+++ b/tests/hlsl-intrinsic/debug-break.slang
@@ -1,7 +1,7 @@
//TEST:SIMPLE(filecheck=CHECK_GLSL):-stage compute -entry computeMain -target glsl -line-directive-mode none
// We can't enable because output source includes path to prelude.
//DISABLE_TEST:SIMPLE:-stage compute -entry computeMain -target cpp -line-directive-mode none
-//DISABLE_TEST:SIMPLE:-stage compute -entry computeMain -target cuda -line-directive-mode none
+//TEST:SIMPLE(filecheck=CHECK_CUDA):-stage compute -entry computeMain -target cuda -line-directive-mode none
// Not currently supported on HLSL
//DISABLE_TEST:SIMPLE:-stage compute -entry computeMain -target hlsl -line-directive-mode none
// With `slang-llvm` this will crash, but the call stack isn't really usable.
@@ -15,6 +15,7 @@
// CHECK_GLSL: void main
// CHECK_GLSL: glslDebugBreak{{.*}}();
+// CHECK_CUDA:__brkpt()
RWStructuredBuffer<int> outputBuffer;
[numthreads(4, 1, 1)]
@@ -25,4 +26,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
debugBreak();
outputBuffer[idx] = idx;
-} \ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/scalar-double-vk-intrinsic.slang b/tests/hlsl-intrinsic/scalar-double-vk-intrinsic.slang
index 128c1c1cd..042c0b62e 100644
--- a/tests/hlsl-intrinsic/scalar-double-vk-intrinsic.slang
+++ b/tests/hlsl-intrinsic/scalar-double-vk-intrinsic.slang
@@ -3,8 +3,7 @@
//TEST(compute):COMPARE_COMPUTE_EX:-cpu -compute -output-using-type -shaderobj
//TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute -output-using-type -shaderobj -render-feature double
-// We don't want to run a cuda test here...
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute -shaderobj -output-using-type
//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
RWStructuredBuffer<double> outputBuffer;
@@ -54,4 +53,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
}
outputBuffer[idx] = ft;
-} \ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-active-count-bits.slang b/tests/hlsl-intrinsic/wave-active-count-bits.slang
index 105b95a6f..a7aa48687 100644
--- a/tests/hlsl-intrinsic/wave-active-count-bits.slang
+++ b/tests/hlsl-intrinsic/wave-active-count-bits.slang
@@ -2,6 +2,7 @@
//DISABLE_TEST:COMPARE_COMPUTE_EX:-cpu -compute -shaderobj
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -cuda -profile cs_6_0 -shaderobj -render-feature hardware-device
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj
@@ -14,4 +15,4 @@ void computeMain(uint3 dispatchThreadID : SV_DispatchThreadID)
int idx = int(dispatchThreadID.x);
outputBuffer[idx] = int(WaveActiveCountBits(bool(idx & 5)));
-} \ No newline at end of file
+}
diff --git a/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang b/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang
index b0cff08a9..4a32ab736 100644
--- a/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang
+++ b/tests/hlsl-intrinsic/wave-broadcast-lane-at-vk.slang
@@ -1,5 +1,6 @@
//TEST_CATEGORY(wave, compute)
//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-slang -compute -cuda -profile cs_6_0 -shaderobj
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-wgpu -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-metal -compute -shaderobj
diff --git a/tests/hlsl-intrinsic/wave-is-first-lane.slang b/tests/hlsl-intrinsic/wave-is-first-lane.slang
index 220a5758b..ae986c7b3 100644
--- a/tests/hlsl-intrinsic/wave-is-first-lane.slang
+++ b/tests/hlsl-intrinsic/wave-is-first-lane.slang
@@ -3,7 +3,7 @@
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
-//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-cuda -compute -capability cuda_sm_7_0 -shaderobj
//TEST:COMPARE_COMPUTE_EX:-wgpu -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-metal -compute -shaderobj
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
index 654fd6130..0aca11f38 100644
--- a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-max.slang
@@ -1,7 +1,7 @@
//TEST_CATEGORY(wave, compute)
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
-
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -capability cuda_sm_7_0 -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
index 68e1e9c05..321b99a0e 100644
--- a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-min.slang
@@ -1,7 +1,7 @@
//TEST_CATEGORY(wave, compute)
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-via-glsl
-
+//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-cuda -capability cuda_sm_7_0 -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX(filecheck-buffer=CHECK):-vk -compute -shaderobj -emit-spirv-directly -xslang -DUSE_GLSL_SYNTAX -allow-glsl
//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
diff --git a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
index 5de34b20a..67367f264 100644
--- a/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
+++ b/tests/hlsl-intrinsic/wave-multi/wave-multi-prefix-scalar-functional.slang
@@ -4,7 +4,7 @@
//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile sm_6_5 -shaderobj
//TEST:COMPARE_COMPUTE_EX:-vk -compute -shaderobj
-//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-cuda -compute -capability cuda_sm_7_0 -shaderobj
//TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
RWStructuredBuffer<uint> outputBuffer;
diff --git a/tests/hlsl-intrinsic/wave-prefix-product.slang b/tests/hlsl-intrinsic/wave-prefix-product.slang
index 774f5996e..f8c0ed57a 100644
--- a/tests/hlsl-intrinsic/wave-prefix-product.slang
+++ b/tests/hlsl-intrinsic/wave-prefix-product.slang
@@ -3,7 +3,7 @@
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
-//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-cuda -compute -capability cuda_sm_7_0 -shaderobj
//TEST:COMPARE_COMPUTE_EX:-wgpu -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-metal -compute -shaderobj
diff --git a/tests/hlsl-intrinsic/wave-prefix-sum.slang b/tests/hlsl-intrinsic/wave-prefix-sum.slang
index 4f7c2912d..8a092e20d 100644
--- a/tests/hlsl-intrinsic/wave-prefix-sum.slang
+++ b/tests/hlsl-intrinsic/wave-prefix-sum.slang
@@ -3,7 +3,7 @@
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
-//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-cuda -compute -capability cuda_sm_7_0 -shaderobj
//TEST:COMPARE_COMPUTE_EX:-wgpu -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-metal -compute -shaderobj
diff --git a/tests/hlsl-intrinsic/wave-vector.slang b/tests/hlsl-intrinsic/wave-vector.slang
index f786794ec..4c3b7ccb6 100644
--- a/tests/hlsl-intrinsic/wave-vector.slang
+++ b/tests/hlsl-intrinsic/wave-vector.slang
@@ -3,7 +3,7 @@
//DISABLE_TEST:COMPARE_COMPUTE_EX:-slang -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0 -shaderobj -render-feature hardware-device
//TEST(vulkan):COMPARE_COMPUTE_EX:-vk -compute -shaderobj -render-feature hardware-device
-//TEST:COMPARE_COMPUTE_EX:-cuda -compute -render-features cuda_sm_7_0 -shaderobj
+//TEST:COMPARE_COMPUTE_EX:-cuda -compute -capability cuda_sm_7_0 -shaderobj
//TEST:COMPARE_COMPUTE_EX:-wgpu -compute -shaderobj
//TEST:COMPARE_COMPUTE_EX:-metal -compute -shaderobj