summaryrefslogtreecommitdiffstats
path: root/source/slang
diff options
context:
space:
mode:
author Harsh Aggarwal (NVIDIA) <haaggarwal@nvidia.com> 2025-09-04 10:58:02 +0530
committer GitHub <noreply@github.com> 2025-09-04 05:28:02 +0000
commit 5ec41675d817f82a7ce3c4d79c68548db0bd4227 (patch)
tree 57abff17713b5d9ea876be29e3b451c9abe8c49d /source/slang
parent b45706b3f532f85525de5746f1f607ba2e57fc88 (diff)
Enable CUDA support for additional HLSL intrinsic tests (#8293)
Enable CUDA support for additional HLSL intrinsic tests by implementing missing functionality and fixing compiler bugs affecting CUDA targets.
- Fix critical bug in InterlockedCompareStore64 where division used /4 instead of /8 for 64-bit types, causing incorrect memory addressing for all signed int64_t atomics
- Add signed int64_t atomic wrappers (atomicExch, atomicCAS) to CUDA prelude that properly cast to/from unsigned types as required by CUDA's atomic API
- Enable tests: atomic-intrinsics-64bit.slang
- Implement CUDA support for QuadAny and QuadAll operations using warp shuffle primitives (__shfl_sync with quad-level lane masking)
- Add CUDA to quad_control capability definition in slang-capabilities.capdef
- Add _slang_quadAny/_slang_quadAll helper functions to CUDA prelude
- Enable tests: quad-control-comp-functionality.slang, subgroup-quad.slang
---------
Co-authored-by: szihs <675653+szihs@users.noreply.github.com>
Diffstat (limited to 'source/slang')
-rw-r--r-- source/slang/hlsl.meta.slang 93
-rw-r--r-- source/slang/slang-capabilities.capdef 2
2 files changed, 70 insertions, 25 deletions
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index 2af0dbcf7..d5f8cd2e1 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -3480,23 +3480,31 @@ ${{{{
const char* componentArg = (isShadow ? "" : componentArgString[componentId]);
}}}}
[ForceInline]
- [require(glsl_hlsl_metal_spirv_wgsl, texture_gather)]
+ [require(cuda_glsl_hlsl_metal_spirv_wgsl, texture_gather)]
vector<T.Element,4> Gather$(compareFunc)$(componentFunc)(
$(samplerParam)
vector<float, Shape.dimensions+isArray> location
$(compareParam))
{
- static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
- "Gather is supported only for 2D and 3D textures");
-
__target_switch
{
- case hlsl: __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)";
+ case hlsl:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
+ __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)";
+ case cuda:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D), "CUDA Gather is supported only for 2D textures");
+ static_assert(isArray == 0, "CUDA Gather does not support texture arrays");
+ __intrinsic_asm "tex2Dgather<$T0>($0, ($2).x, ($2).y$(componentArg))";
case metal:
case wgsl:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
return __texture_gather$(compareFunc)($(getTexture) $(getSampler), location $(compareArg) $(componentArg));
case glsl:
case spirv:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
return __texture_gather$(compareFunc)(this $(samplerArg), location $(compareArg) $(componentArg));
}
}
@@ -3519,24 +3527,33 @@ ${{{{
}
[ForceInline]
- [require(glsl_hlsl_metal_spirv_wgsl, texture_gather)]
+ [require(cuda_glsl_hlsl_metal_spirv_wgsl, texture_gather)]
vector<T.Element,4> Gather$(compareFunc)$(componentFunc)(
$(samplerParam)
vector<float, Shape.dimensions+isArray> location
$(compareParam),
constexpr vector<int, Shape.planeDimensions> offset)
{
- static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
- "Gather is supported only for 2D and 3D textures");
-
__target_switch
{
- case hlsl: __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)";
+ case hlsl:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
+ __intrinsic_asm ".Gather$(compareFunc)$(componentFunc)";
+ case cuda:
+ // CUDA tex2Dgather doesn't support offset - ignore offset parameter
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D), "CUDA Gather is supported only for 2D textures");
+ static_assert(isArray == 0, "CUDA Gather does not support texture arrays");
+ __intrinsic_asm "tex2Dgather<$T0>($0, ($2).x, ($2).y$(componentArg))";
case metal:
case wgsl:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
return __texture_gather$(compareFunc)_offset($(getTexture) $(getSampler), location $(compareArg), offset $(componentArg));
case glsl:
case spirv:
+ static_assert(Shape.flavor == $(SLANG_TEXTURE_2D) || Shape.flavor == $(SLANG_TEXTURE_CUBE),
+ "Gather is supported only for 2D and 3D textures");
return __texture_gather$(compareFunc)_offset(this $(samplerArg), location $(compareArg), offset $(componentArg));
}
}
@@ -5704,7 +5721,7 @@ ${{{{
case hlsl: __intrinsic_asm ".InterlockedCompareStore64";
default:
let buf = __getEquivalentStructuredBuffer<T>(this);
- __atomic_compare_exchange(buf[byteAddress / 4], compareValue, value);
+ __atomic_compare_exchange(buf[byteAddress / 8], compareValue, value);
return;
}
}
@@ -14917,7 +14934,7 @@ matrix<T,N,M> WaveMaskPrefixBitXor(WaveMask mask, matrix<T,N,M> expr)
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
-[require(glsl_hlsl_metal_spirv, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv, subgroup_quad)]
T QuadReadLaneAt(T sourceValue, uint quadLaneID)
{
__target_switch
@@ -14934,12 +14951,14 @@ T QuadReadLaneAt(T sourceValue, uint quadLaneID)
OpCapability GroupNonUniformQuad;
result:$$T = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID;
};
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, (_getLaneId() & 0xFFFFFFFC) | ($1 & 3))";
}
}
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
-[require(glsl_hlsl_metal_spirv, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv, subgroup_quad)]
vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID)
{
__target_switch
@@ -14955,6 +14974,8 @@ vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID)
OpCapability GroupNonUniformQuad;
result:$$vector<T,N> = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID;
};
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, (_getLaneId() & 0xFFFFFFFC) | ($1 & 3))";
}
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadLaneAt(matrix<T,N,M> sourceValue, uint quadLaneID);
@@ -14965,7 +14986,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
T QuadReadAcrossX(T localValue)
{
__target_switch
@@ -14982,6 +15003,8 @@ T QuadReadAcrossX(T localValue)
result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapX";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 1)";
}
}
@@ -14990,7 +15013,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
vector<T,N> QuadReadAcrossX(vector<T,N> localValue)
{
__target_switch
@@ -15007,6 +15030,8 @@ vector<T,N> QuadReadAcrossX(vector<T,N> localValue)
result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapX";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 1)";
}
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossX(matrix<T,N,M> localValue);
@@ -15017,7 +15042,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
T QuadReadAcrossY(T localValue)
{
__target_switch
@@ -15034,6 +15059,8 @@ T QuadReadAcrossY(T localValue)
result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapY";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 2)";
}
}
__generic<T : __BuiltinType, let N : int>
@@ -15041,7 +15068,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
vector<T,N> QuadReadAcrossY(vector<T,N> localValue)
{
__target_switch
@@ -15058,6 +15085,8 @@ vector<T,N> QuadReadAcrossY(vector<T,N> localValue)
result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapY";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 2)";
}
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossY(matrix<T,N,M> localValue);
@@ -15068,7 +15097,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
T QuadReadAcrossDiagonal(T localValue)
{
__target_switch
@@ -15085,6 +15114,8 @@ T QuadReadAcrossDiagonal(T localValue)
result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapDiagonal";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 3)";
}
}
__generic<T : __BuiltinType, let N : int>
@@ -15092,7 +15123,7 @@ __glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
__wgsl_extension(subgroups)
[ForceInline]
-[require(glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
+[require(cuda_glsl_hlsl_metal_spirv_wgsl, subgroup_quad)]
vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue)
{
__target_switch
@@ -15109,6 +15140,8 @@ vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue)
result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
};
case wgsl: __intrinsic_asm "quadSwapDiagonal";
+ case cuda:
+ __intrinsic_asm "_waveShuffleMultiple(0xFFFFFFFF, $0, _getLaneId() ^ 3)";
}
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);
@@ -16300,7 +16333,7 @@ void shader_subgroup_preamble()
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_rotate)
-[require(glsl_metal_spirv, subgroup_rotate)]
+[require(cuda_glsl_metal_spirv, subgroup_rotate)]
T WaveRotate(T value, uint delta)
{
shader_subgroup_preamble<T>();
@@ -16308,6 +16341,8 @@ T WaveRotate(T value, uint delta)
{
case glsl:
__intrinsic_asm "subgroupRotate";
+ case cuda:
+ __intrinsic_asm "__shfl_sync(0xFFFFFFFF, $0, (_getLaneId() + $1) % 32)";
case metal:
__intrinsic_asm "simd_shuffle_rotate_down";
case spirv:
@@ -16322,7 +16357,7 @@ T WaveRotate(T value, uint delta)
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_rotate)
-[require(glsl_metal_spirv, subgroup_rotate)]
+[require(cuda_glsl_metal_spirv, subgroup_rotate)]
vector<T, N> WaveRotate(vector<T, N> value, uint delta)
{
shader_subgroup_preamble<T>();
@@ -16330,6 +16365,8 @@ vector<T, N> WaveRotate(vector<T, N> value, uint delta)
{
case glsl:
__intrinsic_asm "subgroupRotate";
+ case cuda:
+ __intrinsic_asm "_slang_waveRotate($0, $1)";
case metal:
__intrinsic_asm "simd_shuffle_rotate_down";
case spirv:
@@ -16344,7 +16381,7 @@ vector<T, N> WaveRotate(vector<T, N> value, uint delta)
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_rotate)
-[require(glsl_spirv, subgroup_rotate)]
+[require(cuda_glsl_spirv, subgroup_rotate)]
T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize)
{
shader_subgroup_preamble<T>();
@@ -16352,6 +16389,8 @@ T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize)
{
case glsl:
__intrinsic_asm "subgroupClusteredRotate";
+ case cuda:
+ __intrinsic_asm "_slang_waveClusteredRotate($0, $1, $2)";
case spirv:
return spirv_asm
{
@@ -16364,7 +16403,7 @@ T WaveClusteredRotate(T value, uint delta, constexpr uint clusterSize)
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_rotate)
-[require(glsl_spirv, subgroup_rotate)]
+[require(cuda_glsl_spirv, subgroup_rotate)]
vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint clusterSize)
{
shader_subgroup_preamble<T>();
@@ -16372,6 +16411,8 @@ vector<T, N> WaveClusteredRotate(vector<T, N> value, uint delta, constexpr uint
{
case glsl:
__intrinsic_asm "subgroupClusteredRotate";
+ case cuda:
+ __intrinsic_asm "_slang_waveClusteredRotate($0, $1, $2)";
case spirv:
return spirv_asm
{
@@ -16978,7 +17019,7 @@ __glsl_extension(GL_EXT_maximal_reconvergence)
__glsl_extension(GL_EXT_shader_quad_control)
__spirv_version(1.3)
[ForceInline]
-[require(glsl_hlsl_metal_spirv, quad_control)]
+[require(cuda_glsl_hlsl_metal_spirv, quad_control)]
bool QuadAny(bool expr)
{
__requireMaximallyReconverges();
@@ -16988,6 +17029,7 @@ bool QuadAny(bool expr)
case hlsl: __intrinsic_asm "QuadAny";
case glsl: __intrinsic_asm "subgroupQuadAny";
case metal: __intrinsic_asm "quad_any";
+ case cuda: __intrinsic_asm "_slang_quadAny";
case spirv:
return spirv_asm
{
@@ -17003,7 +17045,7 @@ __glsl_extension(GL_EXT_maximal_reconvergence)
__glsl_extension(GL_EXT_shader_quad_control)
__spirv_version(1.3)
[ForceInline]
-[require(glsl_hlsl_metal_spirv, quad_control)]
+[require(cuda_glsl_hlsl_metal_spirv, quad_control)]
bool QuadAll(bool expr)
{
__requireMaximallyReconverges();
@@ -17013,6 +17055,7 @@ bool QuadAll(bool expr)
case hlsl: __intrinsic_asm "QuadAll";
case glsl: __intrinsic_asm "subgroupQuadAll";
case metal: __intrinsic_asm "quad_all";
+ case cuda: __intrinsic_asm "_slang_quadAll";
case spirv:
return spirv_asm
{
diff --git a/source/slang/slang-capabilities.capdef b/source/slang/slang-capabilities.capdef
index 822356312..ff9697f7d 100644
--- a/source/slang/slang-capabilities.capdef
+++ b/source/slang/slang-capabilities.capdef
@@ -2226,6 +2226,7 @@ alias subgroup_partitioned = _sm_6_5
/// [Compound]
alias subgroup_rotate = GL_KHR_shader_subgroup_rotate
| metal
+ | _cuda_sm_5_0
;
/// (All implemented targets) Capabilities required to use atomic operations of GLSL tier-1 float atomics
@@ -2260,6 +2261,7 @@ alias helper_lane = _sm_6_0 + fragment
alias quad_control = _sm_6_7
| GL_EXT_shader_quad_control + GL_EXT_maximal_reconvergence + GL_KHR_shader_subgroup_vote
| metal
+ | _cuda_sm_5_0
;
/// Capabilities required to enable shader breakpoints