From b380b1af6ba6f5f58e3841c2a5b14db7ee8c372d Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Tue, 10 Mar 2020 16:43:41 -0400 Subject: Wave Prefix Product (#1270) * Fix some typos. * Add wave-prefix-sum.slang test * First pass at implementing prefixSum. * Small improvements to prefixSum CUDA. * Small improvement to prefix sum. * Enable prefix sum in stdlib. * Wave prefix product without using a divide. * Split out SM6.5 Wave intrinsics. Template mechanism for doing prefix calculations. --- source/slang/hlsl.meta.slang | 86 ++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 39 deletions(-) (limited to 'source') diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index b43cd009f..20158c1b1 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2498,6 +2498,7 @@ __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveMul($0)") +__target_intrinsic(cuda, "_wavePrefixProduct($0)") T WavePrefixProduct(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -2521,10 +2522,54 @@ vector WavePrefixSum(vector expr); __generic matrix WavePrefixSum(matrix expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBroadcastFirst($0)") +__target_intrinsic(cuda, "_waveReadFirst($0)") +T WaveReadLaneFirst(T expr); +__generic +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBroadcastFirst($0)") +__target_intrinsic(cuda, "_waveReadFirstMultiple($0)") +vector WaveReadLaneFirst(vector expr); +__generic +__target_intrinsic(cuda, "_waveReadFirstMultiple($0)") +matrix WaveReadLaneFirst(matrix expr); + +// NOTE! On GLSL based targets the lane index *must* be a compile time expression! 
+// See https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt +__generic +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBroadcast($0, $1)") +__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)") +T WaveReadLaneAt(T value, int lane); +__generic +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBroadcast($0, $1)") +__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)") +vector WaveReadLaneAt(vector value, int lane); +__generic +__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)") +matrix WaveReadLaneAt(matrix value, int lane); + +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__spirv_version(1.3) +__target_intrinsic(glsl, "subgroupBallotExclusiveBitCount(subgroupBallot($0))") +__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & _getLaneLtMask())") +uint WavePrefixCountBits(bool value); + +// Shader model 6.5 stuff +// https://github.com/microsoft/DirectX-Specs/blob/master/d3d/HLSL_ShaderModel6_5.md +// TODO(JS): Looks like they need a mask parameter + __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveAnd($0)") +__target_intrinsic(cuda, "_wavePrefixAnd($0)") T WaveMultiPrefixBitAnd(T expr); __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) @@ -2538,6 +2583,7 @@ __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveOr($0)") +__target_intrinsic(cuda, "_wavePrefixOr($0)") T WaveMultiPrefixBitOr(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ -2551,6 +2597,7 @@ __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupExclusiveXor($0)") +__target_intrinsic(cuda, "_wavePrefixXor($0)") T WaveMultiPrefixBitXor(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) @@ 
-2560,11 +2607,6 @@ vector WaveMultiPrefixBitXor(vector expr); __generic matrix WaveMultiPrefixBitXor(matrix expr); -__glsl_extension(GL_KHR_shader_subgroup_ballot) -__spirv_version(1.3) -__target_intrinsic(glsl, "subgroupBallotExclusiveBitCount(subgroupBallot($0))") -__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & _getLaneLtMask())") -uint WavePrefixCountBits(bool value); uint WaveMultiPrefixCountBits(bool value, uint4 mask); @@ -2576,40 +2618,6 @@ __generic T WaveMultiPrefixSum(T value, uint4 mask) __generic vector WaveMultiPrefixSum(vector value, uint4 mask); __generic matrix WaveMultiPrefixSum(matrix value, uint4 mask); -__generic -__glsl_extension(GL_KHR_shader_subgroup_ballot) -__spirv_version(1.3) -__target_intrinsic(glsl, "subgroupBroadcastFirst($0)") -__target_intrinsic(cuda, "_waveReadFirst($0)") -T WaveReadLaneFirst(T expr); -__generic -__glsl_extension(GL_KHR_shader_subgroup_ballot) -__spirv_version(1.3) -__target_intrinsic(glsl, "subgroupBroadcastFirst($0)") -__target_intrinsic(cuda, "_waveReadFirstMultiple($0)") -vector WaveReadLaneFirst(vector expr); -__generic -__target_intrinsic(cuda, "_waveReadFirstMultiple($0)") -matrix WaveReadLaneFirst(matrix expr); - -// NOTE! On GLSL based targets the lane index *must* be a compile time expression! 
-// See https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt -__generic -__glsl_extension(GL_KHR_shader_subgroup_ballot) -__spirv_version(1.3) -__target_intrinsic(glsl, "subgroupBroadcast($0, $1)") -__target_intrinsic(cuda, "__shfl_sync(__activemask(), $0, $1)") -T WaveReadLaneAt(T value, int lane); -__generic -__spirv_version(1.3) -__target_intrinsic(glsl, "subgroupBroadcast($0, $1)") -__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)") -vector WaveReadLaneAt(vector value, int lane); -__generic -__target_intrinsic(cuda, "_waveReadLaneAtMultiple($0, $1)") -matrix WaveReadLaneAt(matrix value, int lane); - - // `typedef`s to help with the fact that HLSL has been sorta-kinda case insensitive at various points typedef Texture2D texture2D; -- cgit v1.2.3