From dbd8e8dc0847338a2a93d35385f48b5ce5671dd6 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Mon, 2 Mar 2020 17:22:03 -0500 Subject: Feature/glsl wave intrinsic (#1253) * Test for some wave intrinsics. More wave intrinsic support on CUDA. * Use shfl_xor_sync. * Improvements around wave intrinsics. Fix built in integer types belong to __BuiltinIntegerType. * Improvements and fixes around Wave intrinsics. * Added WaveIsFirstLane test. No longer use __wavemask_lt, as appears not available as an intrinsic. * Small fixes to CUDA prelude. * Add wave-active-product test. Handle the special case for arbitrary sums. * Used macro to implement CUDA wave intrinsics. * First pass at glsl wave intrinsics. Doesn't work in practice because require mechanism to set spir-v version Replace use of _lanemask_lt() for CUDA. --- source/slang/hlsl.meta.slang | 50 ++++++++++++++++++++++++++++++++++++++-- source/slang/hlsl.meta.slang.h | 52 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 97 insertions(+), 5 deletions(-) (limited to 'source') diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index edb678ad6..417f4594d 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -1379,6 +1379,9 @@ __generic matrix T QuadReadLaneAt(T sourceValue, uint quadLaneID); __generic vector QuadReadLaneAt(vector sourceValue, uint quadLaneID); __generic matrix QuadReadLaneAt(matrix sourceValue, uint quadLaneID); @@ -1396,48 +1399,64 @@ __generic vector QuadReadAcrossDiagonal(vec __generic matrix QuadReadAcrossDiagonal(matrix localValue); __generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__target_intrinsic(glsl, "subgroupAnd($0)") __target_intrinsic(cuda, "_waveAnd(__activemask(), $0)") T WaveActiveBitAnd(T expr); __generic vector WaveActiveBitAnd(vector expr); __generic matrix WaveActiveBitAnd(matrix expr); __generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__target_intrinsic(glsl, "subgroupOr($0)") 
__target_intrinsic(cuda, "_waveOr(__activemask(), $0)") T WaveActiveBitOr(T expr); __generic vector WaveActiveBitOr(vector expr); __generic matrix WaveActiveBitOr(matrix expr); __generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__target_intrinsic(glsl, "subgroupXor($0)") __target_intrinsic(cuda, "_waveXor(__activemask(), $0)") T WaveActiveBitXor(T expr); __generic vector WaveActiveBitXor(vector expr); __generic matrix WaveActiveBitXor(matrix expr); __generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__target_intrinsic(glsl, "subgroupMax($0)") __target_intrinsic(cuda, "_waveMax(__activemask(), $0)") T WaveActiveMax(T expr); __generic vector WaveActiveMax(vector expr); __generic matrix WaveActiveMax(matrix expr); __generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__target_intrinsic(glsl, "subgroupMin($0)") __target_intrinsic(cuda, "_waveMin(__activemask(), $0)") T WaveActiveMin(T expr); __generic vector WaveActiveMin(vector expr); __generic matrix WaveActiveMin(matrix expr); __generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__target_intrinsic(glsl, "subgroupMul($0)") __target_intrinsic(cuda, "_waveProduct(__activemask(), $0)") T WaveActiveProduct(T expr); __generic vector WaveActiveProduct(vector expr); __generic matrix WaveActiveProduct(matrix expr); __generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__target_intrinsic(glsl, "subgroupAdd($0)") __target_intrinsic(cuda, "_waveSum(__activemask(), $0)") T WaveActiveSum(T expr); __generic vector WaveActiveSum(vector expr); __generic matrix WaveActiveSum(matrix expr); __generic +__glsl_extension(GL_KHR_shader_subgroup_vote) +__target_intrinsic(glsl, "subgroupAllEqual($0)") __target_intrinsic(cuda, "_waveAllEqual(__activemask(), $0)") bool WaveActiveAllEqual(T value); __generic vector WaveActiveAllEqual(vector value); @@ -1452,24 +1471,40 @@ __generic uint4 WaveMatch(matrix +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__target_intrinsic(glsl, 
"subgroupExclusiveMul($0)") T WavePrefixProduct(T expr); __generic vector WavePrefixProduct(vector expr); __generic matrix WavePrefixProduct(matrix expr); __generic +__glsl_extension(GL_KHR_shader_subgroup_arithmetic) +__target_intrinsic(glsl, "subgroupExclusiveAdd($0)") T WavePrefixSum(T expr); __generic vector WavePrefixSum(vector expr); __generic matrix WavePrefixSum(matrix expr); @@ -1499,7 +1538,10 @@ T WaveMultiPrefixBitXor(T expr); __generic vector WaveMultiPrefixBitXor(vector expr); __generic matrix WaveMultiPrefixBitXor(matrix expr); -__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & __lanemask_lt())") +// TODO(JS): This takes uvec4 parameter on GLSL +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__target_intrinsic(glsl, "subgroupBallotExclusiveBitCount($0)") +__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & _getLaneLtMask())") uint WavePrefixCountBits(bool value); uint WaveMultiPrefixCountBits(bool value, uint4 mask); @@ -1513,12 +1555,16 @@ __generic vector WaveMultiPrefixS __generic matrix WaveMultiPrefixSum(matrix value, uint4 mask); __generic +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__target_intrinsic(glsl, "subgroupBroadcastFirst($0)") __target_intrinsic(cuda, "_waveReadFirst($0)") T WaveReadLaneFirst(T expr); __generic vector WaveReadLaneFirst(vector expr); __generic matrix WaveReadLaneFirst(matrix expr); __generic +__glsl_extension(GL_KHR_shader_subgroup_ballot) +__target_intrinsic(glsl, "subgroupBroadcast($0, $1)") __target_intrinsic(cuda, "__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)") T WaveReadLaneAt(T value, int lane); __generic vector WaveReadLaneAt(vector value, int lane); diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h index 16a3244ab..34bb15808 100644 --- a/source/slang/hlsl.meta.slang.h +++ b/source/slang/hlsl.meta.slang.h @@ -1455,6 +1455,9 @@ SLANG_RAW("__generic m SLANG_RAW("\n") SLANG_RAW("// Shader model 6.0 stuff\n") SLANG_RAW("\n") +SLANG_RAW("// Information 
for GLSL wave/subgroup support\n") +SLANG_RAW("// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt\n") +SLANG_RAW("\n") SLANG_RAW("__generic T QuadReadLaneAt(T sourceValue, uint quadLaneID);\n") SLANG_RAW("__generic vector QuadReadLaneAt(vector sourceValue, uint quadLaneID);\n") SLANG_RAW("__generic matrix QuadReadLaneAt(matrix sourceValue, uint quadLaneID);\n") @@ -1472,48 +1475,64 @@ SLANG_RAW("__generic vector QuadReadAcrossD SLANG_RAW("__generic matrix QuadReadAcrossDiagonal(matrix localValue);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_arithmetic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupAnd($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_waveAnd(__activemask(), $0)\")\n") SLANG_RAW("T WaveActiveBitAnd(T expr);\n") SLANG_RAW("__generic vector WaveActiveBitAnd(vector expr);\n") SLANG_RAW("__generic matrix WaveActiveBitAnd(matrix expr);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_arithmetic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupOr($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_waveOr(__activemask(), $0)\")\n") SLANG_RAW("T WaveActiveBitOr(T expr);\n") SLANG_RAW("__generic vector WaveActiveBitOr(vector expr);\n") SLANG_RAW("__generic matrix WaveActiveBitOr(matrix expr);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_arithmetic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupXor($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_waveXor(__activemask(), $0)\")\n") SLANG_RAW("T WaveActiveBitXor(T expr);\n") SLANG_RAW("__generic vector WaveActiveBitXor(vector expr);\n") SLANG_RAW("__generic matrix WaveActiveBitXor(matrix expr);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_arithmetic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupMax($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, 
\"_waveMax(__activemask(), $0)\")\n") SLANG_RAW("T WaveActiveMax(T expr);\n") SLANG_RAW("__generic vector WaveActiveMax(vector expr);\n") SLANG_RAW("__generic matrix WaveActiveMax(matrix expr);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_arithmetic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupMin($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_waveMin(__activemask(), $0)\")\n") SLANG_RAW("T WaveActiveMin(T expr);\n") SLANG_RAW("__generic vector WaveActiveMin(vector expr);\n") SLANG_RAW("__generic matrix WaveActiveMin(matrix expr);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_arithmetic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupMul($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_waveProduct(__activemask(), $0)\")\n") SLANG_RAW("T WaveActiveProduct(T expr);\n") SLANG_RAW("__generic vector WaveActiveProduct(vector expr);\n") SLANG_RAW("__generic matrix WaveActiveProduct(matrix expr);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_arithmetic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupAdd($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_waveSum(__activemask(), $0)\")\n") SLANG_RAW("T WaveActiveSum(T expr);\n") SLANG_RAW("__generic vector WaveActiveSum(vector expr);\n") SLANG_RAW("__generic matrix WaveActiveSum(matrix expr);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_vote)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupAllEqual($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_waveAllEqual(__activemask(), $0)\")\n") SLANG_RAW("bool WaveActiveAllEqual(T value);\n") SLANG_RAW("__generic vector WaveActiveAllEqual(vector value);\n") @@ -1528,24 +1547,40 @@ SLANG_RAW("// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/\n") SLANG_RAW("// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. 
So __activemask()\n") SLANG_RAW("// seems to be appropriate.\n") SLANG_RAW("\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_vote)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupAll($0)\") \n") SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n") SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n") +SLANG_RAW("\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_vote)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupAny($0)\") \n") SLANG_RAW("__target_intrinsic(cuda, \"(__any_sync(__activemask(), $0) != 0)\")\n") SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n") SLANG_RAW("\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_ballot)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupBallot($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n") SLANG_RAW("uint4 WaveActiveBallot(bool condition);\n") SLANG_RAW("\n") +SLANG_RAW("// TODO(JS): \n") +SLANG_RAW("// subgroupBallotBitCount seems to take a uint4 parameter. 
\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_ballot)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupBallotBitCount($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0))\")\n") SLANG_RAW("uint WaveActiveCountBits(bool value);\n") SLANG_RAW("\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_basic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"gl_SubgroupSize\")\n") SLANG_RAW("__target_intrinsic(cuda, \"(warpSize)\")\n") SLANG_RAW("uint WaveGetLaneCount();\n") SLANG_RAW("\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_basic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"gl_SubgroupInvocationID\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n") SLANG_RAW("uint WaveGetLaneIndex();\n") SLANG_RAW("\n") -SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_basic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupElect()\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_waveIsFirstLane()\")\n") SLANG_RAW("bool WaveIsFirstLane();\n") SLANG_RAW("\n") @@ -1553,11 +1588,15 @@ SLANG_RAW("// TODO(JS): We cannot calculate prefix sums using a mask of __active SLANG_RAW("// that would mean different lanes having a different mask, and they all have to have the same mask.\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_arithmetic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupExclusiveMul($0)\")\n") SLANG_RAW("T WavePrefixProduct(T expr);\n") SLANG_RAW("__generic vector WavePrefixProduct(vector expr);\n") SLANG_RAW("__generic matrix WavePrefixProduct(matrix expr);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_arithmetic)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupExclusiveAdd($0)\")\n") SLANG_RAW("T WavePrefixSum(T expr);\n") SLANG_RAW("__generic vector WavePrefixSum(vector expr);\n") SLANG_RAW("__generic 
matrix WavePrefixSum(matrix expr);\n") @@ -1575,7 +1614,10 @@ SLANG_RAW("T WaveMultiPrefixBitXor(T expr);\n") SLANG_RAW("__generic vector WaveMultiPrefixBitXor(vector expr);\n") SLANG_RAW("__generic matrix WaveMultiPrefixBitXor(matrix expr);\n") SLANG_RAW("\n") -SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0) & __lanemask_lt())\")\n") +SLANG_RAW("// TODO(JS): This takes uvec4 parameter on GLSL\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_ballot)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupBallotExclusiveBitCount($0)\")\n") +SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0) & _getLaneLtMask())\")\n") SLANG_RAW("uint WavePrefixCountBits(bool value);\n") SLANG_RAW("\n") SLANG_RAW("uint WaveMultiPrefixCountBits(bool value, uint4 mask);\n") @@ -1589,12 +1631,16 @@ SLANG_RAW("__generic vector WaveM SLANG_RAW("__generic matrix WaveMultiPrefixSum(matrix value, uint4 mask);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_ballot)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupBroadcastFirst($0)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"_waveReadFirst($0)\")\n") SLANG_RAW("T WaveReadLaneFirst(T expr);\n") SLANG_RAW("__generic vector WaveReadLaneFirst(vector expr);\n") SLANG_RAW("__generic matrix WaveReadLaneFirst(matrix expr);\n") SLANG_RAW("\n") SLANG_RAW("__generic\n") +SLANG_RAW("__glsl_extension(GL_KHR_shader_subgroup_ballot)\n") +SLANG_RAW("__target_intrinsic(glsl, \"subgroupBroadcast($0, $1)\")\n") SLANG_RAW("__target_intrinsic(cuda, \"__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)\")\n") SLANG_RAW("T WaveReadLaneAt(T value, int lane);\n") SLANG_RAW("__generic vector WaveReadLaneAt(vector value, int lane);\n") @@ -1686,7 +1732,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa) sb << "};\n"; } -SLANG_RAW("#line 1613 \"hlsl.meta.slang\"") +SLANG_RAW("#line 1659 \"hlsl.meta.slang\"") SLANG_RAW("\n") SLANG_RAW("\n") SLANG_RAW("\n") -- 
cgit v1.2.3