From 7e0aa9315f7f65033229c1f76d7df47ccd2da3d0 Mon Sep 17 00:00:00 2001 From: jsmall-nvidia Date: Mon, 9 Mar 2020 12:40:04 -0400 Subject: CUDA support for vector/matrix Wave intrinsics (#1266) * Distinguish between __activeMask and _getConvergedMask(). Remove need to pass in mask to CUDA wave impls. * Add support for vector/matrix Wave intrinsics for CUDA. Fix issue with CUDA parsing of errors. * Fix typo. --- source/core/slang-nvrtc-compiler.cpp | 2 +- source/slang/hlsl.meta.slang | 42 +++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 18 deletions(-) (limited to 'source') diff --git a/source/core/slang-nvrtc-compiler.cpp b/source/core/slang-nvrtc-compiler.cpp index 2f9944786..db4e4f32f 100644 --- a/source/core/slang-nvrtc-compiler.cpp +++ b/source/core/slang-nvrtc-compiler.cpp @@ -204,7 +204,7 @@ static SlangResult _parseNVRTCLine(const UnownedStringSlice& line, DownstreamDia StringUtil::split(line, ':', split); } - if (split.getCount() == 3) + if (split.getCount() >= 3) { // tests/cuda/cuda-compile.cu(7): warning: variable "c" is used before its value is set diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang index d9e40dd4f..39cea9ba3 100644 --- a/source/slang/hlsl.meta.slang +++ b/source/slang/hlsl.meta.slang @@ -2322,123 +2322,134 @@ __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAnd($0)") -__target_intrinsic(cuda, "_waveAnd(__activemask(), $0)") +__target_intrinsic(cuda, "_waveAnd($0)") T WaveActiveBitAnd(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAnd($0)") +__target_intrinsic(cuda, "_waveAndMultiple($0)") vector WaveActiveBitAnd(vector expr); __generic +__target_intrinsic(cuda, "_waveAndMultiple($0)") matrix WaveActiveBitAnd(matrix expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupOr($0)") -__target_intrinsic(cuda, "_waveOr(__activemask(), $0)") +__target_intrinsic(cuda, "_waveOr($0)") T WaveActiveBitOr(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupOr($0)") +__target_intrinsic(cuda, "_waveOrMultiple($0)") vector WaveActiveBitOr(vector expr); __generic +__target_intrinsic(cuda, "_waveOrMultiple($0)") matrix WaveActiveBitOr(matrix expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupXor($0)") -__target_intrinsic(cuda, "_waveXor(__activemask(), $0)") +__target_intrinsic(cuda, "_waveXor($0)") T WaveActiveBitXor(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupXor($0)") +__target_intrinsic(cuda, "_waveXorMultiple($0)") vector WaveActiveBitXor(vector expr); __generic +__target_intrinsic(cuda, "_waveXorMultiple($0)") matrix WaveActiveBitXor(matrix expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMax($0)") -__target_intrinsic(cuda, "_waveMax(__activemask(), $0)") +__target_intrinsic(cuda, "_waveMax($0)") T WaveActiveMax(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMax($0)") +__target_intrinsic(cuda, "_waveMaxMultiple($0)") vector WaveActiveMax(vector expr); __generic +__target_intrinsic(cuda, "_waveMaxMultiple($0)") matrix WaveActiveMax(matrix expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMin($0)") -__target_intrinsic(cuda, "_waveMin(__activemask(), $0)") +__target_intrinsic(cuda, "_waveMin($0)") T WaveActiveMin(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMin($0)") +__target_intrinsic(cuda, "_waveMinMultiple($0)") vector WaveActiveMin(vector expr); __generic +__target_intrinsic(cuda, "_waveMinMultiple($0)") matrix WaveActiveMin(matrix expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMul($0)") -__target_intrinsic(cuda, "_waveProduct(__activemask(), $0)") +__target_intrinsic(cuda, "_waveProduct($0)") T WaveActiveProduct(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupMul($0)") +__target_intrinsic(cuda, "_waveProductMultiple($0)") vector WaveActiveProduct(vector expr); __generic +__target_intrinsic(cuda, "_waveProductMultiple($0)") matrix WaveActiveProduct(matrix expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAdd($0)") -__target_intrinsic(cuda, "_waveSum(__activemask(), $0)") +__target_intrinsic(cuda, "_waveSum($0)") T WaveActiveSum(T expr); __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAdd($0)") +__target_intrinsic(cuda, "_waveSumMultiple($0)") vector WaveActiveSum(vector expr); __generic +__target_intrinsic(cuda, "_waveSumMultiple($0)") matrix WaveActiveSum(matrix expr); __generic __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAllEqual($0)") -__target_intrinsic(cuda, "_waveAllEqual(__activemask(), $0)") +__target_intrinsic(cuda, "_waveAllEqual($0)") bool WaveActiveAllEqual(T value); __generic __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAllEqual($0)") +__target_intrinsic(cuda, "_waveAllEqualMultiple($0)") vector WaveActiveAllEqual(vector value); __generic +__target_intrinsic(cuda, "_waveAllEqualMultiple($0)") matrix WaveActiveAllEqual(matrix value); __generic uint4 WaveMatch(T value); __generic uint4 WaveMatch(vector value); __generic uint4 WaveMatch(matrix value); -// TODO(JS): For CUDA the article claims mask has to be used carefully -// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/ -// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. So __activemask() -// seems to be appropriate. - __glsl_extension(GL_KHR_shader_subgroup_vote) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupAll($0)") @@ -2483,9 +2494,6 @@ __target_intrinsic(glsl, "subgroupElect()") __target_intrinsic(cuda, "_waveIsFirstLane()") bool WaveIsFirstLane(); -// TODO(JS): We cannot calculate prefix sums using a mask of __activemask() & __lanemask_lt(), because (amongst other reasons) -// that would mean different lanes having a different mask, and they all have to have the same mask. - __generic __glsl_extension(GL_KHR_shader_subgroup_arithmetic) __spirv_version(1.3) @@ -2585,7 +2593,7 @@ __generic __glsl_extension(GL_KHR_shader_subgroup_ballot) __spirv_version(1.3) __target_intrinsic(glsl, "subgroupBroadcast($0, $1)") -__target_intrinsic(cuda, "__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)") +__target_intrinsic(cuda, "__shfl_sync(_activemask(), $0, $1)") T WaveReadLaneAt(T value, int lane); __generic __spirv_version(1.3) -- cgit v1.2.3