summaryrefslogtreecommitdiffstats
path: root/source/slang
diff options
context:
space:
mode:
authorjsmall-nvidia <jsmall@nvidia.com>2020-03-02 16:18:20 -0500
committerGitHub <noreply@github.com>2020-03-02 16:18:20 -0500
commit8899c149b05def1cce626ea649012c4c974861de (patch)
tree77e97c2997a653ba9262b32f55e9e3f37e166653 /source/slang
parentb85ca6f86d46ee3c4d5784d0bd4ebc8509e2a9bd (diff)
Additional Wave Intrinsic Support (#1252)
* Test for some wave intrinsics. More wave intrinsic support on CUDA. * Use shfl_xor_sync. * Improvements around wave intrinsics. Fix built in integer types belong to __BuiltinIntegerType. * Improvements and fixes around Wave intrinsics. * Added WaveIsFirstLane test. No longer use __wavemask_lt, as appears not available as an intrinsic. * Small fixes to CUDA prelude. * Add wave-active-product test. Handle the special case for arbitray sums. * Used macro to implement CUDA wave intrinsics.
Diffstat (limited to 'source/slang')
-rw-r--r--source/slang/core.meta.slang8
-rw-r--r--source/slang/core.meta.slang.h16
-rw-r--r--source/slang/hlsl.meta.slang58
-rw-r--r--source/slang/hlsl.meta.slang.h62
4 files changed, 105 insertions, 39 deletions
diff --git a/source/slang/core.meta.slang b/source/slang/core.meta.slang
index 70bc90392..6822d304b 100644
--- a/source/slang/core.meta.slang
+++ b/source/slang/core.meta.slang
@@ -132,9 +132,12 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
case BaseType::Half:
case BaseType::Float:
case BaseType::Double:
- sb << "\n , __BuiltinFloatingPointType\n";
+ sb << "\n , __BuiltinFloatingPointType\n";
sb << "\n , __BuiltinRealType\n";
- ; // fall through to:
+ sb << "\n , __BuiltinSignedArithmeticType\n";
+ sb << "\n , __BuiltinArithmeticType\n";
+ sb << "\n , __BuiltinType\n";
+ break;
case BaseType::Int8:
case BaseType::Int16:
case BaseType::Int:
@@ -146,6 +149,7 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
case BaseType::UInt:
case BaseType::UInt64:
sb << "\n , __BuiltinArithmeticType\n";
+ sb << "\n , __BuiltinIntegerType\n";
; // fall through to:
case BaseType::Bool:
sb << "\n , __BuiltinType\n";
diff --git a/source/slang/core.meta.slang.h b/source/slang/core.meta.slang.h
index 4c8da2a9a..3ff1fd243 100644
--- a/source/slang/core.meta.slang.h
+++ b/source/slang/core.meta.slang.h
@@ -135,9 +135,12 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
case BaseType::Half:
case BaseType::Float:
case BaseType::Double:
- sb << "\n , __BuiltinFloatingPointType\n";
+ sb << "\n , __BuiltinFloatingPointType\n";
sb << "\n , __BuiltinRealType\n";
- ; // fall through to:
+ sb << "\n , __BuiltinSignedArithmeticType\n";
+ sb << "\n , __BuiltinArithmeticType\n";
+ sb << "\n , __BuiltinType\n";
+ break;
case BaseType::Int8:
case BaseType::Int16:
case BaseType::Int:
@@ -149,6 +152,7 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
case BaseType::UInt:
case BaseType::UInt64:
sb << "\n , __BuiltinArithmeticType\n";
+ sb << "\n , __BuiltinIntegerType\n";
; // fall through to:
case BaseType::Bool:
sb << "\n , __BuiltinType\n";
@@ -195,7 +199,7 @@ for (int tt = 0; tt < kBaseTypeCount; ++tt)
// TODO: should this cover the full gamut of integer types?
case BaseType::Int:
case BaseType::UInt:
-SLANG_RAW("#line 195 \"core.meta.slang\"")
+SLANG_RAW("#line 199 \"core.meta.slang\"")
SLANG_RAW("\n")
SLANG_RAW(" __generic<T:__EnumType>\n")
SLANG_RAW(" __init(T value);\n")
@@ -211,7 +215,7 @@ SLANG_RAW(" __init(T value);\n")
// Declare built-in pointer type
// (eventually we can have the traditional syntax sugar for this)
-SLANG_RAW("#line 210 \"core.meta.slang\"")
+SLANG_RAW("#line 214 \"core.meta.slang\"")
SLANG_RAW("\n")
SLANG_RAW("\n")
SLANG_RAW("__generic<T>\n")
@@ -273,7 +277,7 @@ sb << " __init(T value);\n";
sb << " __init(vector<T,N> value);\n";
sb << "};\n";
-SLANG_RAW("#line 256 \"core.meta.slang\"")
+SLANG_RAW("#line 260 \"core.meta.slang\"")
SLANG_RAW("\n")
SLANG_RAW("\n")
SLANG_RAW("__generic<T = float, let R : int = 4, let C : int = 4>\n")
@@ -1509,7 +1513,7 @@ for (auto op : binaryOps)
sb << "__intrinsic_op(" << int(op.opCode) << ") matrix<" << resultType << ",N,M> operator" << op.opName << "(" << leftQual << "matrix<" << leftType << ",N,M> left, " << rightType << " right);\n";
}
}
-SLANG_RAW("#line 1491 \"core.meta.slang\"")
+SLANG_RAW("#line 1495 \"core.meta.slang\"")
SLANG_RAW("\n")
SLANG_RAW("\n")
SLANG_RAW("// Specialized function\n")
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index c8aae9158..edb678ad6 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -1395,35 +1395,51 @@ __generic<T : __BuiltinType> T QuadReadAcrossDiagonal(T localValue);
__generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);
-__generic<T : __BuiltinIntegerType> T WaveActiveBitAnd(T expr);
+__generic<T : __BuiltinIntegerType>
+__target_intrinsic(cuda, "_waveAnd(__activemask(), $0)")
+T WaveActiveBitAnd(T expr);
__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitAnd(vector<T,N> expr);
__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr);
-__generic<T : __BuiltinIntegerType> T WaveActiveBitOr(T expr);
+__generic<T : __BuiltinIntegerType>
+__target_intrinsic(cuda, "_waveOr(__activemask(), $0)")
+T WaveActiveBitOr(T expr);
__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitOr(vector<T,N> expr);
__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr);
-__generic<T : __BuiltinIntegerType> T WaveActiveBitXor(T expr);
+__generic<T : __BuiltinIntegerType>
+__target_intrinsic(cuda, "_waveXor(__activemask(), $0)")
+T WaveActiveBitXor(T expr);
__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitXor(vector<T,N> expr);
__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr);
-__generic<T : __BuiltinArithmeticType> T WaveActiveMax(T expr);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(cuda, "_waveMax(__activemask(), $0)")
+T WaveActiveMax(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMax(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr);
-__generic<T : __BuiltinArithmeticType> T WaveActiveMin(T expr);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(cuda, "_waveMin(__activemask(), $0)")
+T WaveActiveMin(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMin(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr);
-__generic<T : __BuiltinArithmeticType> T WaveActiveProduct(T expr);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(cuda, "_waveProduct(__activemask(), $0)")
+T WaveActiveProduct(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveProduct(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr);
-__generic<T : __BuiltinArithmeticType> T WaveActiveSum(T expr);
+__generic<T : __BuiltinArithmeticType>
+__target_intrinsic(cuda, "_waveSum(__activemask(), $0)")
+T WaveActiveSum(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveSum(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr);
-__generic<T : __BuiltinType> bool WaveActiveAllEqual(T value);
+__generic<T : __BuiltinType>
+__target_intrinsic(cuda, "_waveAllEqual(__activemask(), $0)")
+bool WaveActiveAllEqual(T value);
__generic<T : __BuiltinType, let N : int> vector<bool,N> WaveActiveAllEqual(vector<T,N> value);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value);
@@ -1438,7 +1454,7 @@ __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,
__target_intrinsic(cuda, "(__all_sync(__activemask(), $0) != 0)")
bool WaveActiveAllTrue(bool condition);
-__target_intrinsic(cuda, "(_any_sync(__activemask(), $0) != 0)")
+__target_intrinsic(cuda, "(__any_sync(__activemask(), $0) != 0)")
bool WaveActiveAnyTrue(bool condition);
__target_intrinsic(cuda, "make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)")
@@ -1454,14 +1470,19 @@ __target_intrinsic(cuda, "_getLaneId()")
uint WaveGetLaneIndex();
// If there are no *active* lanes less than this one, we must be the lowest lane
-__target_intrinsic(cuda, "((__activemask() & __lanemask_lt()) == 0)")
+__target_intrinsic(cuda, "_waveIsFirstLane()")
bool WaveIsFirstLane();
-__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);
+// TODO(JS): We cannot calculate prefix sums using a mask of __activemask() & __lanemask_lt(), because (amongst other reasons)
+// that would mean different lanes having a different mask, and they all have to have the same mask.
+
+__generic<T : __BuiltinArithmeticType>
+T WavePrefixProduct(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixProduct(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr);
-__generic<T : __BuiltinArithmeticType> T WavePrefixSum(T expr);
+__generic<T : __BuiltinArithmeticType>
+T WavePrefixSum(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixSum(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr);
@@ -1473,11 +1494,14 @@ __generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitOr(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr);
-__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitXor(T expr);
+__generic<T : __BuiltinArithmeticType>
+T WaveMultiPrefixBitXor(T expr);
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr);
+__target_intrinsic(cuda, "__popc(__ballot_sync(__activemask(), $0) & __lanemask_lt())")
uint WavePrefixCountBits(bool value);
+
uint WaveMultiPrefixCountBits(bool value, uint4 mask);
__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixProduct(T value, uint4 mask);
@@ -1488,11 +1512,15 @@ __generic<T : __BuiltinArithmeticType> T WaveMultiPrefixSum(T value, uint4 mask)
__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);
__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);
-__generic<T : __BuiltinType> T WaveReadLaneFirst(T expr);
+__generic<T : __BuiltinType>
+__target_intrinsic(cuda, "_waveReadFirst($0)")
+T WaveReadLaneFirst(T expr);
__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneFirst(vector<T,N> expr);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);
-__generic<T : __BuiltinType> T WaveReadLaneAt(T value, int lane);
+__generic<T : __BuiltinType>
+__target_intrinsic(cuda, "__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)")
+T WaveReadLaneAt(T value, int lane);
__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);
diff --git a/source/slang/hlsl.meta.slang.h b/source/slang/hlsl.meta.slang.h
index 69349d9dc..16a3244ab 100644
--- a/source/slang/hlsl.meta.slang.h
+++ b/source/slang/hlsl.meta.slang.h
@@ -1471,35 +1471,51 @@ SLANG_RAW("__generic<T : __BuiltinType> T QuadReadAcrossDiagonal(T localValue);\
SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinIntegerType> T WaveActiveBitAnd(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinIntegerType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveAnd(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveBitAnd(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitAnd(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinIntegerType> T WaveActiveBitOr(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinIntegerType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveOr(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveBitOr(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitOr(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinIntegerType> T WaveActiveBitXor(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinIntegerType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveXor(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveBitXor(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int> vector<T,N> WaveActiveBitXor(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinIntegerType, let N : int, let M : int> matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveMax(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveMax(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveMax(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMax(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveMin(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveMin(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveMin(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveMin(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveProduct(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveProduct(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveProduct(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveProduct(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveActiveSum(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveSum(__activemask(), $0)\")\n")
+SLANG_RAW("T WaveActiveSum(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveActiveSum(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinType> bool WaveActiveAllEqual(T value);\n")
+SLANG_RAW("__generic<T : __BuiltinType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveAllEqual(__activemask(), $0)\")\n")
+SLANG_RAW("bool WaveActiveAllEqual(T value);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<bool,N> WaveActiveAllEqual(vector<T,N> value);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value);\n")
SLANG_RAW("\n")
@@ -1514,7 +1530,7 @@ SLANG_RAW("// seems to be appropriate.\n")
SLANG_RAW("\n")
SLANG_RAW("__target_intrinsic(cuda, \"(__all_sync(__activemask(), $0) != 0)\") \n")
SLANG_RAW("bool WaveActiveAllTrue(bool condition);\n")
-SLANG_RAW("__target_intrinsic(cuda, \"(_any_sync(__activemask(), $0) != 0)\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"(__any_sync(__activemask(), $0) != 0)\")\n")
SLANG_RAW("bool WaveActiveAnyTrue(bool condition);\n")
SLANG_RAW("\n")
SLANG_RAW("__target_intrinsic(cuda, \"make_uint4(__ballot_sync(__activemask(), $0), 0, 0, 0)\")\n")
@@ -1530,14 +1546,19 @@ SLANG_RAW("__target_intrinsic(cuda, \"_getLaneId()\")\n")
SLANG_RAW("uint WaveGetLaneIndex();\n")
SLANG_RAW("\n")
SLANG_RAW("// If there are no *active* lanes less than this one, we must be the lowest lane\n")
-SLANG_RAW("__target_intrinsic(cuda, \"((__activemask() & __lanemask_lt()) == 0)\")\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveIsFirstLane()\")\n")
SLANG_RAW("bool WaveIsFirstLane();\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixProduct(T expr);\n")
+SLANG_RAW("// TODO(JS): We cannot calculate prefix sums using a mask of __activemask() & __lanemask_lt(), because (amongst other reasons)\n")
+SLANG_RAW("// that would mean different lanes having a different mask, and they all have to have the same mask.\n")
+SLANG_RAW("\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("T WavePrefixProduct(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixProduct(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixProduct(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WavePrefixSum(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("T WavePrefixSum(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WavePrefixSum(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
@@ -1549,11 +1570,14 @@ SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitOr(T expr)
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixBitXor(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinArithmeticType>\n")
+SLANG_RAW("T WaveMultiPrefixBitXor(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__popc(__ballot_sync(__activemask(), $0) & __lanemask_lt())\")\n")
SLANG_RAW("uint WavePrefixCountBits(bool value);\n")
+SLANG_RAW("\n")
SLANG_RAW("uint WaveMultiPrefixCountBits(bool value, uint4 mask);\n")
SLANG_RAW("\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixProduct(T value, uint4 mask);\n")
@@ -1564,11 +1588,15 @@ SLANG_RAW("__generic<T : __BuiltinArithmeticType> T WaveMultiPrefixSum(T value,
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int> vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);\n")
SLANG_RAW("__generic<T : __BuiltinArithmeticType, let N : int, let M : int> matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinType> T WaveReadLaneFirst(T expr);\n")
+SLANG_RAW("__generic<T : __BuiltinType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"_waveReadFirst($0)\")\n")
+SLANG_RAW("T WaveReadLaneFirst(T expr);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneFirst(vector<T,N> expr);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr);\n")
SLANG_RAW("\n")
-SLANG_RAW("__generic<T : __BuiltinType> T WaveReadLaneAt(T value, int lane);\n")
+SLANG_RAW("__generic<T : __BuiltinType>\n")
+SLANG_RAW("__target_intrinsic(cuda, \"__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)\")\n")
+SLANG_RAW("T WaveReadLaneAt(T value, int lane);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int> vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane);\n")
SLANG_RAW("__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> WaveReadLaneAt(matrix<T,N,M> value, int lane);\n")
SLANG_RAW("\n")
@@ -1658,7 +1686,7 @@ for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
sb << "};\n";
}
-SLANG_RAW("#line 1585 \"hlsl.meta.slang\"")
+SLANG_RAW("#line 1613 \"hlsl.meta.slang\"")
SLANG_RAW("\n")
SLANG_RAW("\n")
SLANG_RAW("\n")
@@ -1682,6 +1710,8 @@ SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_BACK_FACING_TRIANGLES = 0x1
SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 0x20;\n")
SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_OPAQUE = 0x40;\n")
SLANG_RAW("static const RAY_FLAG RAY_FLAG_CULL_NON_OPAQUE = 0x80;\n")
+SLANG_RAW("static const RAY_FLAG RAY_FLAG_SKIP_TRIANGLES = 0x100;\n")
+SLANG_RAW("static const RAY_FLAG RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES = 0x200;\n")
SLANG_RAW("\n")
SLANG_RAW("// 10.1.2 - Ray Description Structure\n")
SLANG_RAW("\n")