CUDA support for vector/matrix Wave intrinsics (#1266)

* Distinguish between __activeMask and _getConvergedMask(). Remove need to pass in mask to CUDA wave impls. * Add support for vector/matrix Wave intrinsics for CUDA. Fix issue with CUDA parsing of errors. * Fix typo.
author: jsmall-nvidia <jsmall@nvidia.com> 2020-03-09 12:40:04 -0400
committer: GitHub <noreply@github.com> 2020-03-09 12:40:04 -0400
commit: 7e0aa9315f7f65033229c1f76d7df47ccd2da3d0 (patch)
tree: 28ca885d901526ae548895f354626844d305d16f
parent: b1317cd16ab9c827596a28ccf4258ef1bb672d92 (diff)
5 files changed, 191 insertions, 36 deletions
diff --git a/prelude/slang-cuda-prelude.h b/prelude/slang-cuda-prelude.h
index 90e00a631..5f0dffd5c 100644
--- a/prelude/slang-cuda-prelude.h
+++ b/prelude/slang-cuda-prelude.h
@@ -441,6 +441,33 @@ __forceinline__ __device__ uint32_t _getLaneId()
 }
 #endif
 
+// It appears that the __activemask() cannot always be used because 
+// threads need to be converged. 
+// 
+// For CUDA the article claims mask has to be used carefully
+// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
+// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. 
+// __activemask() though does not require there is convergence, so that doesn't work.
+// 
+// '__ballot_sync' produces a convergance. 
+// 
+// From the CUDA docs:
+// ```For __all_sync, __any_sync, and __ballot_sync, a mask must be passed that specifies the threads 
+// participating in the call. A bit, representing the thread's lane ID, must be set for each participating thread 
+// to ensure they are properly converged before the intrinsic is executed by the hardware. All active threads named 
+// in mask must execute the same intrinsic with the same mask, or the result is undefined.```
+//
+// To get the right results we need to use the __activemask() within _ballot_sync it seems.
+// 
+// Also note that __all_sync and __any_sync are listed with __ballot_sync. That if they have a similar synchronizing behavior
+// we can use __activemask() there (instead of _getConvergedMask), because they will converge too. 
+__forceinline__ __device__ int _getConvergedMask()
+{
+    //return __activemask(); 
+    //return __ballot_sync(SLANG_CUDA_WARP_MASK, true);
+    return __ballot_sync(__activemask(), true);
+}
+
 // Return mask of all the lanes less than the current lane
 __forceinline__ __device__ int _getLaneLtMask()
 {
@@ -488,7 +515,6 @@ __inline__ __device__ bool _waveIsFirstLane()
     return (mask & 1 ) || ((__ffs(mask) - 1) == _getLaneId());
 }
 
-
 template <typename T>
 struct WaveOpOr
 {
@@ -538,24 +564,63 @@ struct WaveOpMin
     __inline__ __device__ static T doOp(T a, T b) { return a < b ? a : b; }
 };
 
+template <typename T>
+struct ElementTypeTrait;
+
+// Scalar
+template <> struct ElementTypeTrait<int> { typedef int Type; };
+template <> struct ElementTypeTrait<uint> { typedef uint Type; };
+template <> struct ElementTypeTrait<float> { typedef float Type; };
+template <> struct ElementTypeTrait<double> { typedef double Type; };
+template <> struct ElementTypeTrait<uint64_t> { typedef uint64_t Type; };
+template <> struct ElementTypeTrait<int64_t> { typedef int64_t Type; };
+
+// Vector
+template <> struct ElementTypeTrait<int1> { typedef int Type; };
+template <> struct ElementTypeTrait<int2> { typedef int Type; };
+template <> struct ElementTypeTrait<int3> { typedef int Type; };
+template <> struct ElementTypeTrait<int4> { typedef int Type; };
+
+template <> struct ElementTypeTrait<uint1> { typedef uint Type; };
+template <> struct ElementTypeTrait<uint2> { typedef uint Type; };
+template <> struct ElementTypeTrait<uint3> { typedef uint Type; };
+template <> struct ElementTypeTrait<uint4> { typedef uint Type; };
+
+template <> struct ElementTypeTrait<float1> { typedef float Type; };
+template <> struct ElementTypeTrait<float2> { typedef float Type; };
+template <> struct ElementTypeTrait<float3> { typedef float Type; };
+template <> struct ElementTypeTrait<float4> { typedef float Type; };
+
+template <> struct ElementTypeTrait<double1> { typedef double Type; };
+template <> struct ElementTypeTrait<double2> { typedef double Type; };
+template <> struct ElementTypeTrait<double3> { typedef double Type; };
+template <> struct ElementTypeTrait<double4> { typedef double Type; };
+
+// Matrix
+template <typename T, int ROWS, int COLS> 
+struct ElementTypeTrait<Matrix<T, ROWS, COLS> >  
+{ 
+    typedef T Type; 
+};
+
 // Scalar 
 template <typename INTF, typename T>
-__device__ T _waveReduce(int mask, T val)
+__device__ T _waveReduceScalar(T val)
 {
+    // The shuffles appear to converge on set bits, so it appears ok to use __activemask()
+    //const int mask = _getConvergedMask();
+    const int mask = __activemask();
+    
     const int offsetSize = _waveCalcPow2Offset(mask);
     if (offsetSize > 0)
     {
+        // Fast path O(log2(activeLanes)) 
         for (int offset = offsetSize >> 1; offset > 0; offset >>= 1)
         {
             val = INTF::doOp(val, __shfl_xor_sync(mask, val, offset));
         }
-        return val;
-    }
-    else if (_waveIsSingleLane(mask))
-    {
-        return val;
     }
-    else
+    else if (!_waveIsSingleLane(mask))
     {
         T result = INTF::getInitial(val);
         int remaining = mask;
@@ -570,33 +635,115 @@ __device__ T _waveReduce(int mask, T val)
         }
         return result;
     }
+    return val;
+}
+
+
+// Multiple values
+template <typename INTF, typename T, size_t COUNT>
+__device__ void _waveReduceMultiple(T* val)
+{
+    // The shuffles appear to converge on set bits, so it appears ok to use __activemask()
+    //const int mask = _getConvergedMask();
+    const int mask = __activemask();
+    
+    const int offsetSize = _waveCalcPow2Offset(mask);
+    if (offsetSize > 0)
+    {
+        // Fast path O(log2(activeLanes)) 
+        for (int offset = offsetSize >> 1; offset > 0; offset >>= 1)
+        {
+            for (size_t i = 0; i < COUNT; ++i)
+            {
+                val[i] = INTF::doOp(val[i], __shfl_xor_sync(mask, val[i], offset));
+            }
+        }
+    }
+    else if (!_waveIsSingleLane(mask))
+    {
+        // Copy the original
+        T originalVal[COUNT];
+        for (size_t i = 0; i < COUNT; ++i)
+        {
+            const T v = val[i];
+            originalVal[i] = v;
+            val[i] = INTF::getInitial(v);
+        }
+        
+        int remaining = mask;
+        while (remaining)
+        {
+            const int laneBit = remaining & -remaining;
+            /* Get the sourceLane */
+            const int srcLane = __ffs(laneBit) - 1;
+            /* Broadcast (can also broadcast to self) */
+            for (size_t i = 0; i < COUNT; ++i)
+            {
+                val[i] = INTF::doOp(val[i], __shfl_sync(mask, originalVal[i], srcLane));
+            }
+            remaining &= ~laneBit;
+        }
+    }
+}
+
+template <typename INTF, typename T>
+__device__ void _waveReduceMultiple(T* val)
+{
+    typedef typename ElementTypeTrait<T>::Type ElemType;    
+    _waveReduceMultiple<INTF, ElemType, sizeof(T) / sizeof(ElemType)>((ElemType*)val);
 }
 
 template <typename T>
-__inline__ __device__  T _waveOr(int mask, T val) { return _waveReduce<WaveOpOr<T>, T>(mask, val); }
+__inline__ __device__  T _waveOr(T val) { return _waveReduceScalar<WaveOpOr<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveAnd(T val) { return _waveReduceScalar<WaveOpAnd<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveXor(T val) { return _waveReduceScalar<WaveOpXor<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveProduct(T val) { return _waveReduceScalar<WaveOpMul<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveSum(T val) { return _waveReduceScalar<WaveOpAdd<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveMin(T val) { return _waveReduceScalar<WaveOpMin<T>, T>(val); }
+
+template <typename T>
+__inline__ __device__ T _waveMax(T val) { return _waveReduceScalar<WaveOpMax<T>, T>(val); }
+
+
+// Multiple
+
+template <typename T>
+__inline__ __device__  T _waveOrMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpOr<ElemType> >(&val); return val; }
 
 template <typename T>
-__inline__ __device__ T _waveAnd(int mask, T val) { return _waveReduce<WaveOpAnd<T>, T>(mask, val); }
+__inline__ __device__  T _waveAndMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAnd<ElemType> >(&val); return val; }
 
 template <typename T>
-__inline__ __device__ T _waveXor(int mask, T val) { return _waveReduce<WaveOpXor<T>, T>(mask, val); }
+__inline__ __device__  T _waveXorMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpXor<ElemType> >(&val); return val; }
 
 template <typename T>
-__inline__ __device__ T _waveProduct(int mask, T val) { return _waveReduce<WaveOpMul<T>, T>(mask, val); }
+__inline__ __device__  T _waveProductMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMul<ElemType> >(&val); return val; }
 
 template <typename T>
-__inline__ __device__ T _waveSum(int mask, T val) { return _waveReduce<WaveOpAdd<T>, T>(mask, val); }
+__inline__ __device__  T _waveSumMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpAdd<ElemType> >(&val); return val; }
 
 template <typename T>
-__inline__ __device__ T _waveMin(int mask, T val) { return _waveReduce<WaveOpMin<T>, T>(mask, val); }
+__inline__ __device__  T _waveMinMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMin<ElemType> >(&val); return val; }
 
 template <typename T>
-__inline__ __device__ T _waveMax(int mask, T val) { return _waveReduce<WaveOpMax<T>, T>(mask, val); }
+__inline__ __device__  T _waveMaxMultiple(T val) { typedef typename ElementTypeTrait<T>::Type ElemType; _waveReduceMultiple<WaveOpMax<ElemType> >(&val); return val; }
 
 
 template <typename T>
-__inline__ __device__ bool _waveAllEqual(int mask, T val) 
+__inline__ __device__ bool _waveAllEqual(T val) 
 {
+    // __match_all_sync is a synchronises so can use __activemask()
+    const int mask = __activemask();
     int pred;
     __match_all_sync(mask, val, &pred);
     return pred != 0;
diff --git a/source/core/slang-nvrtc-compiler.cpp b/source/core/slang-nvrtc-compiler.cpp
index 2f9944786..db4e4f32f 100644
--- a/source/core/slang-nvrtc-compiler.cpp
+++ b/source/core/slang-nvrtc-compiler.cpp
@@ -204,7 +204,7 @@ static SlangResult _parseNVRTCLine(const UnownedStringSlice& line, DownstreamDia
         StringUtil::split(line, ':', split);
     }
 
-    if (split.getCount() == 3)
+    if (split.getCount() >= 3)
     {
         // tests/cuda/cuda-compile.cu(7): warning: variable "c" is used before its value is set
 
diff --git a/source/slang/hlsl.meta.slang b/source/slang/hlsl.meta.slang
index d9e40dd4f..39cea9ba3 100644
--- a/source/slang/hlsl.meta.slang
+++ b/source/slang/hlsl.meta.slang
@@ -2322,123 +2322,134 @@ __generic<T : __BuiltinIntegerType>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupAnd($0)")
-__target_intrinsic(cuda, "_waveAnd(__activemask(), $0)")
+__target_intrinsic(cuda, "_waveAnd($0)")
 T WaveActiveBitAnd(T expr);
 __generic<T : __BuiltinIntegerType, let N : int>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupAnd($0)")
+__target_intrinsic(cuda, "_waveAndMultiple($0)")
 vector<T,N> WaveActiveBitAnd(vector<T,N> expr);
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveAndMultiple($0)")
 matrix<T,N,M> WaveActiveBitAnd(matrix<T,N,M> expr);
 
 __generic<T : __BuiltinIntegerType>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupOr($0)")
-__target_intrinsic(cuda, "_waveOr(__activemask(), $0)")
+__target_intrinsic(cuda, "_waveOr($0)")
 T WaveActiveBitOr(T expr);
 __generic<T : __BuiltinIntegerType, let N : int>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupOr($0)")
+__target_intrinsic(cuda, "_waveOrMultiple($0)")
 vector<T,N> WaveActiveBitOr(vector<T,N> expr);
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveOrMultiple($0)")
 matrix<T,N,M> WaveActiveBitOr(matrix<T,N,M> expr);
 
 __generic<T : __BuiltinIntegerType>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupXor($0)")
-__target_intrinsic(cuda, "_waveXor(__activemask(), $0)")
+__target_intrinsic(cuda, "_waveXor($0)")
 T WaveActiveBitXor(T expr);
 __generic<T : __BuiltinIntegerType, let N : int> 
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupXor($0)")
+__target_intrinsic(cuda, "_waveXorMultiple($0)")
 vector<T,N> WaveActiveBitXor(vector<T,N> expr);
 __generic<T : __BuiltinIntegerType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveXorMultiple($0)")
 matrix<T,N,M> WaveActiveBitXor(matrix<T,N,M> expr);
 
 __generic<T : __BuiltinArithmeticType>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupMax($0)")
-__target_intrinsic(cuda, "_waveMax(__activemask(), $0)")
+__target_intrinsic(cuda, "_waveMax($0)")
 T WaveActiveMax(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupMax($0)")
+__target_intrinsic(cuda, "_waveMaxMultiple($0)")
 vector<T,N> WaveActiveMax(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveMaxMultiple($0)")
 matrix<T,N,M> WaveActiveMax(matrix<T,N,M> expr);
 
 __generic<T : __BuiltinArithmeticType>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupMin($0)")
-__target_intrinsic(cuda, "_waveMin(__activemask(), $0)")
+__target_intrinsic(cuda, "_waveMin($0)")
 T WaveActiveMin(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupMin($0)")
+__target_intrinsic(cuda, "_waveMinMultiple($0)")
 vector<T,N> WaveActiveMin(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveMinMultiple($0)")
 matrix<T,N,M> WaveActiveMin(matrix<T,N,M> expr);
 
 __generic<T : __BuiltinArithmeticType>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupMul($0)")
-__target_intrinsic(cuda, "_waveProduct(__activemask(), $0)")
+__target_intrinsic(cuda, "_waveProduct($0)")
 T WaveActiveProduct(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupMul($0)")
+__target_intrinsic(cuda, "_waveProductMultiple($0)")
 vector<T,N> WaveActiveProduct(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveProductMultiple($0)")
 matrix<T,N,M> WaveActiveProduct(matrix<T,N,M> expr);
 
 __generic<T : __BuiltinArithmeticType>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupAdd($0)")
-__target_intrinsic(cuda, "_waveSum(__activemask(), $0)")
+__target_intrinsic(cuda, "_waveSum($0)")
 T WaveActiveSum(T expr);
 __generic<T : __BuiltinArithmeticType, let N : int>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupAdd($0)")
+__target_intrinsic(cuda, "_waveSumMultiple($0)")
 vector<T,N> WaveActiveSum(vector<T,N> expr);
 __generic<T : __BuiltinArithmeticType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveSumMultiple($0)")
 matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr);
 
 __generic<T : __BuiltinType>
 __glsl_extension(GL_KHR_shader_subgroup_vote)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupAllEqual($0)")
-__target_intrinsic(cuda, "_waveAllEqual(__activemask(), $0)")
+__target_intrinsic(cuda, "_waveAllEqual($0)")
 bool WaveActiveAllEqual(T value);
 __generic<T : __BuiltinType, let N : int> 
 __glsl_extension(GL_KHR_shader_subgroup_vote)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupAllEqual($0)")
+__target_intrinsic(cuda, "_waveAllEqualMultiple($0)")
 vector<bool,N> WaveActiveAllEqual(vector<T,N> value);
 __generic<T : __BuiltinType, let N : int, let M : int>
+__target_intrinsic(cuda, "_waveAllEqualMultiple($0)")
 matrix<bool,N,M> WaveActiveAllEqual(matrix<T,N,M> value);
 
 __generic<T : __BuiltinType> uint4 WaveMatch(T value);
 __generic<T : __BuiltinType, let N : int> uint4 WaveMatch(vector<T,N> value);
 __generic<T : __BuiltinType, let N : int, let M : int> uint4 WaveMatch(matrix<T,N,M> value);
 
-// TODO(JS): For CUDA the article claims mask has to be used carefully
-// https://devblogs.nvidia.com/using-cuda-warp-level-primitives/
-// With the Warp intrinsics there is no mask, and it's just the 'active lanes'. So __activemask()
-// seems to be appropriate.
-
 __glsl_extension(GL_KHR_shader_subgroup_vote)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupAll($0)") 
@@ -2483,9 +2494,6 @@ __target_intrinsic(glsl, "subgroupElect()")
 __target_intrinsic(cuda, "_waveIsFirstLane()")
 bool WaveIsFirstLane();
 
-// TODO(JS): We cannot calculate prefix sums using a mask of __activemask() & __lanemask_lt(), because (amongst other reasons)
-// that would mean different lanes having a different mask, and they all have to have the same mask.
-
 __generic<T : __BuiltinArithmeticType>
 __glsl_extension(GL_KHR_shader_subgroup_arithmetic)
 __spirv_version(1.3)
@@ -2585,7 +2593,7 @@ __generic<T : __BuiltinType>
 __glsl_extension(GL_KHR_shader_subgroup_ballot)
 __spirv_version(1.3)
 __target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
-__target_intrinsic(cuda, "__shfl_sync(SLANG_CUDA_WARP_MASK, $0, $1)")
+__target_intrinsic(cuda, "__shfl_sync(_activemask(), $0, $1)")
 T WaveReadLaneAt(T value, int lane);
 __generic<T : __BuiltinType, let N : int>
 __spirv_version(1.3)
diff --git a/tests/hlsl-intrinsic/wave-matrix.slang b/tests/hlsl-intrinsic/wave-matrix.slang
index 022182164..b5af69f5d 100644
--- a/tests/hlsl-intrinsic/wave-matrix.slang
+++ b/tests/hlsl-intrinsic/wave-matrix.slang
@@ -2,7 +2,7 @@
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
 //DISABLE_TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
 
 //TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
 RWStructuredBuffer<int> outputBuffer;
diff --git a/tests/hlsl-intrinsic/wave-vector.slang b/tests/hlsl-intrinsic/wave-vector.slang
index 808f0c5f6..8d2868600 100644
--- a/tests/hlsl-intrinsic/wave-vector.slang
+++ b/tests/hlsl-intrinsic/wave-vector.slang
@@ -2,7 +2,7 @@
 //DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-slang -compute
 //TEST(compute):COMPARE_COMPUTE_EX:-slang -compute -dx12 -use-dxil -profile cs_6_0
 //TEST(compute, vulkan):COMPARE_COMPUTE_EX:-vk -compute
-//DISABLE_TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
+//TEST(compute):COMPARE_COMPUTE_EX:-cuda -compute
 
 //TEST_INPUT:ubuffer(data=[0 0 0 0 0 0 0 0], stride=4):out,name outputBuffer
 RWStructuredBuffer<int> outputBuffer;
author	jsmall-nvidia <jsmall@nvidia.com>	2020-03-09 12:40:04 -0400
committer	GitHub <noreply@github.com>	2020-03-09 12:40:04 -0400
commit	7e0aa9315f7f65033229c1f76d7df47ccd2da3d0 (patch)
tree	28ca885d901526ae548895f354626844d305d16f
parent	b1317cd16ab9c827596a28ccf4258ef1bb672d92 (diff)