Performance improvement, `softMax` shader

author: Konstantin <const@const.me> 2023-01-23 14:38:12 +0100
committer: Konstantin <const@const.me> 2023-01-23 14:38:12 +0100
commit: 27dfc3428a7016e2d05dd67b6d8b88c0b982baa9 (patch)
tree: f969d54ebfb266ecf61285a039295a1da37200a0 /ComputeShaders
parent: 01aba39f15a03ed96e034ffc3b6ee9ec12294b0d (diff)
5 files changed, 55 insertions, 87 deletions
diff --git a/ComputeShaders/ComputeShaders.vcxproj b/ComputeShaders/ComputeShaders.vcxproj
index 1d9343d..300974a 100644
--- a/ComputeShaders/ComputeShaders.vcxproj
+++ b/ComputeShaders/ComputeShaders.vcxproj
@@ -202,6 +202,7 @@
     <FxCompile Include="softMax64.hlsl" />
     <FxCompile Include="softMaxCompat.hlsl" />
     <FxCompile Include="softMaxFixed.hlsl" />
+    <FxCompile Include="softMaxLong.hlsl" />
     <FxCompile Include="zeroMemory.hlsl" />
   </ItemGroup>
   <ItemGroup>
diff --git a/ComputeShaders/ComputeShaders.vcxproj.filters b/ComputeShaders/ComputeShaders.vcxproj.filters
index b827710..de9f12f 100644
--- a/ComputeShaders/ComputeShaders.vcxproj.filters
+++ b/ComputeShaders/ComputeShaders.vcxproj.filters
@@ -51,6 +51,7 @@
     <FxCompile Include="matReshapePanels.hlsl" />
     <FxCompile Include="mulMatByRowTiledEx.hlsl" />
     <FxCompile Include="addRepeatEx.hlsl" />
+    <FxCompile Include="softMaxLong.hlsl" />
   </ItemGroup>
   <ItemGroup>
     <None Include="componentwiseBinaryOp.hlsli" />
diff --git a/ComputeShaders/softMax.hlsl b/ComputeShaders/softMax.hlsl
index 6ebd0f2..259e457 100644
--- a/ComputeShaders/softMax.hlsl
+++ b/ComputeShaders/softMax.hlsl
@@ -1,9 +1,6 @@
 // Dispatch [ nr, 1, 1 ] thread groups of this shader
 RWBuffer<float> result: register( u0 );
 
-// table_exp_f16
-Buffer<uint> lookupTable: register( t0 );
-
 cbuffer Constants: register( b0 )
 {
 	uint4 elements: packoffset( c0 );
@@ -12,12 +9,50 @@ cbuffer Constants: register( b0 )
 	float inputScale: packoffset( c2.y );
 }
 
-#include "miscUtils.hlsli"
-#include "groupReduce.hlsli"
+#ifndef THREADS
+static const uint THREADS = 32;
+#endif
+
+groupshared float sharedAccumulators[ THREADS ];
+
+// Reduce `ax` to the maximum over all THREADS threads of the group via `sharedAccumulators`, then broadcast that maximum back into `ax` on every thread. NOTE(review): no barrier follows the final read of sharedAccumulators[ 0 ]; confirm callers cannot overwrite the array before every thread has read it.
+void horizontalMaxBroadcast( const uint thread, inout float ax )
+{
+	sharedAccumulators[ thread ] = ax;
+	for( uint i = THREADS / 2; i > 0; i /= 2 )
+	{
+		GroupMemoryBarrierWithGroupSync();
+		if( thread < i )
+		{
+			ax = max( ax, sharedAccumulators[ thread + i ] );
+			sharedAccumulators[ thread ] = ax;
+		}
+	}
+	GroupMemoryBarrierWithGroupSync();
+	ax = sharedAccumulators[ 0 ];
+}
+
+// Compute the horizontal sum of the numbers across the group. The result is only correct on thread #0: the halving loop stops at i == 2, and thread #0 then adds sharedAccumulators[ 1 ] itself instead of running a final i == 1 pass.
+void horizontalSum( const uint thread, inout float sum )
+{
+	sharedAccumulators[ thread ] = sum;
+	for( uint i = THREADS / 2; i > 1; i /= 2 )
+	{
+		GroupMemoryBarrierWithGroupSync();
+		if( thread < i )
+		{
+			sum += sharedAccumulators[ thread + i ];
+			sharedAccumulators[ thread ] = sum;
+		}
+	}
+	GroupMemoryBarrierWithGroupSync();
+	if( 0 == thread )
+		sum += sharedAccumulators[ 1 ];
+}
 
 static const float negativeInfinity = asfloat( 0xff800000 );
 
-[ numthreads( 32, 1, 1 ) ]
+[numthreads( THREADS, 1, 1 )]
 void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
 {
 	const uint p = group.x * strides[ 1 ];
@@ -26,12 +61,12 @@ void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
 	uint i;
 
 	float m = negativeInfinity;
-	for( i = p + thread; i < pEnd; i += 32 )
+	for( i = p + thread; i < pEnd; i += THREADS )
 		m = max( m, result[ i ] );
 	horizontalMaxBroadcast( thread, m );
 
 	float sum = 0;
-	for( i = p + thread; i < pEnd; i += 32 )
+	for( i = p + thread; i < pEnd; i += THREADS )
 	{
 		float f = result[ i ];
 
@@ -39,14 +74,8 @@ void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
 		if( f != negativeInfinity )
 		{
 			f = ( f - m ) * inputScale;
-#if 1
-			// Similar to Radeon Graphics, computing the exponent on nVidia 1080Ti is also slightly faster than loading from the lookup table
+			// On both Radeon Graphics and nVidia 1080Ti, computing the exponent is slightly faster than loading from the lookup table
 			f = exp( f );
-#else
-			uint s = fp16Rounded( f );
-			s = lookupTable[ s ];
-			f = f16tof32( s );
-#endif
 			sum += f;
 		}
 		else
@@ -62,7 +91,7 @@ void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
 	const float scale = sharedAccumulators[ 0 ];
 
 	// ggml_vec_scale_f32
-	for( i = p + thread; i < pEnd; i += 32 )
+	for( i = p + thread; i < pEnd; i += THREADS )
 	{
 		float f = result[ i ];
 		f *= scale;
diff --git a/ComputeShaders/softMax64.hlsl b/ComputeShaders/softMax64.hlsl
index 7ecd2ef..f73551e 100644
--- a/ComputeShaders/softMax64.hlsl
+++ b/ComputeShaders/softMax64.hlsl
@@ -1,71 +1,2 @@
-// Dispatch [ nr, 1, 1 ] thread groups of this shader
-RWBuffer<float> result: register( u0 );
-
-// table_exp_f16
-Buffer<uint> lookupTable: register( t0 );
-
-cbuffer Constants: register( b0 )
-{
-	uint4 elements: packoffset( c0 );
-	uint4 strides: packoffset( c1 );
-	uint nr: packoffset( c2.x );
-	float inputScale: packoffset( c2.y );
-}
-
-#include "miscUtils.hlsli"
-#include "groupReduce64.hlsli"
-
-static const float negativeInfinity = asfloat( 0xff800000 );
-
-[ numthreads( 64, 1, 1 ) ]
-void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
-{
-	const uint p = group.x * strides[ 1 ];
-	const uint nc = elements[ 0 ];
-	const uint pEnd = p + nc;
-	uint i;
-
-	float m = negativeInfinity;
-	for( i = p + thread; i < pEnd; i += 64 )
-		m = max( m, result[ i ] );
-	horizontalMaxBroadcast( thread, m );
-
-	float sum = 0;
-	for( i = p + thread; i < pEnd; i += 64 )
-	{
-		float f = result[ i ];
-
-		[branch]
-		if( f != negativeInfinity )
-		{
-			f = ( f - m ) * inputScale;
-#if 1
-			// At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance
-			f = exp( f );
-#else
-			uint s = fp16Rounded( f );
-			s = lookupTable[ s ];
-			f = f16tof32( s );
-#endif
-			sum += f;
-		}
-		else
-			f = 0;
-
-		result[ i ] = f;
-	}
-
-	horizontalSum( thread, sum );
-	if( 0 == thread )
-		sharedAccumulators[ 0 ] = 1.0 / sum;
-	GroupMemoryBarrierWithGroupSync();
-	const float scale = sharedAccumulators[ 0 ];
-
-	// ggml_vec_scale_f32
-	for( i = p + thread; i < pEnd; i += 64 )
-	{
-		float f = result[ i ];
-		f *= scale;
-		result[ i ] = f;
-	}
-}
-\ No newline at end of file
+#define THREADS 64
+#include "softMax.hlsl"
+\ No newline at end of file
diff --git a/ComputeShaders/softMaxLong.hlsl b/ComputeShaders/softMaxLong.hlsl
new file mode 100644
index 0000000..1f2c2be
--- /dev/null
+++ b/ComputeShaders/softMaxLong.hlsl
@@ -0,0 +1,6 @@
+// This version is for the "dec.probs" shader tag
+// The input tensor has a size [ 51865, 3 ], a very long tensor with just 3 rows.
+// Even though the shader only runs on 3 GPU cores, a large thread count helps substantially: this shader is about 50% faster.
+#define THREADS 1024
+
+#include "softMax.hlsl"
+\ No newline at end of file
author	Konstantin <const@const.me>	2023-01-23 14:38:12 +0100
committer	Konstantin <const@const.me>	2023-01-23 14:38:12 +0100
commit	27dfc3428a7016e2d05dd67b6d8b88c0b982baa9 (patch)
tree	f969d54ebfb266ecf61285a039295a1da37200a0 /ComputeShaders
parent	01aba39f15a03ed96e034ffc3b6ee9ec12294b0d (diff)