GPU performance, matrix multiplication shaders

author: Konstantin <const@const.me> 2023-01-24 16:23:50 +0100
committer: Konstantin <const@const.me> 2023-01-24 16:23:50 +0100
commit: 9d6fb00973218ebc397e15819be3964423b81f91 (patch)
tree: 5337753bb204e827ae8d100cdb648ae61ee7889d
parent: 43e91802254a9b649925fa66d2e15808943e5aaa (diff)
2 files changed, 130 insertions, 129 deletions
diff --git a/ComputeShaders/mulMatByRowTiledEx.hlsl b/ComputeShaders/mulMatByRowTiledEx.hlsl
index d377b8c..bc0c0c6 100644
--- a/ComputeShaders/mulMatByRowTiledEx.hlsl
+++ b/ComputeShaders/mulMatByRowTiledEx.hlsl
@@ -2,9 +2,6 @@
 #ifndef TILE_SIZE
 static const uint TILE_SIZE = 32;
 #endif
-#ifndef TILE_HEIGHT
-static const uint TILE_HEIGHT = 32;
-#endif
 #ifndef THREADS_Y
 static const uint THREADS_Y = 16;
 #endif
@@ -27,130 +24,100 @@ cbuffer Constants: register( b0 )
 	uint4 resultStrides: packoffset( c5 );
 }
 
-groupshared float tileOutput[ THREADS_Y ][ TILE_SIZE ];
-groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ];
-groupshared float tile1[ TILE_HEIGHT ];
-
-void multiplyTiles( const uint3 thread )
+inline uint hadd4( const uint4 v )
 {
-	float r = 0.0;
-	for( uint i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
-	{
-		float a = tile0[ i ][ thread.x ];
-		float b = tile1[ i ];
-		r = mad( a, b, r );
-	}
-	tileOutput[ thread.y ][ thread.x ] += r;
+	const uint2 v2 = v.xy + v.zw;
+	return v2.x + v2.y;
 }
 
-void reduceOutput( const uint3 thread )
+inline float hadd4( const float4 v )
 {
-	float curr = 0.0;
-	[branch]
-	if( thread.y < THREADS_Y / 2 )
-		curr = tileOutput[ thread.y ][ thread.x ];
-
-	for( uint i = THREADS_Y / 2; i > 0; i /= 2 )
-	{
-		[branch]
-		if( thread.y < i )
-		{
-			curr += tileOutput[ thread.y + i ][ thread.x ];
-			tileOutput[ thread.y ][ thread.x ] = curr;
-		}
-		GroupMemoryBarrierWithGroupSync();
-	}
+	const float2 v2 = v.xy + v.zw;
+	return v2.x + v2.y;
 }
 
-void storeTile( const uint threadFlat, const uint4 pos, const uint size )
-{
-	if( threadFlat >= size )
-		return;
-	const uint4 prod4 = pos * resultStrides;
-	const uint2 prod2 = prod4.xy + prod4.zw;
-	uint rdi = prod2.x + prod2.y;
-	result[ rdi + threadFlat ] = tileOutput[ 0 ][ threadFlat ];
-}
+groupshared float reductionBuffer[ THREADS_Y ][ TILE_SIZE ];
 
-[ numthreads( TILE_SIZE, THREADS_Y, 1 ) ]
-void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID, uint threadFlat : SV_GroupIndex )
+[numthreads( TILE_SIZE, THREADS_Y, 1 )]
+void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID )
 {
-	uint i;
-	// Zero all 3 shared buffers
-	tileOutput[ thread.y ][ thread.x ] = 0.0;
-	for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
-		tile0[ i ][ thread.x ] = 0.0;
-	if( threadFlat < THREADS_Y )
-		tile1[ threadFlat ] = 0.0;
-
 	const uint2 layer = group.yz;
-	uint rsi0 = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y;
-	uint rsi1 = layer.x * arg1Strides.z + layer.y * arg1Strides.w;
+	// Source offsets for the complete thread group
+	uint2 rsi;
+	rsi.x = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y;
+	rsi.y = layer.x * arg1Strides.z + layer.y * arg1Strides.w;
+	// Apply source offsets for this particular thread
+	rsi.x += thread.y * TILE_SIZE + thread.x;
+	rsi.y += thread.y * arg1Strides.x;
 
-	const uint threadOffset = thread.y * TILE_SIZE + thread.x;
-	rsi0 += threadOffset;
-	rsi1 += threadFlat * arg1Strides.x;
+	const uint2 rsiInc = uint2( THREADS_Y * TILE_SIZE, THREADS_Y * arg1Strides.x );
 
-	const uint completeTiles = arg0Size.x / TILE_HEIGHT;
+	const uint completeTiles = arg0Size.x / ( THREADS_Y * 4 );
+	uint i;
+	float4 acc = 0.0;
 	for( i = 0; i < completeTiles; i++ )
 	{
-		// Load [ TILE_SIZE, TILE_HEIGHT ] block from the first source tensor into the groupshared buffer
-		for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+		// Each iteration of this loop consumes THREADS_Y*4 columns from the arg0 panel, and THREADS_Y*4 values from arg1
+		float4 v0, v1;
+		[unroll]
+		for( uint j = 0; j < 4; j++, rsi += rsiInc )
 		{
-			tile0[ j ][ thread.x ] = arg0[ rsi0 ];
-			rsi0 += THREADS_Y * TILE_SIZE;
+			// Load [ TILE_SIZE, THREADS_Y ] block from the first source tensor
+			v0[ j ] = arg0[ rsi.x ];
+			// Broadcast [ THREADS_Y ] row from the second source tensor
+			v1[ j ] = arg1[ rsi.y ];
 		}
-		// Load [ TILE_HEIGHT ] row from the second source into another groupshared buffer
-		[ branch ]
-		if( threadFlat < TILE_HEIGHT )
-			tile1[ threadFlat ] = arg1[ rsi1 ];
-		rsi1 += TILE_HEIGHT * arg1Strides.x;
-
-		GroupMemoryBarrierWithGroupSync();
-
-		multiplyTiles( thread );
 
-		GroupMemoryBarrierWithGroupSync();
+		// Now we have [ TILE_SIZE, THREADS_Y * 4 ] block from the first source tensor in the v0 vector,
+		// and [ THREADS_Y * 4 ] row from the second one in the v1 vector
+		// Multiply and accumulate.
+		acc = mad( v0, v1, acc );
 	}
 
-	const uint rem = arg0Size.x % TILE_HEIGHT;
+	// Handle the remainder columns, if any.
+	// When present, their count is in the [ 1 .. THREADS_Y * 4 - 1 ] interval
+	const uint rem = arg0Size.x % ( THREADS_Y * 4 );
 	if( rem != 0 )
 	{
-		for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+		float4 v0 = 0.0, v1 = 0.0;
+		[unroll]
+		for( uint j = 0; j < 4; j++, rsi += rsiInc )
 		{
-			float a;
-			[branch]
-			if( j < rem )
+			const uint x = ( j * THREADS_Y ) + thread.y;
+			if( x < rem )
 			{
-				a = arg0[ rsi0 ];
-				rsi0 += THREADS_Y * TILE_SIZE;
+				v0[ j ] = arg0[ rsi.x ];
+				v1[ j ] = arg1[ rsi.y ];
 			}
-			else
-				a = 0.0;
-			tile0[ j ][ thread.x ] = a;
 		}
+		acc = mad( v0, v1, acc );
+	}
 
-		if( threadFlat < TILE_HEIGHT )
+	// The threads of this group now collectively hold a [ TILE_SIZE, THREADS_Y * 4 ] block of partial sums in their local variables
+	// However, the group outputs at most [ TILE_SIZE ] elements, so a reduction is needed
+	float acc1 = hadd4( acc );
+	reductionBuffer[ thread.y ][ thread.x ] = acc1;
+	GroupMemoryBarrierWithGroupSync();
+
+	for( i = THREADS_Y / 2; i > 1; i /= 2 )
+	{
+		if( thread.y < i )
 		{
-			float b;
-			[branch]
-			if( threadFlat < rem )
-				b = arg1[ rsi1 ];
-			else
-				b = 0.0;
-			tile1[ threadFlat ] = b;
+			acc1 += reductionBuffer[ thread.y + i ][ thread.x ];
+			reductionBuffer[ thread.y ][ thread.x ] = acc1;
 		}
-
-		GroupMemoryBarrierWithGroupSync();
-
-		multiplyTiles( thread );
-
 		GroupMemoryBarrierWithGroupSync();
 	}
 
-	reduceOutput( thread );
+	if( thread.y != 0 )
+		return;
 
 	const uint resultPos = group.x * TILE_SIZE;
 	const uint outputSize = min( TILE_SIZE, resultSize.x - resultPos );
-	storeTile( threadFlat, uint4( resultPos, 0, layer ), outputSize );
+	if( thread.x >= outputSize )
+		return;
+
+	const uint4 resultPos4 = uint4( resultPos + thread.x, 0, layer );
+	const uint rdi = hadd4( resultPos4 * resultStrides );
+	result[ rdi ] = acc1 + reductionBuffer[ 1 ][ thread.x ];
 }
 \ No newline at end of file
diff --git a/ComputeShaders/mulMatTiledEx.hlsl b/ComputeShaders/mulMatTiledEx.hlsl
index 0f23da2..e6d6940 100644
--- a/ComputeShaders/mulMatTiledEx.hlsl
+++ b/ComputeShaders/mulMatTiledEx.hlsl
@@ -9,8 +9,9 @@ static const uint TILE_SIZE = 32;
 static const uint TILE_HEIGHT = 32;
 #endif
 #ifndef THREADS_Y
-static const uint THREADS_Y = 16;
+static const uint THREADS_Y = 8;
 #endif
+// The above values have the following constraint: TILE_SIZE = THREADS_Y * N * 4, where N is an integer
 
 #ifndef STREAM_SECOND_MATRIX
 #define STREAM_SECOND_MATRIX 1
@@ -37,9 +38,6 @@ cbuffer Constants: register( b0 )
 	uint4 resultStrides: packoffset( c5 );
 }
 
-// Accumulator for the output tile
-// That last `+1` helps a bit, I'm not sure why exactly but probebly because memory bank conflicts.
-groupshared float tileOutput[ TILE_SIZE ][ TILE_SIZE + 1 ];
 // A smaller tile loaded from the first source matrix
 groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ];
 #if !STREAM_SECOND_MATRIX
@@ -47,62 +45,91 @@ groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ];
 groupshared float tile1[ TILE_HEIGHT ][ TILE_SIZE ];
 #endif
 
+// Count of FP32 accumulators we need in every thread of the shader
+static const uint heightScalars = TILE_SIZE / THREADS_Y;
+// The local accumulators are float4 vectors, compute count of these vectors
+static const uint heightVectors = ( heightScalars + 3 ) / 4;
+
 #if STREAM_SECOND_MATRIX
-void multiplyTiles( const uint3 thread, uint rsi, const uint h )
+void multiplyTiles( const uint3 thread, uint rsi, const uint h, inout float4 acc[ heightVectors ] )
 {
-	uint2 i = uint2( thread.y, rsi );
-	const uint2 iInc = uint2( THREADS_Y, THREADS_Y );
-	for( ; i.x < TILE_SIZE; i += iInc )
+	uint4 rsi4 = rsi + uint4( 0, THREADS_Y, THREADS_Y * 2, THREADS_Y * 3 );
+	[unroll]
+	for( uint iv = 0; iv < heightVectors; iv++, rsi4 += THREADS_Y * 4 )
 	{
-		float r = 0.0;
-		uint2 j = uint2( 0, i.y );
-		const uint2 jInc = uint2( 1, TILE_SIZE );
-		for( ; j.x < h; j += jInc )
+		float4 r = 0.0;
+		uint4 rsiRow = rsi4;
+		for( uint j = 0; j < h; j++, rsiRow += TILE_SIZE )
 		{
-			float a = tile0[ j.x ][ thread.x ];
-			float b = arg1[ j.y ];
+			const float a = tile0[ j ][ thread.x ];
+			float4 b = 0.0;
+			[unroll]
+			for( uint k = 0; k < 4; k++ )
+			{
+				b[ k ] = arg1[ rsiRow[ k ] ];
+			}
 			r = mad( a, b, r );
 		}
-		tileOutput[ i.x ][ thread.x ] += r;
+		acc[ iv ] += r;
 	}
 }
 #else
-void multiplyTiles( const uint3 thread )
+void multiplyTiles( const uint3 thread, inout float4 acc[ heightVectors ] )
 {
-	for( uint row = thread.y; row < TILE_SIZE; row += THREADS_Y )
+	[unroll]
+	for( uint i = 0; i < heightVectors; i++ )
 	{
-		float r = 0.0;
+		float4 r = 0.0;
 		for( uint j = 0; j < TILE_HEIGHT; j++ )
 		{
-			float a = tile0[ j ][ thread.x ];
-			float b = tile1[ j ][ row ];
+			const float a = tile0[ j ][ thread.x ];
+			float4 b;
+			[unroll]
+			for( uint k = 0; k < 4; k++ )
+			{
+				const uint row = ( i * 4 + k ) * THREADS_Y + thread.y;
+				b[ k ] = tile1[ j ][ row ];
+			}
 			r = mad( a, b, r );
 		}
-		tileOutput[ row ][ thread.x ] += r;
+		acc[ i ] += r;
 	}
 }
 #endif
 
-void storeTile( const uint3 thread, const uint4 pos, const uint2 size )
+void storeTile( const uint3 thread, const uint4 pos, const uint2 size, in float4 acc[ heightVectors ] )
 {
 	if( thread.x >= size.x )
 		return;
+
 	const uint4 prod4 = pos * resultStrides;
 	const uint2 prod2 = prod4.xy + prod4.zw;
 	uint rdi = prod2.x + prod2.y;
 	rdi += resultStrides.y * thread.y;
-	rdi += thread.x;
-	for( uint i = thread.y; i < size.y; i += THREADS_Y, rdi += resultStrides.y * THREADS_Y )
-		result[ rdi ] = tileOutput[ i ][ thread.x ];
+	rdi += resultStrides.x * thread.x;
+
+	const uint4 offsets = THREADS_Y * uint4( 0, 1, 2, 3 );	//< a compile-time constant vector
+	uint4 rdi4 = resultStrides.y * offsets + rdi;
+
+	[unroll]
+	for( uint iv = 0; iv < heightVectors; iv++, rdi4 += resultStrides.y * THREADS_Y * 4 )
+	{
+		const float4 source = acc[ iv ];
+		[unroll]
+		for( uint k = 0; k < 4; k++ )
+		{
+			const uint i = ( iv * 4 + k ) * THREADS_Y + thread.y;
+			if( i < size.y )
+				result[ rdi4[ k ] ] = source[ k ];
+		}
+	}
 }
 
 [numthreads( TILE_SIZE, THREADS_Y, 1 )]
 void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID )
 {
 	uint i;
-	// Zero all 3 shared buffers
-	for( i = thread.y; i < TILE_SIZE; i += THREADS_Y )
-		tileOutput[ i ][ thread.x ] = 0.0;
+	// Zero all shared buffers
 	for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
 	{
 		tile0[ i ][ thread.x ] = 0.0;
@@ -110,6 +137,13 @@ void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID
 		tile1[ i ][ thread.x ] = 0.0;
 #endif
 	}
+	// Despite residing inside the GPU cores, the shared memory is still much slower than registers
+	// For this reason, this shader accumulates numbers in local variables, and only uses groupshared memory for tiles of the argument matrices.
+	float4 acc[ heightVectors ];
+	// Zero out the accumulators
+	[unroll]
+	for( i = 0; i < heightVectors; i++ )
+		acc[ i ] = 0.0;
 
 	const uint2 layer = uint2( group.z % resultSize.z, group.z / resultSize.z );
 
@@ -142,11 +176,11 @@ void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID
 		GroupMemoryBarrierWithGroupSync();
 
 #if STREAM_SECOND_MATRIX
-		multiplyTiles( thread, rsi1, TILE_HEIGHT );
+		multiplyTiles( thread, rsi1, TILE_HEIGHT, acc );
 		rsi1 += TILE_HEIGHT * TILE_SIZE;
 #else
 		// Multiply + accumulate the elements collected in the groupshared buffers
-		multiplyTiles( thread );
+		multiplyTiles( thread, acc );
 #endif
 		GroupMemoryBarrierWithGroupSync();
 	}
@@ -181,14 +215,14 @@ void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID
 
 		// Multiply + accumulate the elements collected in the groupshared buffers
 #if STREAM_SECOND_MATRIX
-		multiplyTiles( thread, rsi1, rem );
+		multiplyTiles( thread, rsi1, rem, acc );
 #else
-		multiplyTiles( thread );
+		multiplyTiles( thread, acc );
 #endif
 		GroupMemoryBarrierWithGroupSync();
 	}
 
 	const uint2 resultPos = group.xy * TILE_SIZE;
 	const uint2 outputSize = min( TILE_SIZE, resultSize.xy - resultPos );
-	storeTile( thread, uint4( resultPos, layer ), outputSize );
+	storeTile( thread, uint4( resultPos, layer ), outputSize, acc );
 }
 \ No newline at end of file
author	Konstantin <const@const.me>	2023-01-24 16:23:50 +0100
committer	Konstantin <const@const.me>	2023-01-24 16:23:50 +0100
commit	9d6fb00973218ebc397e15819be3964423b81f91 (patch)
tree	5337753bb204e827ae8d100cdb648ae61ee7889d
parent	43e91802254a9b649925fa66d2e15808943e5aaa (diff)