summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Konstantin <const@const.me> 2023-01-24 00:38:05 +0100
committer Konstantin <const@const.me> 2023-01-24 00:38:05 +0100
commit f575cf6987dd18c2bc45613dd002a71d438a1640 (patch)
tree ee6bf58c6d4df8c78235067e96bf624369fb639d
parent 1a52ce8301aa0f93c82cece3e3db0986beb3d41a (diff)
mulMatByRowTiled shader, further performance optimizations
-rw-r--r-- ComputeShaders/mulMatByRowTiled.hlsl 129
1 files changed, 89 insertions, 40 deletions
diff --git a/ComputeShaders/mulMatByRowTiled.hlsl b/ComputeShaders/mulMatByRowTiled.hlsl
index 11c7c18..98ba59a 100644
--- a/ComputeShaders/mulMatByRowTiled.hlsl
+++ b/ComputeShaders/mulMatByRowTiled.hlsl
@@ -1,5 +1,7 @@
// Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ]
// Dispatch [ ( E1 + TILE_Y - 1 ) / TILE_Y, E2, E3 ] thread groups of this shader
+// This one here is the second most expensive shader in the model, after matrix*matrix product.
+// Optimized heavily, as a result the readability ain't great.
#ifndef TILE_Y
static const uint TILE_Y = 64;
@@ -25,22 +27,29 @@ cbuffer Constants: register( b0 )
uint4 resultStrides: packoffset( c5 );
}
-groupshared float resTemp[ TILE_Y ][ THREADS_X ];
-
inline uint hadd( uint2 vec )
{
return vec.x + vec.y;
}
-[ numthreads( THREADS_X, THREADS_Y, 1 ) ]
+// Count of FP32 accumulators we need in every thread of the shader
+static const uint heightScalars = TILE_Y / THREADS_Y;
+// The local accumulators are float4 vectors, compute count of these vectors
+static const uint heightVectors = ( heightScalars + 3 ) / 4;
+
+groupshared float4 reductionBuffer[ heightVectors ][ THREADS_Y ][ THREADS_X ];
+
+[numthreads( THREADS_X, THREADS_Y, 1 )]
void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID, uint threadFlattenned : SV_GroupIndex )
{
uint i;
- // Zero out the shared buffer
- for( i = thread.y; i < TILE_Y; i += THREADS_Y )
- resTemp[ i ][ thread.x ] = 0.0;
- // Before the reduction at the end of this shader, each thread only loads/stores the [ thread.y + THREADS_Y * N ][ thread.x ] elements of the shared buffer,
- // where N is an integer. That's why until the end, we don't need these thread sync instructions.
+ // Despite being inside GPU cores, the shared memory is still much slower than registers.
+ // For this reason, this shader accumulates numbers in local variables, and only uses the groupshared buffer for the final reduction.
+ float4 acc[ heightVectors ];
+ // Zero out the accumulators
+ [unroll]
+ for( i = 0; i < heightVectors; i++ )
+ acc[ i ] = 0.0;
// Count of rows to compute in this thread group
const uint height = min( TILE_Y, arg0Size.y - group.x * TILE_Y );
@@ -55,67 +64,107 @@ void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID, uint thread
const uint completeTiles = arg0Size.x / THREADS_X;
// Each iteration of that loop loads THREADS_X elements from arg1,
// a block of [ THREADS_X, height ] elements from arg0,
- // and accumulates these dot products in the shared buffer
+ // and accumulates these dot products in the local variables
for( uint t = 0; t < completeTiles; t++, s0 += THREADS_X * arg0Strides.x, s1 += THREADS_X * arg1Strides.x )
{
// Load THREADS_X elements from arg1
const float v1 = arg1[ s1 ];
uint rsi = s0;
- for( i = thread.y; i < height; i += THREADS_Y, rsi += arg0Strides.y * THREADS_Y )
+ [unroll]
+ for( i = 0; i < heightVectors; i++ )
{
- // Load THREADS_X elements from arg0
- const float v0 = arg0[ rsi ];
- // Multiply and accumulate in the shared buffer
- float acc = resTemp[ i ][ thread.x ];
- acc = mad( v0, v1, acc );
- resTemp[ i ][ thread.x ] = acc;
+ float4 v0 = 0.0;
+ // Load up to 4*THREADS_X elements from arg0
+ [unroll]
+ for( uint j = 0; j < 4; j++, rsi += arg0Strides.y * THREADS_Y )
+ {
+ const uint y = ( i * 4 + j ) * THREADS_Y + thread.y;
+ [branch]
+ if( y < height )
+ v0[ j ] = arg0[ rsi ];
+ }
+ // Multiply + accumulate
+ acc[ i ] = mad( v0, v1, acc[ i ] );
}
}
const uint rem = arg0Size.x % THREADS_X;
- if( rem != 0 )
+ if( thread.x < rem )
{
// E0 ain't a multiple of THREADS_X, we have a remainder
- float v1;
- if( thread.x < rem )
- v1 = arg1[ s1 ];
- else
- v1 = 0.0;
- for( i = thread.y; i < height; i += THREADS_Y, s0 += arg0Strides.y * THREADS_Y )
+ // Load `rem` elements from arg1
+ const float v1 = arg1[ s1 ];
+
+ [unroll]
+ for( i = 0; i < heightVectors; i++ )
{
- if( thread.x >= rem )
- continue;
- const float v0 = arg0[ s0 ];
- float acc = resTemp[ i ][ thread.x ];
- acc = mad( v0, v1, acc );
- resTemp[ i ][ thread.x ] = acc;
+ float4 v0 = 0.0;
+ // Load up to 4*rem elements from arg0
+ [unroll]
+ for( uint j = 0; j < 4; j++, s0 += arg0Strides.y * THREADS_Y )
+ {
+ const uint y = ( i * 4 + j ) * THREADS_Y + thread.y;
+ [branch]
+ if( y < height )
+ v0[ j ] = arg0[ s0 ];
+ }
+ // Multiply + accumulate
+ acc[ i ] = mad( v0, v1, acc[ i ] );
}
}
- // Now we need horizontal sum of these shared accumulators, reducing [height][THREADS_X] shared array into [height][1] column
+ // Now we need a horizontal sum of these accumulators, reducing [height][THREADS_X] of them into a [height][1] column
+ // First, store local variables into the shared memory.
+ [ unroll ]
+ for( i = 0; i < heightVectors; i++ )
+ reductionBuffer[ i ][ thread.y ][ thread.x ] = acc[ i ];
GroupMemoryBarrierWithGroupSync();
- for( i = THREADS_X / 2; i > 0; i /= 2 )
+ // Run reduction using that shared memory buffer
+ for( i = THREADS_X / 2; i > 1; i /= 2 )
{
if( thread.x < i )
{
- for( uint j = thread.y; j < height; j += THREADS_Y )
+ [unroll]
+ for( uint iv = 0; iv < heightVectors; iv++ )
{
- float sum = resTemp[ j ][ thread.x ];
- sum += resTemp[ j ][ thread.x + i ];
- resTemp[ j ][ thread.x ] = sum;
+ float4 that = reductionBuffer[ iv ][ thread.y ][ thread.x + i ];
+ float4 tmp = acc[ iv ];
+ tmp += that;
+ reductionBuffer[ iv ][ thread.y ][ thread.x ] = tmp;
+ acc[ iv ] = tmp;
}
}
GroupMemoryBarrierWithGroupSync();
}
- // And finally, store that column to global memory
- if( threadFlattenned >= height )
+ // And finally, store that column to global memory.
+ // This code only runs on the threads of the group with thread.x = 0, to save a few loads from the groupshared buffer.
+ // This allows using registers instead, which are faster to access.
+ if( thread.x != 0 )
return;
- uint rdi = hadd( group.yz * resultStrides.zw ) + group.x * TILE_Y * resultStrides.x;
- rdi += threadFlattenned * resultStrides.x;
- result[ rdi ] = resTemp[ threadFlattenned ][ 0 ];
+ uint rdi = hadd( group.yz * resultStrides.zw );
+ rdi += ( group.x * TILE_Y + thread.y ) * resultStrides.x;
+ const uint rdiInc = THREADS_Y * resultStrides.x;
+
+ [unroll]
+ for( i = 0; i < heightVectors; i++ )
+ {
+ // The previous loop had an "i > 1" continue condition, so it didn't complete the last step of the reduction.
+ // The following line performs that last reduction step.
+ const float4 resultVec = acc[ i ] + reductionBuffer[ i ][ thread.y ][ 1 ];
+
+ // Conditionally store these 4 floats to the output tensor
+ [unroll]
+ for( uint j = 0; j < 4; j++, rdi += rdiInc )
+ {
+ const uint y = ( i * 4 + j ) * THREADS_Y + thread.y;
+ [branch]
+ if( y < height )
+ result[ rdi ] = resultVec[ j ];
+ }
+ }
} \ No newline at end of file