diff options
| author | Konstantin <const@const.me> | 2023-01-24 16:23:50 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-24 16:23:50 +0100 |
| commit | 9d6fb00973218ebc397e15819be3964423b81f91 (patch) | |
| tree | 5337753bb204e827ae8d100cdb648ae61ee7889d | |
| parent | 43e91802254a9b649925fa66d2e15808943e5aaa (diff) | |
GPU performance, matrix multiplication shaders
| -rw-r--r-- | ComputeShaders/mulMatByRowTiledEx.hlsl | 159 | ||||
| -rw-r--r-- | ComputeShaders/mulMatTiledEx.hlsl | 100 |
2 files changed, 130 insertions, 129 deletions
diff --git a/ComputeShaders/mulMatByRowTiledEx.hlsl b/ComputeShaders/mulMatByRowTiledEx.hlsl index d377b8c..bc0c0c6 100644 --- a/ComputeShaders/mulMatByRowTiledEx.hlsl +++ b/ComputeShaders/mulMatByRowTiledEx.hlsl @@ -2,9 +2,6 @@ #ifndef TILE_SIZE static const uint TILE_SIZE = 32; #endif -#ifndef TILE_HEIGHT -static const uint TILE_HEIGHT = 32; -#endif #ifndef THREADS_Y static const uint THREADS_Y = 16; #endif @@ -27,130 +24,100 @@ cbuffer Constants: register( b0 ) uint4 resultStrides: packoffset( c5 ); } -groupshared float tileOutput[ THREADS_Y ][ TILE_SIZE ]; -groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ]; -groupshared float tile1[ TILE_HEIGHT ]; - -void multiplyTiles( const uint3 thread ) +inline uint hadd4( const uint4 v ) { - float r = 0.0; - for( uint i = thread.y; i < TILE_HEIGHT; i += THREADS_Y ) - { - float a = tile0[ i ][ thread.x ]; - float b = tile1[ i ]; - r = mad( a, b, r ); - } - tileOutput[ thread.y ][ thread.x ] += r; + const uint2 v2 = v.xy + v.zw; + return v2.x + v2.y; } -void reduceOutput( const uint3 thread ) +inline float hadd4( const float4 v ) { - float curr = 0.0; - [branch] - if( thread.y < THREADS_Y / 2 ) - curr = tileOutput[ thread.y ][ thread.x ]; - - for( uint i = THREADS_Y / 2; i > 0; i /= 2 ) - { - [branch] - if( thread.y < i ) - { - curr += tileOutput[ thread.y + i ][ thread.x ]; - tileOutput[ thread.y ][ thread.x ] = curr; - } - GroupMemoryBarrierWithGroupSync(); - } + const float2 v2 = v.xy + v.zw; + return v2.x + v2.y; } -void storeTile( const uint threadFlat, const uint4 pos, const uint size ) -{ - if( threadFlat >= size ) - return; - const uint4 prod4 = pos * resultStrides; - const uint2 prod2 = prod4.xy + prod4.zw; - uint rdi = prod2.x + prod2.y; - result[ rdi + threadFlat ] = tileOutput[ 0 ][ threadFlat ]; -} +groupshared float reductionBuffer[ THREADS_Y ][ TILE_SIZE ]; -[ numthreads( TILE_SIZE, THREADS_Y, 1 ) ] -void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID, uint threadFlat : SV_GroupIndex ) +[numthreads( TILE_SIZE, THREADS_Y, 1 )] +void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID ) { - uint i; - // Zero all 3 shared buffers - tileOutput[ thread.y ][ thread.x ] = 0.0; - for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y ) - tile0[ i ][ thread.x ] = 0.0; - if( threadFlat < THREADS_Y ) - tile1[ threadFlat ] = 0.0; - const uint2 layer = group.yz; - uint rsi0 = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y; - uint rsi1 = layer.x * arg1Strides.z + layer.y * arg1Strides.w; + // Source offsets for the complete thread group + uint2 rsi; + rsi.x = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y; + rsi.y = layer.x * arg1Strides.z + layer.y * arg1Strides.w; + // Apply source offsets for this particular thread + rsi.x += thread.y * TILE_SIZE + thread.x; + rsi.y += thread.y * arg1Strides.x; - const uint threadOffset = thread.y * TILE_SIZE + thread.x; - rsi0 += threadOffset; - rsi1 += threadFlat * arg1Strides.x; + const uint2 rsiInc = uint2( THREADS_Y * TILE_SIZE, THREADS_Y * arg1Strides.x ); - const uint completeTiles = arg0Size.x / TILE_HEIGHT; + const uint completeTiles = arg0Size.x / ( THREADS_Y * 4 ); + uint i; + float4 acc = 0.0; for( i = 0; i < completeTiles; i++ ) { - // Load [ TILE_SIZE, TILE_HEIGHT ] block from the first source tensor into the groupshared buffer - for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y ) + // Each iteration of this loop consumes THREADS_Y*4 columns from the arg0 panel, and THREADS_Y*4 values from arg1 + float4 v0, v1; + [unroll] + for( uint j = 0; j < 4; j++, rsi += rsiInc ) { - tile0[ j ][ thread.x ] = arg0[ rsi0 ]; - rsi0 += THREADS_Y * TILE_SIZE; + // Load [ TILE_SIZE, THREADS_Y ] block from the first source tensor + v0[ j ] = arg0[ rsi.x ]; + // Broadcast [ THREADS_Y ] row from the second source tensor + v1[ j ] = arg1[ rsi.y ]; } - // Load [ TILE_HEIGHT ] row from the second source into another groupshared buffer - [ branch ] - if( threadFlat < TILE_HEIGHT ) - tile1[ threadFlat ] = arg1[ rsi1 ]; - rsi1 += TILE_HEIGHT * arg1Strides.x; - - GroupMemoryBarrierWithGroupSync(); - - multiplyTiles( thread ); - GroupMemoryBarrierWithGroupSync(); + // Now we have [ TILE_SIZE, THREADS_Y * 4 ] block from the first source tensor in the v0 vector, + // and [ THREADS_Y * 4 ] row from the second one in the v1 vector + // Multiply and accumulate. + acc = mad( v0, v1, acc ); } - const uint rem = arg0Size.x % TILE_HEIGHT; + // Handle the remainder columns, if any. + // When present, their count is in [ 1 .. THREADS_Y * 4 - 1 ] interval + const uint rem = arg0Size.x % ( THREADS_Y * 4 ); if( rem != 0 ) { - for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y ) + float4 v0 = 0.0, v1 = 0.0; + [unroll] + for( uint j = 0; j < 4; j++, rsi += rsiInc ) { - float a; - [branch] - if( j < rem ) + const uint x = ( j * THREADS_Y ) + thread.y; + if( x < rem ) { - a = arg0[ rsi0 ]; - rsi0 += THREADS_Y * TILE_SIZE; + v0[ j ] = arg0[ rsi.x ]; + v1[ j ] = arg1[ rsi.y ]; } - else - a = 0.0; - tile0[ j ][ thread.x ] = a; } + acc = mad( v0, v1, acc ); + } - if( threadFlat < TILE_HEIGHT ) + // We now have [ TILE_SIZE, THREADS_Y * 4 ] block in the local variables of this thread group + // The group however only outputs [ TILE_SIZE ] elements max, need a reduction + float acc1 = hadd4( acc ); + reductionBuffer[ thread.y ][ thread.x ] = acc1; + GroupMemoryBarrierWithGroupSync(); + + for( i = THREADS_Y / 2; i > 1; i /= 2 ) + { + if( thread.y < i ) { - float b; - [branch] - if( threadFlat < rem ) - b = arg1[ rsi1 ]; - else - b = 0.0; - tile1[ threadFlat ] = b; + acc1 += reductionBuffer[ thread.y + i ][ thread.x ]; + reductionBuffer[ thread.y ][ thread.x ] = acc1; } - - GroupMemoryBarrierWithGroupSync(); - - multiplyTiles( thread ); - GroupMemoryBarrierWithGroupSync(); } - reduceOutput( thread ); + if( thread.y != 0 ) + return; const uint resultPos = group.x * TILE_SIZE; const uint outputSize = min( TILE_SIZE, resultSize.x - resultPos ); - storeTile( threadFlat, uint4( resultPos, 0, layer ), outputSize ); + if( thread.x >= outputSize ) + return; + + const uint4 resultPos4 = uint4( resultPos + thread.x, 0, layer ); + const uint rdi = hadd4( resultPos4 * resultStrides ); + result[ rdi ] = acc1 + reductionBuffer[ 1 ][ thread.x ]; }
\ No newline at end of file diff --git a/ComputeShaders/mulMatTiledEx.hlsl b/ComputeShaders/mulMatTiledEx.hlsl index 0f23da2..e6d6940 100644 --- a/ComputeShaders/mulMatTiledEx.hlsl +++ b/ComputeShaders/mulMatTiledEx.hlsl @@ -9,8 +9,9 @@ static const uint TILE_SIZE = 32; static const uint TILE_HEIGHT = 32; #endif #ifndef THREADS_Y -static const uint THREADS_Y = 16; +static const uint THREADS_Y = 8; #endif +// The above values have a following constraint: TILE_SIZE = THREADS_Y * N * 4 where N is an integer #ifndef STREAM_SECOND_MATRIX #define STREAM_SECOND_MATRIX 1 @@ -37,9 +38,6 @@ cbuffer Constants: register( b0 ) uint4 resultStrides: packoffset( c5 ); } -// Accumulator for the output tile -// That last `+1` helps a bit, I'm not sure why exactly but probebly because memory bank conflicts. -groupshared float tileOutput[ TILE_SIZE ][ TILE_SIZE + 1 ]; // A smaller tile loaded from the first source matrix groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ]; #if !STREAM_SECOND_MATRIX @@ -47,62 +45,91 @@ groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ]; groupshared float tile1[ TILE_HEIGHT ][ TILE_SIZE ]; #endif +// Count of FP32 accumulators we need in every thread of the shader +static const uint heightScalars = TILE_SIZE / THREADS_Y; +// The local accumulators are float4 vectors, compute count of these vectors +static const uint heightVectors = ( heightScalars + 3 ) / 4; + #if STREAM_SECOND_MATRIX -void multiplyTiles( const uint3 thread, uint rsi, const uint h ) +void multiplyTiles( const uint3 thread, uint rsi, const uint h, inout float4 acc[ heightVectors ] ) { - uint2 i = uint2( thread.y, rsi ); - const uint2 iInc = uint2( THREADS_Y, THREADS_Y ); - for( ; i.x < TILE_SIZE; i += iInc ) + uint4 rsi4 = rsi + uint4( 0, THREADS_Y, THREADS_Y * 2, THREADS_Y * 3 ); + [unroll] + for( uint iv = 0; iv < heightVectors; iv++, rsi4 += THREADS_Y * 4 ) { - float r = 0.0; - uint2 j = uint2( 0, i.y ); - const uint2 jInc = uint2( 1, TILE_SIZE ); - for( ; j.x < h; j += jInc ) + float4 r = 0.0; + uint4 rsiRow = rsi4; + for( uint j = 0; j < h; j++, rsiRow += TILE_SIZE ) { - float a = tile0[ j.x ][ thread.x ]; - float b = arg1[ j.y ]; + const float a = tile0[ j ][ thread.x ]; + float4 b = 0.0; + [unroll] + for( uint k = 0; k < 4; k++ ) + { + b[ k ] = arg1[ rsiRow[ k ] ]; + } r = mad( a, b, r ); } - tileOutput[ i.x ][ thread.x ] += r; + acc[ iv ] += r; } } #else -void multiplyTiles( const uint3 thread ) +void multiplyTiles( const uint3 thread, inout float4 acc[ heightVectors ] ) { - for( uint row = thread.y; row < TILE_SIZE; row += THREADS_Y ) + [unroll] + for( uint i = 0; i < heightVectors; i++ ) { - float r = 0.0; + float4 r = 0.0; for( uint j = 0; j < TILE_HEIGHT; j++ ) { - float a = tile0[ j ][ thread.x ]; - float b = tile1[ j ][ row ]; + const float a = tile0[ j ][ thread.x ]; + float4 b; + [unroll] + for( uint k = 0; k < 4; k++ ) + { + const uint row = ( i * 4 + k ) * THREADS_Y + thread.y; + b[ k ] = tile1[ j ][ row ]; + } r = mad( a, b, r ); } - tileOutput[ row ][ thread.x ] += r; + acc[ i ] += r; } } #endif -void storeTile( const uint3 thread, const uint4 pos, const uint2 size ) +void storeTile( const uint3 thread, const uint4 pos, const uint2 size, in float4 acc[ heightVectors ] ) { if( thread.x >= size.x ) return; + const uint4 prod4 = pos * resultStrides; const uint2 prod2 = prod4.xy + prod4.zw; uint rdi = prod2.x + prod2.y; rdi += resultStrides.y * thread.y; - rdi += thread.x; - for( uint i = thread.y; i < size.y; i += THREADS_Y, rdi += resultStrides.y * THREADS_Y ) - result[ rdi ] = tileOutput[ i ][ thread.x ]; + rdi += resultStrides.x * thread.x; + + const uint4 offsets = THREADS_Y * uint4( 0, 1, 2, 3 ); //< a compile-time constant vector + uint4 rdi4 = resultStrides.y * offsets + rdi; + + [unroll] + for( uint iv = 0; iv < heightVectors; iv++, rdi4 += resultStrides.y * THREADS_Y * 4 ) + { + const float4 source = acc[ iv ]; + [unroll] + for( uint k = 0; k < 4; k++ ) + { + const uint i = ( iv * 4 + k ) * THREADS_Y + thread.y; + if( i < size.y ) + result[ rdi4[ k ] ] = source[ k ]; + } + } } [numthreads( TILE_SIZE, THREADS_Y, 1 )] void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID ) { uint i; - // Zero all 3 shared buffers - for( i = thread.y; i < TILE_SIZE; i += THREADS_Y ) - tileOutput[ i ][ thread.x ] = 0.0; + // Zero all shared buffers for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y ) { tile0[ i ][ thread.x ] = 0.0; @@ -110,6 +137,13 @@ void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID tile1[ i ][ thread.x ] = 0.0; #endif } + // Despite inside GPU cores, the shared memory is still much slower than registers + // For this reason, this shader accumulates numbers in local variables. Only uses groupshared memory for tiles of the argument matrices. + float4 acc[ heightVectors ]; + // Zero out the accumulators + [unroll] + for( i = 0; i < heightVectors; i++ ) + acc[ i ] = 0.0; const uint2 layer = uint2( group.z % resultSize.z, group.z / resultSize.z ); @@ -142,11 +176,11 @@ void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID GroupMemoryBarrierWithGroupSync(); #if STREAM_SECOND_MATRIX - multiplyTiles( thread, rsi1, TILE_HEIGHT ); + multiplyTiles( thread, rsi1, TILE_HEIGHT, acc ); rsi1 += TILE_HEIGHT * TILE_SIZE; #else // Multiply + accumulate the elements collected in the groupshared buffers - multiplyTiles( thread ); + multiplyTiles( thread, acc ); #endif GroupMemoryBarrierWithGroupSync(); } @@ -181,14 +215,14 @@ void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID // Multiply + accumulate the elements collected in the groupshared buffers #if STREAM_SECOND_MATRIX - multiplyTiles( thread, rsi1, rem ); + multiplyTiles( thread, rsi1, rem, acc ); #else - multiplyTiles( thread ); + multiplyTiles( thread, acc ); #endif GroupMemoryBarrierWithGroupSync(); } const uint2 resultPos = group.xy * TILE_SIZE; const uint2 outputSize = min( TILE_SIZE, resultSize.xy - resultPos ); - storeTile( thread, uint4( resultPos, layer ), outputSize ); + storeTile( thread, uint4( resultPos, layer ), outputSize, acc ); }
\ No newline at end of file |
