diff options
| author | Konstantin <const@const.me> | 2023-01-24 00:38:05 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-24 00:38:05 +0100 |
| commit | f575cf6987dd18c2bc45613dd002a71d438a1640 (patch) | |
| tree | ee6bf58c6d4df8c78235067e96bf624369fb639d | |
| parent | 1a52ce8301aa0f93c82cece3e3db0986beb3d41a (diff) | |
mulMatByRowTiled shader, further performance optimizations
| -rw-r--r-- | ComputeShaders/mulMatByRowTiled.hlsl | 129 |
1 file changed, 89 insertions, 40 deletions
diff --git a/ComputeShaders/mulMatByRowTiled.hlsl b/ComputeShaders/mulMatByRowTiled.hlsl index 11c7c18..98ba59a 100644 --- a/ComputeShaders/mulMatByRowTiled.hlsl +++ b/ComputeShaders/mulMatByRowTiled.hlsl @@ -1,5 +1,7 @@ // Matrix * row product, like [ E0, E1, E2, E3 ] * [ E0, 1, E2, E3 ] = [ E1, 1, E2, E3 ] // Dispatch [ ( E1 + TILE_Y - 1 ) / TILE_Y, E2, E3 ] thread groups of this shader +// This one here is the second most expensive shader in the model, after matrix*matrix product. +// Optimized heavily, as a result the readability ain't great. #ifndef TILE_Y static const uint TILE_Y = 64; @@ -25,22 +27,29 @@ cbuffer Constants: register( b0 ) uint4 resultStrides: packoffset( c5 ); } -groupshared float resTemp[ TILE_Y ][ THREADS_X ]; - inline uint hadd( uint2 vec ) { return vec.x + vec.y; } -[ numthreads( THREADS_X, THREADS_Y, 1 ) ] +// Count of FP32 accumulators we need in every thread of the shader +static const uint heightScalars = TILE_Y / THREADS_Y; +// The local accumulators are float4 vectors, compute count of these vectors +static const uint heightVectors = ( heightScalars + 3 ) / 4; + +groupshared float4 reductionBuffer[ heightVectors ][ THREADS_Y ][ THREADS_X ]; + +[numthreads( THREADS_X, THREADS_Y, 1 )] void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID, uint threadFlattenned : SV_GroupIndex ) { uint i; - // Zero out the shared buffer - for( i = thread.y; i < TILE_Y; i += THREADS_Y ) - resTemp[ i ][ thread.x ] = 0.0; - // Before the reduction at the end of this shader, each thread only loads/stores the [ thread.y + THREADS_Y * N ][ thread.x ] elements of the shared buffer, - // where N is an integer. That's why until the end, we don't need these thread sync instructions. + // Despite inside GPU cores, the shared memory is still much slower than registers + // For this reason, this shader accumulates numbers in local variables. Only uses groupshared buffer for the final reduction. 
+ float4 acc[ heightVectors ]; + // Zero out the accumulators + [unroll] + for( i = 0; i < heightVectors; i++ ) + acc[ i ] = 0.0; // Count of rows to compute in this thread group const uint height = min( TILE_Y, arg0Size.y - group.x * TILE_Y ); @@ -55,67 +64,107 @@ void main( uint3 group: SV_GroupID, uint3 thread : SV_GroupThreadID, uint thread const uint completeTiles = arg0Size.x / THREADS_X; // Each iteration of that loop loads THREADS_X elements from arg1, // a block of [ THREADS_X, height ] elements from arg0, - // and accumulates these dot products in the shared buffer + // and accumulates these dot products in the local variables for( uint t = 0; t < completeTiles; t++, s0 += THREADS_X * arg0Strides.x, s1 += THREADS_X * arg1Strides.x ) { // Load THREADS_X elements from arg1 const float v1 = arg1[ s1 ]; uint rsi = s0; - for( i = thread.y; i < height; i += THREADS_Y, rsi += arg0Strides.y * THREADS_Y ) + [unroll] + for( i = 0; i < heightVectors; i++ ) { - // Load THREADS_X elements from arg0 - const float v0 = arg0[ rsi ]; - // Multiply and accumulate in the shared buffer - float acc = resTemp[ i ][ thread.x ]; - acc = mad( v0, v1, acc ); - resTemp[ i ][ thread.x ] = acc; + float4 v0 = 0.0; + // Load up to 4*THREADS_X elements from arg0 + [unroll] + for( uint j = 0; j < 4; j++, rsi += arg0Strides.y * THREADS_Y ) + { + const uint y = ( i * 4 + j ) * THREADS_Y + thread.y; + [branch] + if( y < height ) + v0[ j ] = arg0[ rsi ]; + } + // Multiply + accumulate + acc[ i ] = mad( v0, v1, acc[ i ] ); } } const uint rem = arg0Size.x % THREADS_X; - if( rem != 0 ) + if( thread.x < rem ) { // E0 ain't a multiple of THREADS_X, we have a remainder - float v1; - if( thread.x < rem ) - v1 = arg1[ s1 ]; - else - v1 = 0.0; - for( i = thread.y; i < height; i += THREADS_Y, s0 += arg0Strides.y * THREADS_Y ) + // Load `rem` elements from arg1 + const float v1 = arg1[ s1 ]; + + [unroll] + for( i = 0; i < heightVectors; i++ ) { - if( thread.x >= rem ) - continue; - const float v0 = 
arg0[ s0 ]; - float acc = resTemp[ i ][ thread.x ]; - acc = mad( v0, v1, acc ); - resTemp[ i ][ thread.x ] = acc; + float4 v0 = 0.0; + // Load up to 4*rem elements from arg0 + [unroll] + for( uint j = 0; j < 4; j++, s0 += arg0Strides.y * THREADS_Y ) + { + const uint y = ( i * 4 + j ) * THREADS_Y + thread.y; + [branch] + if( y < height ) + v0[ j ] = arg0[ s0 ]; + } + // Multiply + accumulate + acc[ i ] = mad( v0, v1, acc[ i ] ); } } - // Now we need horizontal sum of these shared accumulators, reducing [height][THREADS_X] shared array into [height][1] column + // Now we need horizontal sum of these accumulators, reducing [height][THREADS_X] of them into [height][1] column + // First, store local variables into the shared memory. + [ unroll ] + for( i = 0; i < heightVectors; i++ ) + reductionBuffer[ i ][ thread.y ][ thread.x ] = acc[ i ]; GroupMemoryBarrierWithGroupSync(); - for( i = THREADS_X / 2; i > 0; i /= 2 ) + // Run reduction using that shared memory buffer + for( i = THREADS_X / 2; i > 1; i /= 2 ) { if( thread.x < i ) { - for( uint j = thread.y; j < height; j += THREADS_Y ) + [unroll] + for( uint iv = 0; iv < heightVectors; iv++ ) { - float sum = resTemp[ j ][ thread.x ]; - sum += resTemp[ j ][ thread.x + i ]; - resTemp[ j ][ thread.x ] = sum; + float4 that = reductionBuffer[ iv ][ thread.y ][ thread.x + i ]; + float4 tmp = acc[ iv ]; + tmp += that; + reductionBuffer[ iv ][ thread.y ][ thread.x ] = tmp; + acc[ iv ] = tmp; } } GroupMemoryBarrierWithGroupSync(); } - // And finally, store that column to global memory - if( threadFlattenned >= height ) + // And finally, store that column to global memory. 
+ // Only running that code on the threads of the group with thread.x = 0, to save a few loads from the groupshared buffer + // This allows to use registers instead, faster to access + if( thread.x != 0 ) return; - uint rdi = hadd( group.yz * resultStrides.zw ) + group.x * TILE_Y * resultStrides.x; - rdi += threadFlattenned * resultStrides.x; - result[ rdi ] = resTemp[ threadFlattenned ][ 0 ]; + uint rdi = hadd( group.yz * resultStrides.zw ); + rdi += ( group.x * TILE_Y + thread.y ) * resultStrides.x; + const uint rdiInc = THREADS_Y * resultStrides.x; + + [unroll] + for( i = 0; i < heightVectors; i++ ) + { + // The previous loop had "i > 1" continue condition, it didn't complete the last step of the reduction + // The following line is doing that last reduction step + const float4 resultVec = acc[ i ] + reductionBuffer[ i ][ thread.y ][ 1 ]; + + // Conditionally store these 4 floats to the output tensor + [unroll] + for( uint j = 0; j < 4; j++, rdi += rdiInc ) + { + const uint y = ( i * 4 + j ) * THREADS_Y + thread.y; + [branch] + if( y < height ) + result[ rdi ] = resultVec[ j ]; + } + } }
\ No newline at end of file |
