summary | refs | log | tree | commit | diff | stats
path: root/ComputeShaders/mulMatByRowTiledEx.hlsl
diff options
context:
space:
mode:
author: Konstantin <const@const.me>, 2023-01-24 16:23:50 +0100
committer: Konstantin <const@const.me>, 2023-01-24 16:23:50 +0100
commit: 9d6fb00973218ebc397e15819be3964423b81f91 (patch)
tree: 5337753bb204e827ae8d100cdb648ae61ee7889d /ComputeShaders/mulMatByRowTiledEx.hlsl
parent: 43e91802254a9b649925fa66d2e15808943e5aaa (diff)
GPU performance, matrix multiplication shaders
Diffstat (limited to 'ComputeShaders/mulMatByRowTiledEx.hlsl')
-rw-r--r-- ComputeShaders/mulMatByRowTiledEx.hlsl 159
1 files changed, 63 insertions, 96 deletions
diff --git a/ComputeShaders/mulMatByRowTiledEx.hlsl b/ComputeShaders/mulMatByRowTiledEx.hlsl
index d377b8c..bc0c0c6 100644
--- a/ComputeShaders/mulMatByRowTiledEx.hlsl
+++ b/ComputeShaders/mulMatByRowTiledEx.hlsl
@@ -2,9 +2,6 @@
#ifndef TILE_SIZE
static const uint TILE_SIZE = 32;
#endif
-#ifndef TILE_HEIGHT
-static const uint TILE_HEIGHT = 32;
-#endif
#ifndef THREADS_Y
static const uint THREADS_Y = 16;
#endif
@@ -27,130 +24,100 @@ cbuffer Constants: register( b0 )
uint4 resultStrides: packoffset( c5 );
}
-groupshared float tileOutput[ THREADS_Y ][ TILE_SIZE ];
-groupshared float tile0[ TILE_HEIGHT ][ TILE_SIZE ];
-groupshared float tile1[ TILE_HEIGHT ];
-
-void multiplyTiles( const uint3 thread )
+inline uint hadd4( const uint4 v )
{
- float r = 0.0;
- for( uint i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
- {
- float a = tile0[ i ][ thread.x ];
- float b = tile1[ i ];
- r = mad( a, b, r );
- }
- tileOutput[ thread.y ][ thread.x ] += r;
+ const uint2 v2 = v.xy + v.zw;
+ return v2.x + v2.y;
}
-void reduceOutput( const uint3 thread )
+inline float hadd4( const float4 v )
{
- float curr = 0.0;
- [branch]
- if( thread.y < THREADS_Y / 2 )
- curr = tileOutput[ thread.y ][ thread.x ];
-
- for( uint i = THREADS_Y / 2; i > 0; i /= 2 )
- {
- [branch]
- if( thread.y < i )
- {
- curr += tileOutput[ thread.y + i ][ thread.x ];
- tileOutput[ thread.y ][ thread.x ] = curr;
- }
- GroupMemoryBarrierWithGroupSync();
- }
+ const float2 v2 = v.xy + v.zw;
+ return v2.x + v2.y;
}
-void storeTile( const uint threadFlat, const uint4 pos, const uint size )
-{
- if( threadFlat >= size )
- return;
- const uint4 prod4 = pos * resultStrides;
- const uint2 prod2 = prod4.xy + prod4.zw;
- uint rdi = prod2.x + prod2.y;
- result[ rdi + threadFlat ] = tileOutput[ 0 ][ threadFlat ];
-}
+groupshared float reductionBuffer[ THREADS_Y ][ TILE_SIZE ];
-[ numthreads( TILE_SIZE, THREADS_Y, 1 ) ]
-void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID, uint threadFlat : SV_GroupIndex )
+[numthreads( TILE_SIZE, THREADS_Y, 1 )]
+void main( const uint3 group: SV_GroupID, const uint3 thread : SV_GroupThreadID )
{
- uint i;
- // Zero all 3 shared buffers
- tileOutput[ thread.y ][ thread.x ] = 0.0;
- for( i = thread.y; i < TILE_HEIGHT; i += THREADS_Y )
- tile0[ i ][ thread.x ] = 0.0;
- if( threadFlat < THREADS_Y )
- tile1[ threadFlat ] = 0.0;
-
const uint2 layer = group.yz;
- uint rsi0 = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y;
- uint rsi1 = layer.x * arg1Strides.z + layer.y * arg1Strides.w;
+ // Source offsets for the complete thread group
+ uint2 rsi;
+ rsi.x = group.x * arg0panel + layer.x * arg0LayerStrides.x + layer.y * arg0LayerStrides.y;
+ rsi.y = layer.x * arg1Strides.z + layer.y * arg1Strides.w;
+ // Apply source offsets for this particular thread
+ rsi.x += thread.y * TILE_SIZE + thread.x;
+ rsi.y += thread.y * arg1Strides.x;
- const uint threadOffset = thread.y * TILE_SIZE + thread.x;
- rsi0 += threadOffset;
- rsi1 += threadFlat * arg1Strides.x;
+ const uint2 rsiInc = uint2( THREADS_Y * TILE_SIZE, THREADS_Y * arg1Strides.x );
- const uint completeTiles = arg0Size.x / TILE_HEIGHT;
+ const uint completeTiles = arg0Size.x / ( THREADS_Y * 4 );
+ uint i;
+ float4 acc = 0.0;
for( i = 0; i < completeTiles; i++ )
{
- // Load [ TILE_SIZE, TILE_HEIGHT ] block from the first source tensor into the groupshared buffer
- for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+ // Each iteration of this loop consumes THREADS_Y*4 columns from the arg0 panel, and THREADS_Y*4 values from arg1
+ float4 v0, v1;
+ [unroll]
+ for( uint j = 0; j < 4; j++, rsi += rsiInc )
{
- tile0[ j ][ thread.x ] = arg0[ rsi0 ];
- rsi0 += THREADS_Y * TILE_SIZE;
+ // Load [ TILE_SIZE, THREADS_Y ] block from the first source tensor
+ v0[ j ] = arg0[ rsi.x ];
+ // Broadcast [ THREADS_Y ] row from the second source tensor
+ v1[ j ] = arg1[ rsi.y ];
}
- // Load [ TILE_HEIGHT ] row from the second source into another groupshared buffer
- [ branch ]
- if( threadFlat < TILE_HEIGHT )
- tile1[ threadFlat ] = arg1[ rsi1 ];
- rsi1 += TILE_HEIGHT * arg1Strides.x;
-
- GroupMemoryBarrierWithGroupSync();
-
- multiplyTiles( thread );
- GroupMemoryBarrierWithGroupSync();
+ // Now we have [ TILE_SIZE, THREADS_Y * 4 ] block from the first source tensor in the v0 vector,
+ // and [ THREADS_Y * 4 ] row from the second one in the v1 vector
+ // Multiply and accumulate.
+ acc = mad( v0, v1, acc );
}
- const uint rem = arg0Size.x % TILE_HEIGHT;
+ // Handle the remainder columns, if any.
+ // When present, their count is in [ 1 .. THREADS_Y * 4 - 1 ] interval
+ const uint rem = arg0Size.x % ( THREADS_Y * 4 );
if( rem != 0 )
{
- for( uint j = thread.y; j < TILE_HEIGHT; j += THREADS_Y )
+ float4 v0 = 0.0, v1 = 0.0;
+ [unroll]
+ for( uint j = 0; j < 4; j++, rsi += rsiInc )
{
- float a;
- [branch]
- if( j < rem )
+ const uint x = ( j * THREADS_Y ) + thread.y;
+ if( x < rem )
{
- a = arg0[ rsi0 ];
- rsi0 += THREADS_Y * TILE_SIZE;
+ v0[ j ] = arg0[ rsi.x ];
+ v1[ j ] = arg1[ rsi.y ];
}
- else
- a = 0.0;
- tile0[ j ][ thread.x ] = a;
}
+ acc = mad( v0, v1, acc );
+ }
- if( threadFlat < TILE_HEIGHT )
+ // We now have [ TILE_SIZE, THREADS_Y * 4 ] block in the local variables of this thread group
+ // The group however only outputs [ TILE_SIZE ] elements max, need a reduction
+ float acc1 = hadd4( acc );
+ reductionBuffer[ thread.y ][ thread.x ] = acc1;
+ GroupMemoryBarrierWithGroupSync();
+
+ for( i = THREADS_Y / 2; i > 1; i /= 2 )
+ {
+ if( thread.y < i )
{
- float b;
- [branch]
- if( threadFlat < rem )
- b = arg1[ rsi1 ];
- else
- b = 0.0;
- tile1[ threadFlat ] = b;
+ acc1 += reductionBuffer[ thread.y + i ][ thread.x ];
+ reductionBuffer[ thread.y ][ thread.x ] = acc1;
}
-
- GroupMemoryBarrierWithGroupSync();
-
- multiplyTiles( thread );
-
GroupMemoryBarrierWithGroupSync();
}
- reduceOutput( thread );
+ if( thread.y != 0 )
+ return;
const uint resultPos = group.x * TILE_SIZE;
const uint outputSize = min( TILE_SIZE, resultSize.x - resultPos );
- storeTile( threadFlat, uint4( resultPos, 0, layer ), outputSize );
+ if( thread.x >= outputSize )
+ return;
+
+ const uint4 resultPos4 = uint4( resultPos + thread.x, 0, layer );
+ const uint rdi = hadd4( resultPos4 * resultStrides );
+ result[ rdi ] = acc1 + reductionBuffer[ 1 ][ thread.x ];
} \ No newline at end of file