diff options
| author | Konstantin <const@const.me> | 2023-01-23 14:38:12 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-23 14:38:12 +0100 |
| commit | 27dfc3428a7016e2d05dd67b6d8b88c0b982baa9 (patch) | |
| tree | f969d54ebfb266ecf61285a039295a1da37200a0 | |
| parent | 01aba39f15a03ed96e034ffc3b6ee9ec12294b0d (diff) | |
Performance improvement, `softMax` shader
| -rw-r--r-- | ComputeShaders/ComputeShaders.vcxproj | 1 | ||||
| -rw-r--r-- | ComputeShaders/ComputeShaders.vcxproj.filters | 1 | ||||
| -rw-r--r-- | ComputeShaders/softMax.hlsl | 61 | ||||
| -rw-r--r-- | ComputeShaders/softMax64.hlsl | 73 | ||||
| -rw-r--r-- | ComputeShaders/softMaxLong.hlsl | 6 | ||||
| -rw-r--r-- | Whisper/D3D/shaderNames.cpp | 3 | ||||
| -rw-r--r-- | Whisper/D3D/shaderNames.h | 3 | ||||
| -rw-r--r-- | Whisper/ML/MlContext.cpp | 10 |
8 files changed, 68 insertions, 90 deletions
diff --git a/ComputeShaders/ComputeShaders.vcxproj b/ComputeShaders/ComputeShaders.vcxproj index 1d9343d..300974a 100644 --- a/ComputeShaders/ComputeShaders.vcxproj +++ b/ComputeShaders/ComputeShaders.vcxproj @@ -202,6 +202,7 @@ <FxCompile Include="softMax64.hlsl" /> <FxCompile Include="softMaxCompat.hlsl" /> <FxCompile Include="softMaxFixed.hlsl" /> + <FxCompile Include="softMaxLong.hlsl" /> <FxCompile Include="zeroMemory.hlsl" /> </ItemGroup> <ItemGroup> diff --git a/ComputeShaders/ComputeShaders.vcxproj.filters b/ComputeShaders/ComputeShaders.vcxproj.filters index b827710..de9f12f 100644 --- a/ComputeShaders/ComputeShaders.vcxproj.filters +++ b/ComputeShaders/ComputeShaders.vcxproj.filters @@ -51,6 +51,7 @@ <FxCompile Include="matReshapePanels.hlsl" /> <FxCompile Include="mulMatByRowTiledEx.hlsl" /> <FxCompile Include="addRepeatEx.hlsl" /> + <FxCompile Include="softMaxLong.hlsl" /> </ItemGroup> <ItemGroup> <None Include="componentwiseBinaryOp.hlsli" /> diff --git a/ComputeShaders/softMax.hlsl b/ComputeShaders/softMax.hlsl index 6ebd0f2..259e457 100644 --- a/ComputeShaders/softMax.hlsl +++ b/ComputeShaders/softMax.hlsl @@ -1,9 +1,6 @@ // Dispatch [ nr, 1, 1 ] thread groups of this shader RWBuffer<float> result: register( u0 ); -// table_exp_f16 -Buffer<uint> lookupTable: register( t0 ); - cbuffer Constants: register( b0 ) { uint4 elements: packoffset( c0 ); @@ -12,12 +9,50 @@ cbuffer Constants: register( b0 ) float inputScale: packoffset( c2.y ); } -#include "miscUtils.hlsli" -#include "groupReduce.hlsli" +#ifndef THREADS +static const uint THREADS = 32; +#endif + +groupshared float sharedAccumulators[ THREADS ]; + +// Compute horizontal maximum of the numbers, and broadcast to all threads of the group. 
+void horizontalMaxBroadcast( const uint thread, inout float ax ) +{ + sharedAccumulators[ thread ] = ax; + for( uint i = THREADS / 2; i > 0; i /= 2 ) + { + GroupMemoryBarrierWithGroupSync(); + if( thread < i ) + { + ax = max( ax, sharedAccumulators[ thread + i ] ); + sharedAccumulators[ thread ] = ax; + } + } + GroupMemoryBarrierWithGroupSync(); + ax = sharedAccumulators[ 0 ]; +} + +// Compute horizontal sum of the numbers. The result is only correct on the thread #0 of the group. +void horizontalSum( const uint thread, inout float sum ) +{ + sharedAccumulators[ thread ] = sum; + for( uint i = THREADS / 2; i > 1; i /= 2 ) + { + GroupMemoryBarrierWithGroupSync(); + if( thread < i ) + { + sum += sharedAccumulators[ thread + i ]; + sharedAccumulators[ thread ] = sum; + } + } + GroupMemoryBarrierWithGroupSync(); + if( 0 == thread ) + sum += sharedAccumulators[ 1 ]; +} static const float negativeInfinity = asfloat( 0xff800000 ); -[ numthreads( 32, 1, 1 ) ] +[numthreads( THREADS, 1, 1 )] void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) { const uint p = group.x * strides[ 1 ]; @@ -26,12 +61,12 @@ void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) uint i; float m = negativeInfinity; - for( i = p + thread; i < pEnd; i += 32 ) + for( i = p + thread; i < pEnd; i += THREADS ) m = max( m, result[ i ] ); horizontalMaxBroadcast( thread, m ); float sum = 0; - for( i = p + thread; i < pEnd; i += 32 ) + for( i = p + thread; i < pEnd; i += THREADS ) { float f = result[ i ]; @@ -39,14 +74,8 @@ void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) if( f != negativeInfinity ) { f = ( f - m ) * inputScale; -#if 1 - // Similar to Radeon Graphics, computing the exponent on nVidia 1080Ti is also slightly faster than loading from the lookup table + // On both Radeon Graphics and nVidia 1080Ti, computing the exponent is slightly faster than loading from the lookup table f = exp( f ); -#else - uint s = fp16Rounded( f ); - s = lookupTable[ s ]; - f = 
f16tof32( s ); -#endif sum += f; } else @@ -62,7 +91,7 @@ void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) const float scale = sharedAccumulators[ 0 ]; // ggml_vec_scale_f32 - for( i = p + thread; i < pEnd; i += 32 ) + for( i = p + thread; i < pEnd; i += THREADS ) { float f = result[ i ]; f *= scale; diff --git a/ComputeShaders/softMax64.hlsl b/ComputeShaders/softMax64.hlsl index 7ecd2ef..f73551e 100644 --- a/ComputeShaders/softMax64.hlsl +++ b/ComputeShaders/softMax64.hlsl @@ -1,71 +1,2 @@ -// Dispatch [ nr, 1, 1 ] thread groups of this shader -RWBuffer<float> result: register( u0 ); - -// table_exp_f16 -Buffer<uint> lookupTable: register( t0 ); - -cbuffer Constants: register( b0 ) -{ - uint4 elements: packoffset( c0 ); - uint4 strides: packoffset( c1 ); - uint nr: packoffset( c2.x ); - float inputScale: packoffset( c2.y ); -} - -#include "miscUtils.hlsli" -#include "groupReduce64.hlsli" - -static const float negativeInfinity = asfloat( 0xff800000 ); - -[ numthreads( 64, 1, 1 ) ] -void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex ) -{ - const uint p = group.x * strides[ 1 ]; - const uint nc = elements[ 0 ]; - const uint pEnd = p + nc; - uint i; - - float m = negativeInfinity; - for( i = p + thread; i < pEnd; i += 64 ) - m = max( m, result[ i ] ); - horizontalMaxBroadcast( thread, m ); - - float sum = 0; - for( i = p + thread; i < pEnd; i += 64 ) - { - float f = result[ i ]; - - [branch] - if( f != negativeInfinity ) - { - f = ( f - m ) * inputScale; -#if 1 - // At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance - f = exp( f ); -#else - uint s = fp16Rounded( f ); - s = lookupTable[ s ]; - f = f16tof32( s ); -#endif - sum += f; - } - else - f = 0; - - result[ i ] = f; - } - - horizontalSum( thread, sum ); - if( 0 == thread ) - sharedAccumulators[ 0 ] = 1.0 / sum; - GroupMemoryBarrierWithGroupSync(); - const float scale = sharedAccumulators[ 0 ]; - - // 
ggml_vec_scale_f32 - for( i = p + thread; i < pEnd; i += 64 ) - { - float f = result[ i ]; - f *= scale; - result[ i ] = f; - } -}
\ No newline at end of file +#define THREADS 64 +#include "softMax.hlsl"
\ No newline at end of file diff --git a/ComputeShaders/softMaxLong.hlsl b/ComputeShaders/softMaxLong.hlsl new file mode 100644 index 0000000..1f2c2be --- /dev/null +++ b/ComputeShaders/softMaxLong.hlsl @@ -0,0 +1,6 @@ +// This version is for the "dec.probs" shader tag +// The input tensor has a size [ 51865, 3 ], a very long tensor with just 3 rows. +// Although the shader only runs on 3 GPU cores, a large thread count helps substantially: this shader is about 50% faster. +#define THREADS 1024 + +#include "softMax.hlsl"
\ No newline at end of file diff --git a/Whisper/D3D/shaderNames.cpp b/Whisper/D3D/shaderNames.cpp index 0605828..a631f08 100644 --- a/Whisper/D3D/shaderNames.cpp +++ b/Whisper/D3D/shaderNames.cpp @@ -2,7 +2,7 @@ #include "stdafx.h" #include "shaderNames.h" -static const std::array<const char*, 39> s_shaderNames = +static const std::array<const char*, 40> s_shaderNames = { "add", "addInPlace", @@ -42,6 +42,7 @@ static const std::array<const char*, 39> s_shaderNames = "softMax", "softMaxCompat", "softMaxFixed", + "softMaxLong", "zeroMemory", }; diff --git a/Whisper/D3D/shaderNames.h b/Whisper/D3D/shaderNames.h index 5942e72..80fc4fa 100644 --- a/Whisper/D3D/shaderNames.h +++ b/Whisper/D3D/shaderNames.h @@ -44,7 +44,8 @@ namespace DirectCompute softMax = 35, softMaxCompat = 36, softMaxFixed = 37, - zeroMemory = 38, + softMaxLong = 38, + zeroMemory = 39, }; const char* computeShaderName( eComputeShader cs ); diff --git a/Whisper/ML/MlContext.cpp b/Whisper/ML/MlContext.cpp index 6eeae09..a226999 100644 --- a/Whisper/ML/MlContext.cpp +++ b/Whisper/ML/MlContext.cpp @@ -556,7 +556,15 @@ void MlContext::softMax( Tensor& a, float inputScale ) printSizes.print( a ); #endif constexpr uint32_t FIXED_ROW_SIZE = 1500; - eComputeShader cs = ( a.ne[ 0 ] == FIXED_ROW_SIZE ) ? eComputeShader::softMaxFixed : eComputeShader::softMax; + + eComputeShader cs; + if( a.ne[ 0 ] == FIXED_ROW_SIZE ) + cs = eComputeShader::softMaxFixed; + else if( a.ne[ 0 ] >= ( 1024 * 4 ) ) + cs = eComputeShader::softMaxLong; + else + cs = eComputeShader::softMax; + bindShader( cs ); const uint32_t nr = a.countRows(); TensorShape dummyShape; |
