summaryrefslogtreecommitdiffstats
path: root/ComputeShaders
diff options
context:
space:
mode:
authorKonstantin <const@const.me>2023-01-23 14:38:12 +0100
committerKonstantin <const@const.me>2023-01-23 14:38:12 +0100
commit27dfc3428a7016e2d05dd67b6d8b88c0b982baa9 (patch)
treef969d54ebfb266ecf61285a039295a1da37200a0 /ComputeShaders
parent01aba39f15a03ed96e034ffc3b6ee9ec12294b0d (diff)
Performance improvement, `softMax` shader
Diffstat (limited to 'ComputeShaders')
-rw-r--r--ComputeShaders/ComputeShaders.vcxproj1
-rw-r--r--ComputeShaders/ComputeShaders.vcxproj.filters1
-rw-r--r--ComputeShaders/softMax.hlsl61
-rw-r--r--ComputeShaders/softMax64.hlsl73
-rw-r--r--ComputeShaders/softMaxLong.hlsl6
5 files changed, 55 insertions, 87 deletions
diff --git a/ComputeShaders/ComputeShaders.vcxproj b/ComputeShaders/ComputeShaders.vcxproj
index 1d9343d..300974a 100644
--- a/ComputeShaders/ComputeShaders.vcxproj
+++ b/ComputeShaders/ComputeShaders.vcxproj
@@ -202,6 +202,7 @@
<FxCompile Include="softMax64.hlsl" />
<FxCompile Include="softMaxCompat.hlsl" />
<FxCompile Include="softMaxFixed.hlsl" />
+ <FxCompile Include="softMaxLong.hlsl" />
<FxCompile Include="zeroMemory.hlsl" />
</ItemGroup>
<ItemGroup>
diff --git a/ComputeShaders/ComputeShaders.vcxproj.filters b/ComputeShaders/ComputeShaders.vcxproj.filters
index b827710..de9f12f 100644
--- a/ComputeShaders/ComputeShaders.vcxproj.filters
+++ b/ComputeShaders/ComputeShaders.vcxproj.filters
@@ -51,6 +51,7 @@
<FxCompile Include="matReshapePanels.hlsl" />
<FxCompile Include="mulMatByRowTiledEx.hlsl" />
<FxCompile Include="addRepeatEx.hlsl" />
+ <FxCompile Include="softMaxLong.hlsl" />
</ItemGroup>
<ItemGroup>
<None Include="componentwiseBinaryOp.hlsli" />
diff --git a/ComputeShaders/softMax.hlsl b/ComputeShaders/softMax.hlsl
index 6ebd0f2..259e457 100644
--- a/ComputeShaders/softMax.hlsl
+++ b/ComputeShaders/softMax.hlsl
@@ -1,9 +1,6 @@
// Dispatch [ nr, 1, 1 ] thread groups of this shader
RWBuffer<float> result: register( u0 );
-// table_exp_f16
-Buffer<uint> lookupTable: register( t0 );
-
cbuffer Constants: register( b0 )
{
uint4 elements: packoffset( c0 );
@@ -12,12 +9,50 @@ cbuffer Constants: register( b0 )
float inputScale: packoffset( c2.y );
}
-#include "miscUtils.hlsli"
-#include "groupReduce.hlsli"
+#ifndef THREADS
+static const uint THREADS = 32;
+#endif
+
+groupshared float sharedAccumulators[ THREADS ];
+
+// Compute horizontal maximum of `ax` across the threads of the group, and broadcast it into `ax` on every thread. `thread` indexes sharedAccumulators; the halving loop covers every slot only when THREADS is a power of 2.
+void horizontalMaxBroadcast( const uint thread, inout float ax )
+{
+ sharedAccumulators[ thread ] = ax; // publish this thread's value to the groupshared array
+ for( uint i = THREADS / 2; i > 0; i /= 2 ) // each pass halves the number of active threads
+ {
+ GroupMemoryBarrierWithGroupSync(); // writes of the previous pass must be visible before they are read
+ if( thread < i )
+ {
+ ax = max( ax, sharedAccumulators[ thread + i ] ); // fold in the partner slot from the upper half
+ sharedAccumulators[ thread ] = ax;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync(); // wait until thread #0 has stored the final maximum into slot 0
+ ax = sharedAccumulators[ 0 ]; // every thread picks up the group-wide maximum
+}
+
+// Compute horizontal sum of the numbers. The result is only correct on the thread #0 of the group; the other threads keep partial sums. Reads slot 1, so THREADS must be at least 2.
+void horizontalSum( const uint thread, inout float sum )
+{
+ sharedAccumulators[ thread ] = sum; // NOTE(review): no barrier precedes this write; confirm no thread can still be reading sharedAccumulators from an earlier reduction
+ for( uint i = THREADS / 2; i > 1; i /= 2 ) // stops after the i == 2 pass; the last pair is added below
+ {
+ GroupMemoryBarrierWithGroupSync(); // writes of the previous pass must be visible before they are read
+ if( thread < i )
+ {
+ sum += sharedAccumulators[ thread + i ]; // add the partner slot from the upper half
+ sharedAccumulators[ thread ] = sum;
+ }
+ }
+ GroupMemoryBarrierWithGroupSync();
+ if( 0 == thread )
+ sum += sharedAccumulators[ 1 ]; // final pair is added on thread #0 only; the total is not written back to shared memory
+}
static const float negativeInfinity = asfloat( 0xff800000 );
-[ numthreads( 32, 1, 1 ) ]
+[numthreads( THREADS, 1, 1 )]
void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
{
const uint p = group.x * strides[ 1 ];
@@ -26,12 +61,12 @@ void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
uint i;
float m = negativeInfinity;
- for( i = p + thread; i < pEnd; i += 32 )
+ for( i = p + thread; i < pEnd; i += THREADS )
m = max( m, result[ i ] );
horizontalMaxBroadcast( thread, m );
float sum = 0;
- for( i = p + thread; i < pEnd; i += 32 )
+ for( i = p + thread; i < pEnd; i += THREADS )
{
float f = result[ i ];
@@ -39,14 +74,8 @@ void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
if( f != negativeInfinity )
{
f = ( f - m ) * inputScale;
-#if 1
- // Similar to Radeon Graphics, computing the exponent on nVidia 1080Ti is also slightly faster than loading from the lookup table
+ // On both Radeon Graphics and nVidia 1080Ti, computing the exponent is slightly faster than loading from the lookup table
f = exp( f );
-#else
- uint s = fp16Rounded( f );
- s = lookupTable[ s ];
- f = f16tof32( s );
-#endif
sum += f;
}
else
@@ -62,7 +91,7 @@ void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
const float scale = sharedAccumulators[ 0 ];
// ggml_vec_scale_f32
- for( i = p + thread; i < pEnd; i += 32 )
+ for( i = p + thread; i < pEnd; i += THREADS )
{
float f = result[ i ];
f *= scale;
diff --git a/ComputeShaders/softMax64.hlsl b/ComputeShaders/softMax64.hlsl
index 7ecd2ef..f73551e 100644
--- a/ComputeShaders/softMax64.hlsl
+++ b/ComputeShaders/softMax64.hlsl
@@ -1,71 +1,2 @@
-// Dispatch [ nr, 1, 1 ] thread groups of this shader
-RWBuffer<float> result: register( u0 );
-
-// table_exp_f16
-Buffer<uint> lookupTable: register( t0 );
-
-cbuffer Constants: register( b0 )
-{
- uint4 elements: packoffset( c0 );
- uint4 strides: packoffset( c1 );
- uint nr: packoffset( c2.x );
- float inputScale: packoffset( c2.y );
-}
-
-#include "miscUtils.hlsli"
-#include "groupReduce64.hlsli"
-
-static const float negativeInfinity = asfloat( 0xff800000 );
-
-[ numthreads( 64, 1, 1 ) ]
-void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
-{
- const uint p = group.x * strides[ 1 ];
- const uint nc = elements[ 0 ];
- const uint pEnd = p + nc;
- uint i;
-
- float m = negativeInfinity;
- for( i = p + thread; i < pEnd; i += 64 )
- m = max( m, result[ i ] );
- horizontalMaxBroadcast( thread, m );
-
- float sum = 0;
- for( i = p + thread; i < pEnd; i += 64 )
- {
- float f = result[ i ];
-
- [branch]
- if( f != negativeInfinity )
- {
- f = ( f - m ) * inputScale;
-#if 1
- // At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance
- f = exp( f );
-#else
- uint s = fp16Rounded( f );
- s = lookupTable[ s ];
- f = f16tof32( s );
-#endif
- sum += f;
- }
- else
- f = 0;
-
- result[ i ] = f;
- }
-
- horizontalSum( thread, sum );
- if( 0 == thread )
- sharedAccumulators[ 0 ] = 1.0 / sum;
- GroupMemoryBarrierWithGroupSync();
- const float scale = sharedAccumulators[ 0 ];
-
- // ggml_vec_scale_f32
- for( i = p + thread; i < pEnd; i += 64 )
- {
- float f = result[ i ];
- f *= scale;
- result[ i ] = f;
- }
-} \ No newline at end of file
+#define THREADS 64
+#include "softMax.hlsl" \ No newline at end of file
diff --git a/ComputeShaders/softMaxLong.hlsl b/ComputeShaders/softMaxLong.hlsl
new file mode 100644
index 0000000..1f2c2be
--- /dev/null
+++ b/ComputeShaders/softMaxLong.hlsl
@@ -0,0 +1,6 @@
+// This version is for the "dec.probs" shader tag
+// The input tensor has a size [ 51865, 3 ], a very long tensor with just 3 rows.
+// Although the shader only runs on 3 GPU cores, a large thread count helps substantially: this shader is about 50% faster.
+#define THREADS 1024
+
+#include "softMax.hlsl" \ No newline at end of file