diff options
| author | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-16 14:52:43 +0100 |
| commit | 8c4603c73675958efc960fbd4bb599a2909d106a (patch) | |
| tree | 714dc6fc9a1672d5fd7f89676b97e10959662abc /ComputeShaders/softMax64.hlsl | |
| parent | 990a8d0dbaefc996244097397259e92758b15cce (diff) | |
Source codes
Diffstat (limited to 'ComputeShaders/softMax64.hlsl')
| -rw-r--r-- | ComputeShaders/softMax64.hlsl | 71 |
1 files changed, 71 insertions, 0 deletions
// ComputeShaders/softMax64.hlsl
//
// In-place softmax over rows of the `result` buffer.
// Dispatch [ nr, 1, 1 ] thread groups of this shader; every group runs 64 threads and handles one row.
// Row `group.x` starts at element `group.x * strides[ 1 ]` and is `elements[ 0 ]` values long.
// For each row the shader:
//   1. finds the maximum value of the row,
//   2. overwrites every value x with exp( ( x - max ) * inputScale ), writing 0 for values equal to negative infinity,
//   3. multiplies every value by 1.0 / ( sum of the values produced in step 2 ).

// The tensor being transformed; it is both the input and the output
RWBuffer<float> result: register( u0 );

// table_exp_f16
// Only referenced by the disabled `#else` path inside main()
Buffer<uint> lookupTable: register( t0 );

cbuffer Constants: register( b0 )
{
	// Only elements[ 0 ] is read by this shader: count of values in a row
	uint4 elements: packoffset( c0 );
	// Only strides[ 1 ] is read by this shader: distance between the starts of consecutive rows, in elements
	uint4 strides: packoffset( c1 );
	// Count of thread groups to dispatch, according to the comment at the top; not read by the code in this file
	uint nr: packoffset( c2.x );
	// Multiplier applied to ( x - max ) before the exponent
	float inputScale: packoffset( c2.y );
}

#include "miscUtils.hlsli"
#include "groupReduce64.hlsli"

// Bit pattern 0xff800000 reinterpreted as float
static const float negativeInfinity = asfloat( 0xff800000 );

// Entry point.
// group: index of the thread group; only the X component is used, to select the row
// thread: flattened index of the thread within the group; with numthreads( 64, 1, 1 ) that is a value in [ 0 .. 63 ]
[ numthreads( 64, 1, 1 ) ]
void main( uint3 group: SV_GroupID, uint thread : SV_GroupIndex )
{
	const uint rowLength = elements[ 0 ];
	const uint rowBegin = group.x * strides[ 1 ];
	const uint rowEnd = rowBegin + rowLength;
	// Single index variable shared by all three passes below
	uint idx;

	// Pass 1: every thread finds the maximum over its own subset of the row, elements thread, thread + 64, thread + 128, ...
	float rowMax = negativeInfinity;
	for( idx = rowBegin + thread; idx < rowEnd; idx += 64 )
	{
		const float current = result[ idx ];
		rowMax = max( rowMax, current );
	}
	// Defined in an included file; NOTE(review): presumably leaves the maximum across all 64 threads in rowMax on every thread - confirm in groupReduce64.hlsli
	horizontalMaxBroadcast( thread, rowMax );

	// Pass 2: replace the values with the scaled exponents, and accumulate the per-thread sum of them
	float expSum = 0;
	for( idx = rowBegin + thread; idx < rowEnd; idx += 64 )
	{
		const float source = result[ idx ];
		// Values equal to negative infinity are stored as 0, and contribute nothing to the sum
		float exponent = 0;

		[branch]
		if( source != negativeInfinity )
		{
			exponent = ( source - rowMax ) * inputScale;
#if 1
			// At least on Radeon Graphics GPU inside Ryzen 7 5700G, computing exponent instead of loading from the buffer improves the performance
			exponent = exp( exponent );
#else
			// Disabled alternative: fetch the exponent from the lookup table, indexed by the value returned from fp16Rounded
			uint tableEntry = fp16Rounded( exponent );
			tableEntry = lookupTable[ tableEntry ];
			exponent = f16tof32( tableEntry );
#endif
			expSum += exponent;
		}

		result[ idx ] = exponent;
	}

	// Defined in an included file; NOTE(review): presumably thread #0 ends up with the total across the group - confirm in groupReduce64.hlsli
	horizontalSum( thread, expSum );
	// Thread #0 publishes the reciprocal of the sum through sharedAccumulators, which is declared outside of this file.
	// The barrier makes the rest of the group wait for that store before loading the value.
	if( 0 == thread )
		sharedAccumulators[ 0 ] = 1.0 / expSum;
	GroupMemoryBarrierWithGroupSync();
	const float normalizer = sharedAccumulators[ 0 ];

	// Pass 3: ggml_vec_scale_f32
	// Each thread visits the same elements it wrote during pass 2
	for( idx = rowBegin + thread; idx < rowEnd; idx += 64 )
		result[ idx ] = result[ idx ] * normalizer;
}
\ No newline at end of file |
