diff options
| author | Konstantin <const@const.me> | 2023-01-23 16:44:52 +0100 |
|---|---|---|
| committer | Konstantin <const@const.me> | 2023-01-23 16:44:52 +0100 |
| commit | 01325d7168669f8d05446314f8b53c62e7de3af9 (patch) | |
| tree | 49ff276c29b5e3b513ca61cf426603fd8ee3cc0b | |
| parent | 27dfc3428a7016e2d05dd67b6d8b88c0b982baa9 (diff) | |
Minor, micro-optimization
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | Whisper/Hybrid/HybridContext.cpp | 4 |
| -rw-r--r-- | Whisper/Utils/miscUtils.cpp | 24 |
| -rw-r--r-- | Whisper/Utils/miscUtils.h | 5 |
| -rw-r--r-- | Whisper/Whisper/WhisperContext.cpp | 6 |
4 files changed, 33 insertions, 6 deletions
diff --git a/Whisper/Hybrid/HybridContext.cpp b/Whisper/Hybrid/HybridContext.cpp index 64ed2e6..5811bbd 100644 --- a/Whisper/Hybrid/HybridContext.cpp +++ b/Whisper/Hybrid/HybridContext.cpp @@ -176,7 +176,7 @@ HRESULT HybridContext::decode( const int* tokens, const int n_tokens, const int { Tensor Qcur = ml.mulMat( layer.attnQuery.w, cur ); if( 0 == il ) Tracing::tensor( "dec-Qcur-0", Qcur ); - const float scaling = (float)pow( float( (int)n_state ) / (int)n_head, -0.25 ); + const float scaling = computeScaling( (int)n_state, (int)n_head ); ml.addRepeatScale( Qcur, layer.attnQuery.b, scaling ); if( 0 == il ) Tracing::tensor( "dec-Qcur-1", Qcur ); @@ -241,7 +241,7 @@ HRESULT HybridContext::decode( const int* tokens, const int n_tokens, const int // cross-attention { Tensor Qcur = ml.mulMat( layer.crossAttnQuery.w, cur ); - ml.addRepeatScale( Qcur, layer.crossAttnQuery.b, (float)pow( float( (int)n_state ) / (int)n_head, -0.25 ) ); + ml.addRepeatScale( Qcur, layer.crossAttnQuery.b, computeScaling( (int)n_state, (int)n_head ) ); // Kcross is already scaled const uint32_t len = M * n_state; diff --git a/Whisper/Utils/miscUtils.cpp b/Whisper/Utils/miscUtils.cpp index c3f7dd1..6644220 100644 --- a/Whisper/Utils/miscUtils.cpp +++ b/Whisper/Utils/miscUtils.cpp @@ -1,5 +1,6 @@ #include "stdafx.h" #include "miscUtils.h" +#include <cmath> void setCurrentThreadName( const char* threadName ) { @@ -30,4 +31,27 @@ void setCurrentThreadName( const char* threadName ) __except( EXCEPTION_EXECUTE_HANDLER ) { } +} + +float computeScaling( int mul, int div ) +{ +#ifdef _DEBUG + const float ref = (float)std::pow( (double)mul / (double)div, -0.25 ); +#endif + // Make int32 vector with both numbers + __m128i iv = _mm_cvtsi32_si128( mul ); + iv = _mm_insert_epi32( iv, div, 1 ); + // Convert both numbers to FP64 + __m128d v = _mm_cvtepi32_pd( iv ); + // Compute mul / div + v = _mm_div_sd( v, _mm_unpackhi_pd( v, v ) ); + // Square root + v = _mm_sqrt_sd( v, v ); + // 4-th root + v = 
_mm_sqrt_sd( v, v ); + // Invert the value + v = _mm_div_sd( _mm_set_sd( 1.0 ), v ); + // Downcast to FP32, and return the result + __m128 f32 = _mm_cvtsd_ss( _mm_setzero_ps(), v ); + return _mm_cvtss_f32( f32 ); }
\ No newline at end of file diff --git a/Whisper/Utils/miscUtils.h b/Whisper/Utils/miscUtils.h index d665cbc..ff52e21 100644 --- a/Whisper/Utils/miscUtils.h +++ b/Whisper/Utils/miscUtils.h @@ -78,4 +78,7 @@ template<class E> inline size_t vectorMemoryUse( const std::vector<E>& vec ) { return sizeof( E ) * vec.capacity(); -}
\ No newline at end of file +} + +// The formula is pow( mul / div, -0.25 ) +float computeScaling( int mul, int div );
\ No newline at end of file diff --git a/Whisper/Whisper/WhisperContext.cpp b/Whisper/Whisper/WhisperContext.cpp index e694930..823d207 100644 --- a/Whisper/Whisper/WhisperContext.cpp +++ b/Whisper/Whisper/WhisperContext.cpp @@ -358,7 +358,7 @@ Tensor WhisperContext::encode( Whisper::iSpectrogram& spectrogram, const sEncode const size_t layersCount = encParams.n_text_layer; const uint32_t stride = encParams.n_state * encParams.n_ctx; - const float finalScaling = (float)pow( float( encParams.n_state ) / encParams.n_head, -0.25 ); + const float finalScaling = computeScaling( (int)encParams.n_state, (int)encParams.n_head ); for( size_t i = 0; i < layersCount; i++ ) { const LayerDecoder& layer = gpuModel.dec.layers[ i ]; @@ -422,7 +422,7 @@ Tensor WhisperContext::decodeLayer( const Tensor& inpL, size_t il, const sLayerD profiler.setNextTag( "dec.layer.1" ); Tensor Qcur = mulMat( layer.attnQuery.w, cur ); if( 0 == il ) Tracing::tensor( "dec-Qcur-0", Qcur ); - const float scaling = (float)pow( float( (int)ldp.n_state ) / (int)ldp.n_head, -0.25 ); + const float scaling = computeScaling( (int)ldp.n_state, (int)ldp.n_head ); addRepeatScale( Qcur, layer.attnQuery.b, scaling ); if( 0 == il ) Tracing::tensor( "dec-Qcur-1", Qcur ); @@ -494,7 +494,7 @@ Tensor WhisperContext::decodeLayer( const Tensor& inpL, size_t il, const sLayerD { profiler.setNextTag( "dec.layer.7" ); Tensor Qcur = mulMat( layer.crossAttnQuery.w, cur ); - addRepeatScale( Qcur, layer.crossAttnQuery.b, (float)pow( float( (int)ldp.n_state ) / (int)ldp.n_head, -0.25 ) ); + addRepeatScale( Qcur, layer.crossAttnQuery.b, computeScaling( (int)ldp.n_state, (int)ldp.n_head ) ); // Kcross is already scaled const uint32_t len = ldp.M * ldp.n_state; |
