/// Compute the scaling factor pow( mul / div, -0.25 ) without calling pow().
///
/// Because x^(-1/4) == 1 / sqrt( sqrt( x ) ), two square roots and one division are enough.
/// All intermediate math is done in FP64; the value is rounded to FP32 only once, on return.
/// Division and square root are correctly rounded IEEE-754 operations, so plain scalar code
/// gives the same result as hand-written SSE intrinsics while staying portable
/// (no SSE4.1 requirement).
///
/// @param mul  Numerator of the ratio; the call sites in this patch pass n_state.
/// @param div  Denominator of the ratio; the call sites in this patch pass n_head.
/// @return     ( mul / div ) raised to the power -0.25, as a float.
///             Inputs are not validated: non-positive values yield the usual IEEE-754
///             special values ( 0, infinity or NaN ) rather than an error.
float computeScaling( int mul, int div )
{
	// int32 -> FP64 conversion is exact, so the ratio is computed with full FP64 precision
	const double ratio = static_cast<double>( mul ) / static_cast<double>( div );
	// Square root applied twice is the 4-th root
	const double fourthRoot = std::sqrt( std::sqrt( ratio ) );
	// Invert the value, then downcast to FP32
	return static_cast<float>( 1.0 / fourthRoot );
}
Qcur ); - const float scaling = (float)pow( float( (int)ldp.n_state ) / (int)ldp.n_head, -0.25 ); + const float scaling = computeScaling( (int)ldp.n_state, (int)ldp.n_head ); addRepeatScale( Qcur, layer.attnQuery.b, scaling ); if( 0 == il ) Tracing::tensor( "dec-Qcur-1", Qcur ); @@ -494,7 +494,7 @@ Tensor WhisperContext::decodeLayer( const Tensor& inpL, size_t il, const sLayerD { profiler.setNextTag( "dec.layer.7" ); Tensor Qcur = mulMat( layer.crossAttnQuery.w, cur ); - addRepeatScale( Qcur, layer.crossAttnQuery.b, (float)pow( float( (int)ldp.n_state ) / (int)ldp.n_head, -0.25 ) ); + addRepeatScale( Qcur, layer.crossAttnQuery.b, computeScaling( (int)ldp.n_state, (int)ldp.n_head ) ); // Kcross is already scaled const uint32_t len = ldp.M * ldp.n_state; -- cgit v1.2.3