From 01325d7168669f8d05446314f8b53c62e7de3af9 Mon Sep 17 00:00:00 2001
From: Konstantin <const@const.me>
Date: Mon, 23 Jan 2023 16:44:52 +0100
Subject: Minor, micro-optimization

---
 Whisper/Hybrid/HybridContext.cpp   |  4 ++--
 Whisper/Utils/miscUtils.cpp        | 24 ++++++++++++++++++++++++
 Whisper/Utils/miscUtils.h          |  5 ++++-
 Whisper/Whisper/WhisperContext.cpp |  6 +++---
 4 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/Whisper/Hybrid/HybridContext.cpp b/Whisper/Hybrid/HybridContext.cpp
index 64ed2e6..5811bbd 100644
--- a/Whisper/Hybrid/HybridContext.cpp
+++ b/Whisper/Hybrid/HybridContext.cpp
@@ -176,7 +176,7 @@ HRESULT HybridContext::decode( const int* tokens, const int n_tokens, const int
 		{
 			Tensor Qcur = ml.mulMat( layer.attnQuery.w, cur );
 			if( 0 == il ) Tracing::tensor( "dec-Qcur-0", Qcur );
-			const float scaling = (float)pow( float( (int)n_state ) / (int)n_head, -0.25 );
+			const float scaling = computeScaling( (int)n_state, (int)n_head );
 			ml.addRepeatScale( Qcur, layer.attnQuery.b, scaling );
 			if( 0 == il ) Tracing::tensor( "dec-Qcur-1", Qcur );
 
@@ -241,7 +241,7 @@ HRESULT HybridContext::decode( const int* tokens, const int n_tokens, const int
 		// cross-attention
 		{
 			Tensor Qcur = ml.mulMat( layer.crossAttnQuery.w, cur );
-			ml.addRepeatScale( Qcur, layer.crossAttnQuery.b, (float)pow( float( (int)n_state ) / (int)n_head, -0.25 ) );
+			ml.addRepeatScale( Qcur, layer.crossAttnQuery.b, computeScaling( (int)n_state, (int)n_head ) );
 
 			// Kcross is already scaled
 			const uint32_t len = M * n_state;
diff --git a/Whisper/Utils/miscUtils.cpp b/Whisper/Utils/miscUtils.cpp
index c3f7dd1..6644220 100644
--- a/Whisper/Utils/miscUtils.cpp
+++ b/Whisper/Utils/miscUtils.cpp
@@ -1,5 +1,6 @@
 #include "stdafx.h"
 #include "miscUtils.h"
+#include <cmath>
 
 void setCurrentThreadName( const char* threadName )
 {
@@ -30,4 +31,27 @@ void setCurrentThreadName( const char* threadName )
 	__except( EXCEPTION_EXECUTE_HANDLER )
 	{
 	}
+}
+
+float computeScaling( int mul, int div ) // Returns pow( (double)mul / (double)div, -0.25 ) as FP32, using two square roots and a reciprocal instead of pow()
+{
+#ifdef _DEBUG
+	const float ref = (float)std::pow( (double)mul / (double)div, -0.25 ); // Reference value from the plain formula. NOTE(review): never read in this function; presumably kept for inspection in a debugger, confirm
+#endif
+	// Make int32 vector with both numbers: lane 0 = mul, lane 1 = div; note _mm_insert_epi32 is an SSE4.1 intrinsic
+	__m128i iv = _mm_cvtsi32_si128( mul );
+	iv = _mm_insert_epi32( iv, div, 1 );
+	// Convert both numbers to FP64, the vector becomes [ (double)mul, (double)div ]
+	__m128d v = _mm_cvtepi32_pd( iv );
+	// Compute mul / div in the lower lane; the unpackhi copies the upper lane, i.e. div, into the lower lane of the divisor
+	v = _mm_div_sd( v, _mm_unpackhi_pd( v, v ) );
+	// Square root, the lower lane is now ( mul / div ) ^ 0.5
+	v = _mm_sqrt_sd( v, v );
+	// 4-th root, computed as the square root of the square root: ( mul / div ) ^ 0.25
+	v = _mm_sqrt_sd( v, v );
+	// Invert the value, 1.0 / x, which flips the sign of the exponent: ( mul / div ) ^ -0.25
+	v = _mm_div_sd( _mm_set_sd( 1.0 ), v );
+	// Downcast to FP32, and return the result
+	__m128 f32 = _mm_cvtsd_ss( _mm_setzero_ps(), v );
+	return _mm_cvtss_f32( f32 );
+}
\ No newline at end of file
diff --git a/Whisper/Utils/miscUtils.h b/Whisper/Utils/miscUtils.h
index d665cbc..ff52e21 100644
--- a/Whisper/Utils/miscUtils.h
+++ b/Whisper/Utils/miscUtils.h
@@ -78,4 +78,7 @@ template<class E>
 inline size_t vectorMemoryUse( const std::vector<E>& vec )
 {
 	return sizeof( E ) * vec.capacity();
-}
\ No newline at end of file
+}
+
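+// Computes pow( (double)mul / (double)div, -0.25 ): the division is done in floating point, not as integers, and the result is returned as FP32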
+float computeScaling( int mul, int div );
\ No newline at end of file
diff --git a/Whisper/Whisper/WhisperContext.cpp b/Whisper/Whisper/WhisperContext.cpp
index e694930..823d207 100644
--- a/Whisper/Whisper/WhisperContext.cpp
+++ b/Whisper/Whisper/WhisperContext.cpp
@@ -358,7 +358,7 @@ Tensor WhisperContext::encode( Whisper::iSpectrogram& spectrogram, const sEncode
 
 		const size_t layersCount = encParams.n_text_layer;
 		const uint32_t stride = encParams.n_state * encParams.n_ctx;
-		const float finalScaling = (float)pow( float( encParams.n_state ) / encParams.n_head, -0.25 );
+		const float finalScaling = computeScaling( (int)encParams.n_state, (int)encParams.n_head );
 		for( size_t i = 0; i < layersCount; i++ )
 		{
 			const LayerDecoder& layer = gpuModel.dec.layers[ i ];
@@ -422,7 +422,7 @@ Tensor WhisperContext::decodeLayer( const Tensor& inpL, size_t il, const sLayerD
 		profiler.setNextTag( "dec.layer.1" );
 		Tensor Qcur = mulMat( layer.attnQuery.w, cur );
 		if( 0 == il ) Tracing::tensor( "dec-Qcur-0", Qcur );
-		const float scaling = (float)pow( float( (int)ldp.n_state ) / (int)ldp.n_head, -0.25 );
+		const float scaling = computeScaling( (int)ldp.n_state, (int)ldp.n_head );
 		addRepeatScale( Qcur, layer.attnQuery.b, scaling );
 		if( 0 == il ) Tracing::tensor( "dec-Qcur-1", Qcur );
 
@@ -494,7 +494,7 @@ Tensor WhisperContext::decodeLayer( const Tensor& inpL, size_t il, const sLayerD
 	{
 		profiler.setNextTag( "dec.layer.7" );
 		Tensor Qcur = mulMat( layer.crossAttnQuery.w, cur );
-		addRepeatScale( Qcur, layer.crossAttnQuery.b, (float)pow( float( (int)ldp.n_state ) / (int)ldp.n_head, -0.25 ) );
+		addRepeatScale( Qcur, layer.crossAttnQuery.b, computeScaling( (int)ldp.n_state, (int)ldp.n_head ) );
 
 		// Kcross is already scaled
 		const uint32_t len = ldp.M * ldp.n_state;
-- 
cgit v1.2.3