CPU performance, SSE vectorization for MEL spectrogram

author: Konstantin <const@const.me> 2023-01-21 18:27:21 +0100
committer: Konstantin <const@const.me> 2023-01-21 18:27:21 +0100
commit: b4a6e7c59a3def6b7212396cdbf5f08b2119ffa8 (patch)
tree: 892f4fc36e2999b31abe11b0e8fdcba1ad5d6d77
parent: 284c76c42582f8f09eae6a29c16cda74a4192f3e (diff)
1 files changed, 104 insertions, 9 deletions
diff --git a/Whisper/Whisper/melSpectrogram.cpp b/Whisper/Whisper/melSpectrogram.cpp
index f297557..468962f 100644
--- a/Whisper/Whisper/melSpectrogram.cpp
+++ b/Whisper/Whisper/melSpectrogram.cpp
@@ -1,6 +1,8 @@
 #include "stdafx.h"
 #include <cmath>
 #include "melSpectrogram.h"
+#define _XM_SSE4_INTRINSICS_
+#include <DirectXMath.h>
 
 namespace Whisper
 {
@@ -39,16 +41,116 @@ namespace
 	// const uint32_t tempBufferSize = FFT_SIZE + tempVectorSizeRecursion( FFT_SIZE );
 	constexpr uint32_t tempBufferSize = 6000;
 
+	// [ a, b, c, d ], [ e, f, g, h ] => [ a+b+c+d, e+f+g+h ]
+	inline  __m128 hadd2( __m128 low, __m128 high )
+	{
+		// [ a, e, b, f ]
+		__m128 a = _mm_unpacklo_ps( low, high );
+		// [ c, g, d, h ]
+		__m128 b = _mm_unpackhi_ps( low, high );
+		// [ a+c, e+g, b+d, f+h ]
+		__m128 r = _mm_add_ps( a, b );
+		// [ b+d, f+h, b+d, f+h ]
+		__m128 tmp = _mm_movehl_ps( r, r );
+		// [ a+c+b+d, e+g+f+h ]
+		return _mm_add_ps( r, tmp );
+	}
+
+	inline __m128 load2( const float* rsi )
+	{
+		return _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) );
+	}
+	inline void store2( float* rdi, __m128 vec )
+	{
+		_mm_store_sd( (double*)rdi, _mm_castps_pd( vec ) );
+	}
+	inline __m128 loadFloat3( const float* rsi )
+	{
+		__m128 f = load2( rsi );
+		f = _mm_insert_ps( f, _mm_load_ss( rsi + 2 ), 0x20 );
+		return f;
+	}
+	inline __m128 loadPartial( const float* rsi, size_t rem )
+	{
+		assert( rem > 0 && rem < 4 );
+		switch( rem )
+		{
+		case 1:
+			return _mm_load_ss( rsi );
+		case 2:
+			return load2( rsi );
+		case 3:
+			return loadFloat3( rsi );
+		}
+		return _mm_setzero_ps();
+	}
+
 	// naive Discrete Fourier Transform
 	// input is real-valued
 	// output is complex-valued
 	inline void dft( const float* rsi, size_t len, float* rdi )
 	{
+		const size_t lenAligned = len & ( ~(size_t)3 );
+		const size_t remainder = len % 4;
+
+		const double mulScalarBase = ( 2.0 * M_PI ) / (double)(int)len;
+
+		const __m128 nvInitial = _mm_setr_ps( 0, 1, 2, 3 );
+		const __m128 nvInc = _mm_set1_ps( 4 );
+
 		for( size_t k = 0; k < len; k++ )
 		{
+#if 1
+			const __m128 mul = _mm_set1_ps( (float)( mulScalarBase * (int)k ) );
+
+			__m128 nv = nvInitial;
+			__m128 cosine = _mm_setzero_ps();
+			__m128 sine = _mm_setzero_ps();
+			for( size_t n = 0; n < lenAligned; n += 4 )
+			{
+				const __m128 angles = _mm_mul_ps( nv, mul );
+				nv = _mm_add_ps( nv, nvInc );
+				__m128 s, c;
+				// That library function from Windows SDK is way faster than std::sinf/cosf
+				// Especially because we use the version which computes 4 angles at once
+				// Source codes there: https://github.com/microsoft/DirectXMath/blob/dec2022/Inc/DirectXMathVector.inl#L4456-L4512
+				DirectX::XMVectorSinCos( &s, &c, angles );
+
+				// Multiply sin/cos by 4 source values
+				const __m128 source = _mm_loadu_ps( &rsi[ n ] );
+				c = _mm_mul_ps( c, source );
+				s = _mm_mul_ps( s, source );
+
+				// Accumulate in 2 vectors
+				cosine = _mm_add_ps( cosine, c );
+				sine = _mm_sub_ps( sine, s );
+			}
+
+			// Handle the remainder; debugger shows it's always 1, BTW
+			if( 0 != remainder )
+			{
+				const __m128 angles = _mm_mul_ps( nv, mul );
+				__m128 s, c;
+				DirectX::XMVectorSinCos( &s, &c, angles );
+				// loadPartial sets unused lanes to 0..
+				const __m128 source = loadPartial( &rsi[ lenAligned ], remainder );
+				// x * 0.0 == 0.0 ..
+				c = _mm_mul_ps( c, source );
+				s = _mm_mul_ps( s, source );
+				// .. that's why it's fine to accumulate the complete vectors.
+				// Adding or subtracting zero doesn't change the accumulator
+				cosine = _mm_add_ps( cosine, c );
+				sine = _mm_sub_ps( sine, s );
+			}
+
+			// Reduce 2*4 accumulators -> 2 scalars in a single vector
+			const __m128 res = hadd2( cosine, sine );
+			// Store 2 floats, with 1 instruction
+			store2( &rdi[ k * 2 ], res );
+#else
+			// Original scalar version here
 			float re = 0;
 			float im = 0;
-
 			for( int n = 0; n < len; n++ )
 			{
 				float angle = (float)( 2 * M_PI * (int)k * n / len );
@@ -58,6 +160,7 @@ namespace
 
 			rdi[ k * 2 + 0 ] = re;
 			rdi[ k * 2 + 1 ] = im;
+#endif
 		}
 	}
 
@@ -96,14 +199,6 @@ namespace
 		__m128 v = _mm_set_ss( f );
 		return _mm_moveldup_ps( v );
 	}
-	inline __m128 load2( const float* rsi )
-	{
-		return _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) );
-	}
-	inline void store2( float* rdi, __m128 vec )
-	{
-		_mm_store_sd( (double*)rdi, _mm_castps_pd( vec ) );
-	}
 	// [ x, y ] => [ x, y, x, y ]
 	inline __m128 dup2( __m128 x )
 	{
author	Konstantin <const@const.me>	2023-01-21 18:27:21 +0100
committer	Konstantin <const@const.me>	2023-01-21 18:27:21 +0100
commit	b4a6e7c59a3def6b7212396cdbf5f08b2119ffa8 (patch)
tree	892f4fc36e2999b31abe11b0e8fdcba1ad5d6d77
parent	284c76c42582f8f09eae6a29c16cda74a4192f3e (diff)