diff options
| field | value | date |
|---|---|---|
| author | Konstantin <const@const.me> | 2023-01-23 20:28:59 +0100 |
| committer | Konstantin <const@const.me> | 2023-01-23 20:28:59 +0100 |
| commit | 15dbcacdbc5db68c1ea86bb330d07ec70de75af6 (patch) | |
| tree | 7740da539ecd498ad143b733a50fa5043658e9cd | |
| parent | d3fe947eee55ea149c55c4f3a83ea285f9c0f5ba (diff) | |
Minor, micro-optimization
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | Whisper/ML/Context.ops.cpp | 17 |
| -rw-r--r-- | Whisper/ML/TensorShape.h | 4 |
| -rw-r--r-- | Whisper/Utils/miscUtils.h | 20 |
3 files changed, 26 insertions, 15 deletions
diff --git a/Whisper/ML/Context.ops.cpp b/Whisper/ML/Context.ops.cpp index a94497e..f6309f0 100644 --- a/Whisper/ML/Context.ops.cpp +++ b/Whisper/ML/Context.ops.cpp @@ -84,9 +84,20 @@ Tensor __declspec( noinline ) MlContext::view2d( const Tensor& a, uint32_t ne0, Tensor MlContext::transpose( const Tensor& a ) { - Tensor result = a; - std::swap( result.ne[ 0 ], result.ne[ 1 ] ); - std::swap( result.nb[ 0 ], result.nb[ 1 ] ); + Tensor result; + + // A magic number for _mm_shuffle_epi32 SSE2 instruction to swap two lower int32 lanes in a vector + constexpr int swapXy = _MM_SHUFFLE( 3, 2, 0, 1 ); + + __m128i v = a.sizeVec(); + v = _mm_shuffle_epi32( v, swapXy ); + store( result.ne, v ); + + v = a.stridesVec(); + v = _mm_shuffle_epi32( v, swapXy ); + store( result.nb, v ); + + result.setGpuViews( a, a ); return result; } diff --git a/Whisper/ML/TensorShape.h b/Whisper/ML/TensorShape.h index 473b0c9..5749764 100644 --- a/Whisper/ML/TensorShape.h +++ b/Whisper/ML/TensorShape.h @@ -28,11 +28,11 @@ namespace DirectCompute HRESULT create( const ggml_tensor& ggml ); TensorShape( const ggml_tensor& ggml ); - __m128i sizeVec() const + __m128i __vectorcall sizeVec() const { return load( ne ); } - __m128i stridesVec() const + __m128i __vectorcall stridesVec() const { return load( nb ); } diff --git a/Whisper/Utils/miscUtils.h b/Whisper/Utils/miscUtils.h index ff52e21..77f04e9 100644 --- a/Whisper/Utils/miscUtils.h +++ b/Whisper/Utils/miscUtils.h @@ -10,48 +10,48 @@ inline void check( HRESULT hr ) throw hr; } -inline __m128i load16( const int* rsi ) +inline __m128i __vectorcall load16( const int* rsi ) { return _mm_loadu_si128( ( const __m128i* )rsi ); } -inline __m128i load16( const uint32_t* rsi ) +inline __m128i __vectorcall load16( const uint32_t* rsi ) { return _mm_loadu_si128( ( const __m128i* )rsi ); } -inline __m128i load( const std::array<uint32_t, 4>& arr ) +inline __m128i __vectorcall load( const std::array<uint32_t, 4>& arr ) { return load16( arr.data() ); } -inline 
void store16( void* rdi, __m128i v ) +inline void __vectorcall store16( void* rdi, __m128i v ) { _mm_storeu_si128( ( __m128i* )rdi, v ); } -inline void store12( void* rdi, __m128i v ) +inline void __vectorcall store12( void* rdi, __m128i v ) { _mm_storel_epi64( ( __m128i* )rdi, v ); ( (int*)rdi )[ 2 ] = _mm_extract_epi32( v, 2 ); } -inline void store( std::array<uint32_t, 4>& arr, __m128i v ) +inline void __vectorcall store( std::array<uint32_t, 4>& arr, __m128i v ) { store16( arr.data(), v ); } -inline bool vectorEqual( __m128i a, __m128i b ) +inline bool __vectorcall vectorEqual( __m128i a, __m128i b ) { __m128i xx = _mm_xor_si128( a, b ); return (bool)_mm_testz_si128( xx, xx ); } -inline __m128i setLow_size( size_t low ) +inline __m128i __vectorcall setLow_size( size_t low ) { return _mm_cvtsi64_si128( (int64_t)low ); } -inline __m128i setr_size( size_t low, size_t high ) +inline __m128i __vectorcall setr_size( size_t low, size_t high ) { __m128i v = setLow_size( low ); v = _mm_insert_epi64( v, (int64_t)high, 1 ); return v; } -inline __m128i setHigh_size( size_t high ) +inline __m128i __vectorcall setHigh_size( size_t high ) { __m128i v = _mm_setzero_si128(); v = _mm_insert_epi64( v, (int64_t)high, 1 ); |
