Source codes

author: Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
committer: Konstantin <const@const.me> 2023-01-16 14:52:43 +0100
commit: 8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree: 714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/CPU/TensorCpu.cpp
parent: 990a8d0dbaefc996244097397259e92758b15cce (diff)
1 files changed, 401 insertions, 0 deletions
diff --git a/Whisper/CPU/TensorCpu.cpp b/Whisper/CPU/TensorCpu.cpp
new file mode 100644
index 0000000..dc34464
--- /dev/null
+++ b/Whisper/CPU/TensorCpu.cpp
@@ -0,0 +1,401 @@
+#include <stdafx.h>
+#include <atomic>
+#include "Tensor.h"
+using namespace CpuCompute;
+
+#if TENSOR_INTERNAL_ALLOC
+namespace
+{
+	// This structure is immediately before the payload of every tensor which has an internally-allocated memory buffer
+	class alignas( 32 ) sTensorMemoryHeader
+	{
+		std::atomic_ptrdiff_t refCounter;
+	public:
+		// Reset the counter to the specified value
+		void reset( ptrdiff_t rc )
+		{
+			refCounter = rc;
+		}
+		// Increment the ref.counter
+		void increment()
+		{
+			refCounter++;
+		}
+		// Decrement the ref.counter, and return true if it reached zero as the result
+		bool decrement()
+		{
+			ptrdiff_t val = --refCounter;
+			assert( val >= 0 );
+			return 0 == val;
+		}
+	};
+
+	inline sTensorMemoryHeader* getMemBlockHeader( void* pv )
+	{
+		assert( nullptr != pv );
+		uint8_t* pb = (uint8_t*)pv;
+		static_assert( sizeof( sTensorMemoryHeader ) == 32 );
+		return (sTensorMemoryHeader*)( pb - sizeof( sTensorMemoryHeader ) );
+	}
+
+	inline void releaseBlock( sTensorMemoryHeader* pointer )
+	{
+		assert( nullptr != pointer );
+		_aligned_free( pointer );
+	}
+
+	inline void* allocateBlock( size_t cb, ptrdiff_t initialRefCounter = 1 )
+	{
+		cb += sizeof( sTensorMemoryHeader );
+		void* pv = _aligned_malloc( cb, 32 );
+		if( nullptr == pv )
+			return nullptr;
+
+		sTensorMemoryHeader* header = (sTensorMemoryHeader*)pv;
+		header->reset( initialRefCounter );
+		return ( (uint8_t*)pv ) + sizeof( sTensorMemoryHeader );
+	}
+}
+
+void Tensor::deallocate()
+{
+	if( ownsMemory && nullptr != m_data )
+	{
+		sTensorMemoryHeader* const header = getMemBlockHeader( m_data );
+		if( header->decrement() )
+		{
+			// This tensor is the last one which had a reference to that block of memory
+			// Release the memory back to the heap
+			releaseBlock( header );
+		}
+	}
+	ownsMemory = false;
+
+	TensorShape::setZero();
+	m_data = nullptr;
+	m_type = (eDataType)0xFF;
+}
+#endif
+
+Tensor::Tensor( const Tensor& that )
+{
+	store( ne, that.sizeVec() );
+	store( nb, that.stridesVec() );
+	m_data = that.m_data;
+	m_type = that.m_type;
+#if TENSOR_INTERNAL_ALLOC
+	if( that.ownsMemory && nullptr != m_data )
+	{
+		getMemBlockHeader( m_data )->increment();
+		ownsMemory = true;
+	}
+	else
+		ownsMemory = false;
+#endif
+}
+
+Tensor::Tensor( Tensor&& that ) noexcept
+{
+	store( ne, that.sizeVec() );
+	store( nb, that.stridesVec() );
+	m_data = that.m_data;
+	m_type = that.m_type;
+#if TENSOR_INTERNAL_ALLOC
+	ownsMemory = that.ownsMemory;
+	that.ownsMemory = false;
+#endif
+	that.m_data = nullptr;
+}
+
+void Tensor::operator=( const Tensor& that )
+{
+	assert( this != &that );
+#if TENSOR_INTERNAL_ALLOC
+	deallocate();
+#endif
+
+	store( ne, that.sizeVec() );
+	store( nb, that.stridesVec() );
+	m_data = that.m_data;
+	m_type = that.m_type;
+#if TENSOR_INTERNAL_ALLOC
+	if( that.ownsMemory && nullptr != m_data )
+	{
+		getMemBlockHeader( m_data )->increment();
+		ownsMemory = true;
+	}
+	else
+		ownsMemory = false;
+#endif
+}
+
+void Tensor::operator=( Tensor&& that ) noexcept
+{
+	assert( this != &that );
+#if TENSOR_INTERNAL_ALLOC
+	deallocate();
+#endif
+	store( ne, that.sizeVec() );
+	store( nb, that.stridesVec() );
+	m_data = that.m_data;
+	m_type = that.m_type;
+	that.m_data = nullptr;
+#if TENSOR_INTERNAL_ALLOC
+	ownsMemory = that.ownsMemory;
+	that.ownsMemory = false;
+#endif
+}
+
+HRESULT Tensor::create( eDataType type, const std::array<uint32_t, 4>& sizeElements, iMemoryAllocator* alloc )
+{
+	const size_t len = (size_t)sizeElements[ 0 ] * sizeElements[ 1 ] * sizeElements[ 2 ] * sizeElements[ 3 ];
+	const size_t cbElement = DirectCompute::elementSize( type );
+	const size_t cb = len * cbElement;
+
+#if TENSOR_INTERNAL_ALLOC
+	deallocate();
+#endif
+
+	store( ne, load( sizeElements ) );
+	TensorShape::setDenseStrides();
+	this->m_type = type;
+
+	if( nullptr != alloc )
+	{
+#if TENSOR_INTERNAL_ALLOC
+		ownsMemory = false;
+#endif
+		m_data = alloc->allocate( cb, 32 );
+		if( nullptr == m_data )
+			return E_OUTOFMEMORY;
+		return S_OK;
+	}
+	else
+	{
+#if TENSOR_INTERNAL_ALLOC
+		m_data = allocateBlock( cb, 1 );
+		if( nullptr == m_data )
+			return E_OUTOFMEMORY;
+		ownsMemory = true;
+		return S_OK;
+#else
+		return E_POINTER;
+#endif
+	}
+}
+
+namespace
+{
+	static HRESULT arrayFromList( std::array<uint32_t, 4>& arr, std::initializer_list<uint32_t> list )
+	{
+		const size_t dims = list.size();
+		if( dims == 0 || dims > 4 )
+			return E_INVALIDARG;
+
+		for( size_t i = 0; i < dims; i++ )
+		{
+			uint32_t u = list.begin()[ i ];
+			if( u == 0 )
+				return E_INVALIDARG;
+			arr[ i ] = u;
+		}
+
+		for( size_t i = dims; i < 4; i++ )
+			arr[ i ] = 1;
+
+		return S_OK;
+	}
+}
+
+HRESULT Tensor::create( eDataType type, std::initializer_list<uint32_t> sizeElements, iMemoryAllocator* alloc )
+{
+	std::array<uint32_t, 4> arr;
+	CHECK( arrayFromList( arr, sizeElements ) );
+
+	return create( type, arr, alloc );
+}
+
+Tensor::Tensor( void* pointer, eDataType type, std::initializer_list<uint32_t> size )
+{
+	if( nullptr == pointer )
+		throw E_POINTER;
+	check( arrayFromList( ne, size ) );
+	TensorShape::setDenseStrides();
+	m_data = pointer;
+	m_type = type;
+#if TENSOR_INTERNAL_ALLOC
+	ownsMemory = false;
+#endif
+}
+
+Tensor::Tensor( void* pointer, eDataType type, uint32_t length ) noexcept
+{
+	// size = [ length, 1, 1, 1 ]
+	const __m128i one = _mm_set1_epi32( 1 );
+	__m128i v = _mm_insert_epi32( one, (int)length, 0 );
+	store( ne, v );
+	// stride = [ 1, length, length, length ]
+	v = _mm_shuffle_epi32( v, _MM_SHUFFLE( 0, 0, 0, 1 ) );
+	store( nb, v );
+
+	m_data = pointer;
+	m_type = type;
+#if TENSOR_INTERNAL_ALLOC
+	ownsMemory = false;
+#endif
+}
+
+Tensor Tensor::fromData( void* pointer, eDataType type, uint32_t length )
+{
+	HRESULT hr = E_UNEXPECTED;
+	if( nullptr != pointer )
+	{
+		if( 0 != length )
+			return Tensor{ pointer, type, length };
+		else
+			hr = E_INVALIDARG;
+	}
+	else
+		hr = E_POINTER;
+	throw hr;
+}
+
+HRESULT Tensor::attach( void* pointer, eDataType type, std::initializer_list<uint32_t> sizeElements )
+{
+	if( nullptr == pointer )
+		return E_POINTER;
+
+	std::array<uint32_t, 4> arr;
+	CHECK( arrayFromList( arr, sizeElements ) );
+
+#if TENSOR_INTERNAL_ALLOC
+	deallocate();
+#endif
+	store( ne, load( arr ) );
+	TensorShape::setDenseStrides();
+
+	m_data = pointer;
+	this->m_type = type;
+#if TENSOR_INTERNAL_ALLOC
+	ownsMemory = false;
+#endif
+	return S_OK;
+}
+
+Tensor Tensor::reshape3d( uint32_t ne0, uint32_t ne1, uint32_t ne2 ) const
+{
+	if( !isContinuous() )
+		throw E_NOTIMPL;
+	if( countElements() != ne0 * ne1 * ne2 )
+		throw E_INVALIDARG;
+
+	Tensor res = *this;
+	res.ne = { ne0, ne1, ne2, 1 };
+	res.setDenseStrides();
+	return res;
+}
+
+#if TENSOR_GGML_COMPAT
+static const __m128i s_maskAlignment16 = _mm_set1_epi64x( 1 );
+static const __m128i s_maskAlignment32 = _mm_set1_epi64x( 3 );
+
+bool isAlignedProperly( __m128i r0, __m128i r1, __m128i mask )
+{
+	__m128i test = _mm_or_si128( r0, r1 );
+	return (bool)_mm_testz_si128( test, mask );
+}
+
+Tensor::Tensor( const ggml_tensor* ggml )
+{
+	store( ne, load16( ggml->ne ) );
+
+	__m128i r0 = load16( (const int*)&ggml->nb[ 0 ] );
+	__m128i r1 = load16( (const int*)&ggml->nb[ 2 ] );
+	// Divide from bytes into elements by right-shifting the 64-bit integers in these vectors
+	switch( ggml->type )
+	{
+	case GGML_TYPE_F16:
+		assert( isAlignedProperly( r0, r1, s_maskAlignment16 ) );
+		r0 = _mm_srli_epi64( r0, 1 );
+		r1 = _mm_srli_epi64( r1, 1 );
+		m_type = eDataType::FP16;
+		break;
+
+	case GGML_TYPE_F32:
+		assert( isAlignedProperly( r0, r1, s_maskAlignment32 ) );
+		r0 = _mm_srli_epi64( r0, 2 );
+		r1 = _mm_srli_epi64( r1, 2 );
+		m_type = eDataType::FP32;
+		break;
+
+	case GGML_TYPE_I32:
+		assert( isAlignedProperly( r0, r1, s_maskAlignment32 ) );
+		r0 = _mm_srli_epi64( r0, 2 );
+		r1 = _mm_srli_epi64( r1, 2 );
+		m_type = eDataType::U32;
+		break;
+
+	default:
+		throw E_INVALIDARG;
+	}
+	// downcast uint64_t into uint32_t in a single vector
+	r0 = _mm_shuffle_epi32( r0, _MM_SHUFFLE( 3, 3, 2, 0 ) );
+	r1 = _mm_shuffle_epi32( r1, _MM_SHUFFLE( 2, 0, 3, 3 ) );
+	store( nb, _mm_blend_epi16( r0, r1, 0b11110000 ) );
+
+	m_data = ggml->data;
+}
+
+ggml_tensor Tensor::ggml() const
+{
+	ggml_tensor res;
+	memset( &res, 0, sizeof( ggml_tensor ) );
+
+	const __m128i size = sizeVec();
+	store16( res.ne, size );
+
+	const __m128i one = _mm_set1_epi32( 1 );
+	const uint32_t maskOnes = (uint32_t)_mm_movemask_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( size, one ) ) );
+	const uint32_t maskNotOnes = maskOnes ^ 0b1111;
+	unsigned long idx;
+	if( _BitScanReverse( &idx, maskNotOnes ) )
+		res.n_dims = (int)idx + 1;
+	else
+		res.n_dims = 0;
+
+	const __m128i strides = stridesVec();
+	// Upcast strides from u32 to u64
+	const __m128i zero = _mm_setzero_si128();
+	__m128i r0 = _mm_unpacklo_epi32( strides, zero );
+	__m128i r1 = _mm_unpackhi_epi32( strides, zero );
+	// Scale from elements into bytes with left shift vector instructions
+	switch( m_type )
+	{
+	case eDataType::FP16:
+		r0 = _mm_slli_epi64( r0, 1 );
+		r1 = _mm_slli_epi64( r1, 1 );
+		res.type = GGML_TYPE_F16;
+		break;
+	case eDataType::FP32:
+		r0 = _mm_slli_epi64( r0, 2 );
+		r1 = _mm_slli_epi64( r1, 2 );
+		res.type = GGML_TYPE_F32;
+		break;
+	case eDataType::U32:
+		r0 = _mm_slli_epi64( r0, 2 );
+		r1 = _mm_slli_epi64( r1, 2 );
+		res.type = GGML_TYPE_I32;
+		break;
+	default:
+		throw OLE_E_BLANK;
+	}
+
+	store16( &res.nb[ 0 ], r0 );
+	store16( &res.nb[ 2 ], r1 );
+
+	res.data = m_data;
+	return res;
+}
+
+GgmlTensorView::GgmlTensorView( const Tensor& t ) : tensor( t.ggml() ) {}
+#endif
+\ No newline at end of file
author	Konstantin <const@const.me>	2023-01-16 14:52:43 +0100
committer	Konstantin <const@const.me>	2023-01-16 14:52:43 +0100
commit	8c4603c73675958efc960fbd4bb599a2909d106a (patch)
tree	714dc6fc9a1672d5fd7f89676b97e10959662abc /Whisper/CPU/TensorCpu.cpp
parent	990a8d0dbaefc996244097397259e92758b15cce (diff)