// Whisper/CPU/TensorCpu.cpp

#include <stdafx.h>
#include <atomic>
#include "Tensor.h"
using namespace CpuCompute;

#if TENSOR_INTERNAL_ALLOC
namespace
{
	// Reference-counting header of an internally-allocated memory block.
	// This structure is immediately before the payload of every tensor which has an internally-allocated memory buffer.
	// The structure is exactly 32 bytes ( see the static_assert in getMemBlockHeader ), and the blocks are allocated
	// with 32-byte alignment, so the payload which follows the header starts on a 32-byte boundary as well.
	class alignas( 32 ) sTensorMemoryHeader
	{
		// Count of the tensors which currently hold a reference to the block
		std::atomic_ptrdiff_t refCounter;
	public:
		// Reset the counter to the specified value
		void reset( ptrdiff_t rc )
		{
			refCounter = rc;
		}
		// Increment the ref.counter
		void increment()
		{
			refCounter++;
		}
		// Decrement the ref.counter, and return true if it reached zero as the result
		bool decrement()
		{
			const ptrdiff_t val = --refCounter;
			// A negative value means the block was released more times than it was referenced
			assert( val >= 0 );
			return 0 == val;
		}
	};

	// Translate the payload pointer of an internally-allocated block into the pointer to the header of that block.
	// The pointer must have been returned by allocateBlock(); the header is located right before the payload.
	inline sTensorMemoryHeader* getMemBlockHeader( void* pv )
	{
		assert( nullptr != pv );
		uint8_t* pb = (uint8_t*)pv;
		static_assert( sizeof( sTensorMemoryHeader ) == 32 );
		return (sTensorMemoryHeader*)( pb - sizeof( sTensorMemoryHeader ) );
	}

	// Release the complete block, header and payload, back to the heap.
	// The argument is the header pointer, which is the address originally returned by _aligned_malloc
	inline void releaseBlock( sTensorMemoryHeader* pointer )
	{
		assert( nullptr != pointer );
		_aligned_free( pointer );
	}

	// Allocate a 32-byte aligned block which fits the header followed by `cb` bytes of payload,
	// set the ref.counter to `initialRefCounter`, and return the pointer to the payload.
	// Returns nullptr when the requested size is too large, or when the heap allocation failed.
	inline void* allocateBlock( size_t cb, ptrdiff_t initialRefCounter = 1 )
	{
		constexpr size_t cbHeader = sizeof( sTensorMemoryHeader );
		constexpr size_t cbLargest = (size_t)-1;
		// Adding the header size to a huge `cb` would wrap around size_t,
		// and the allocation would succeed with a block smaller than the requested payload
		if( cb > cbLargest - cbHeader )
			return nullptr;

		void* const pv = _aligned_malloc( cb + cbHeader, 32 );
		if( nullptr == pv )
			return nullptr;

		sTensorMemoryHeader* const header = (sTensorMemoryHeader*)pv;
		header->reset( initialRefCounter );
		return ( (uint8_t*)pv ) + cbHeader;
	}
}

void Tensor::deallocate()
{
	if( ownsMemory && nullptr != m_data )
	{
		sTensorMemoryHeader* const header = getMemBlockHeader( m_data );
		if( header->decrement() )
		{
			// This tensor is the last one which had a reference to that block of memory
			// Release the memory back to the heap
			releaseBlock( header );
		}
	}
	ownsMemory = false;

	TensorShape::setZero();
	m_data = nullptr;
	m_type = (eDataType)0xFF;
}
#endif

// Copy constructor: the new tensor refers to the same data as the source, nothing is copied.
// When the source owns an internally-allocated block, the ref.counter of that block is incremented.
Tensor::Tensor( const Tensor& that )
{
	m_type = that.m_type;
	m_data = that.m_data;
	store( nb, that.stridesVec() );
	store( ne, that.sizeVec() );
#if TENSOR_INTERNAL_ALLOC
	// Only share the ownership when the source actually owns a block
	const bool sharesBlock = that.ownsMemory && ( nullptr != m_data );
	if( sharesBlock )
		getMemBlockHeader( m_data )->increment();
	ownsMemory = sharesBlock;
#endif
}

// Move constructor: take over the shape, type, data pointer and the ownership flag of the source.
// The ref.counter of the block is not touched, the reference simply changes hands.
Tensor::Tensor( Tensor&& that ) noexcept
{
	m_type = that.m_type;
	m_data = that.m_data;
	store( nb, that.stridesVec() );
	store( ne, that.sizeVec() );

	// The source no longer points to the data, and no longer owns it
	that.m_data = nullptr;
#if TENSOR_INTERNAL_ALLOC
	ownsMemory = that.ownsMemory;
	that.ownsMemory = false;
#endif
}

// Copy assignment: release whatever this tensor referenced before, then refer to the same data as `that`.
// When `that` owns an internally-allocated block, the ref.counter of that block is incremented and both tensors share it;
// otherwise the pointer is copied as is, and this tensor does not own the memory.
void Tensor::operator=( const Tensor& that )
{
	// Self-assignment is treated as a bug of the calling code, and reported in debug builds
	assert( this != &that );
	// The assert is compiled out of release builds. Without this check, deallocate() below
	// would reset the tensor before its own fields are read back, silently losing the data
	if( this == &that )
		return;

#if TENSOR_INTERNAL_ALLOC
	deallocate();
#endif

	store( ne, that.sizeVec() );
	store( nb, that.stridesVec() );
	m_data = that.m_data;
	m_type = that.m_type;
#if TENSOR_INTERNAL_ALLOC
	if( that.ownsMemory && nullptr != m_data )
	{
		// Another owner of the same block
		getMemBlockHeader( m_data )->increment();
		ownsMemory = true;
	}
	else
		ownsMemory = false;
#endif
}

// Move assignment: release whatever this tensor referenced before, then take over the shape, type,
// data pointer and the ownership flag of `that`. The source is left with a null data pointer and without ownership.
void Tensor::operator=( Tensor&& that ) noexcept
{
	// Self-assignment is treated as a bug of the calling code, and reported in debug builds
	assert( this != &that );
	// The assert is compiled out of release builds. Without this check, deallocate() below
	// would drop the only reference to the data before the fields are moved
	if( this == &that )
		return;

#if TENSOR_INTERNAL_ALLOC
	deallocate();
#endif
	store( ne, that.sizeVec() );
	store( nb, that.stridesVec() );
	m_data = that.m_data;
	m_type = that.m_type;
	that.m_data = nullptr;
#if TENSOR_INTERNAL_ALLOC
	// The reference changes hands, the ref.counter of the block stays the same
	ownsMemory = that.ownsMemory;
	that.ownsMemory = false;
#endif
}

// Create a dense tensor of the specified type and size.
// When `alloc` is not null the memory comes from that allocator, and the tensor does not own it.
// When `alloc` is null the memory is allocated internally with a ref.counter; that only works in builds with TENSOR_INTERNAL_ALLOC.
// Returns S_OK on success;
// E_OUTOFMEMORY when the size in bytes does not fit in size_t, or when the allocation failed;
// E_POINTER when `alloc` is null in a build without TENSOR_INTERNAL_ALLOC.
// When the size does not fit in size_t the tensor is left untouched. On the other failures
// the shape and type are already replaced with the requested ones.
HRESULT Tensor::create( eDataType type, const std::array<uint32_t, 4>& sizeElements, iMemoryAllocator* alloc )
{
	// Size of the payload in bytes = bytes per element * product of the 4 dimensions
	size_t cb = DirectCompute::elementSize( type );
	// A zero dimension makes the payload empty, no matter how large the other dimensions are
	for( const uint32_t n : sizeElements )
	{
		if( 0 == n )
		{
			cb = 0;
			break;
		}
	}
	if( 0 != cb )
	{
		// The product of four 32-bit numbers can exceed the range of size_t.
		// A wrapped value would result in a buffer smaller than what the shape of the tensor describes
		constexpr size_t cbLargest = (size_t)-1;
		for( const uint32_t n : sizeElements )
		{
			if( cb > cbLargest / n )
				return E_OUTOFMEMORY;
			cb *= n;
		}
	}

#if TENSOR_INTERNAL_ALLOC
	deallocate();
#endif

	store( ne, load( sizeElements ) );
	TensorShape::setDenseStrides();
	this->m_type = type;

	if( nullptr != alloc )
	{
#if TENSOR_INTERNAL_ALLOC
		// Memory from an external allocator is never released by the tensor
		ownsMemory = false;
#endif
		m_data = alloc->allocate( cb, 32 );
		if( nullptr == m_data )
			return E_OUTOFMEMORY;
		return S_OK;
	}
	else
	{
#if TENSOR_INTERNAL_ALLOC
		// This tensor holds the first and only reference to the new block
		m_data = allocateBlock( cb, 1 );
		if( nullptr == m_data )
			return E_OUTOFMEMORY;
		ownsMemory = true;
		return S_OK;
#else
		return E_POINTER;
#endif
	}
}

namespace
{
	// Expand a list of 1 to 4 non-zero dimensions into an array of 4 numbers, the missing trailing dimensions are set to 1.
	// Returns E_INVALIDARG when the list is empty, longer than 4 elements, or contains a zero.
	static HRESULT arrayFromList( std::array<uint32_t, 4>& arr, std::initializer_list<uint32_t> list )
	{
		const size_t count = list.size();
		if( count < 1 || count > 4 )
			return E_INVALIDARG;

		// Copy the supplied dimensions, rejecting zeros
		size_t idx = 0;
		for( const uint32_t value : list )
		{
			if( 0 == value )
				return E_INVALIDARG;
			arr[ idx ] = value;
			idx++;
		}

		// Pad the rest with ones
		while( idx < 4 )
		{
			arr[ idx ] = 1;
			idx++;
		}
		return S_OK;
	}
}

// Same as the overload which takes an array of 4 sizes, except the size is a list of 1 to 4 non-zero dimensions.
// The list is validated and padded by arrayFromList() before the tensor is touched.
HRESULT Tensor::create( eDataType type, std::initializer_list<uint32_t> sizeElements, iMemoryAllocator* alloc )
{
	std::array<uint32_t, 4> dims;
	CHECK( arrayFromList( dims, sizeElements ) );
	return create( type, dims, alloc );
}

// Wrap external memory into a dense tensor of the specified type and size; the tensor does not own that memory.
// Throws E_POINTER when the pointer is null.
Tensor::Tensor( void* pointer, eDataType type, std::initializer_list<uint32_t> size )
{
	if( pointer == nullptr )
		throw E_POINTER;

	m_type = type;
	m_data = pointer;
#if TENSOR_INTERNAL_ALLOC
	ownsMemory = false;
#endif

	// arrayFromList() validates the list and writes the 4 sizes directly into `ne`
	// NOTE(review): check() is defined elsewhere, presumably it throws when the HRESULT is a failure - confirm
	check( arrayFromList( ne, size ) );
	TensorShape::setDenseStrides();
}

// Wrap external memory into a 1D tensor of the specified type and length; the tensor does not own that memory.
// The arguments are not validated here.
Tensor::Tensor( void* pointer, eDataType type, uint32_t length ) noexcept
{
	const int len = (int)length;
	// size = [ length, 1, 1, 1 ]
	store( ne, _mm_setr_epi32( len, 1, 1, 1 ) );
	// stride = [ 1, length, length, length ]
	store( nb, _mm_setr_epi32( 1, len, len, len ) );

	m_type = type;
	m_data = pointer;
#if TENSOR_INTERNAL_ALLOC
	ownsMemory = false;
#endif
}

// Validate the arguments, and wrap external memory into a 1D tensor which does not own that memory.
// Throws E_POINTER when the pointer is null, E_INVALIDARG when the pointer is fine but the length is zero.
Tensor Tensor::fromData( void* pointer, eDataType type, uint32_t length )
{
	if( nullptr != pointer && 0 != length )
		return Tensor{ pointer, type, length };

	// The null pointer is reported first, the zero length is only reported for a valid pointer
	const HRESULT hr = ( nullptr == pointer ) ? E_POINTER : E_INVALIDARG;
	throw hr;
}

// Make this tensor a dense view of external memory of the specified type and size; the tensor does not own that memory.
// Returns E_POINTER for a null pointer, and the error code from arrayFromList() for an invalid size.
// The tensor is not modified when the arguments are rejected.
HRESULT Tensor::attach( void* pointer, eDataType type, std::initializer_list<uint32_t> sizeElements )
{
	if( pointer == nullptr )
		return E_POINTER;

	std::array<uint32_t, 4> dims;
	CHECK( arrayFromList( dims, sizeElements ) );

#if TENSOR_INTERNAL_ALLOC
	// Drop the reference to the previous data, if any; the attached memory is never released by the tensor
	deallocate();
	ownsMemory = false;
#endif
	m_type = type;
	m_data = pointer;

	store( ne, load( dims ) );
	TensorShape::setDenseStrides();
	return S_OK;
}

// Make another tensor which refers to the same data, with the size [ ne0, ne1, ne2, 1 ] and dense strides.
// Throws E_NOTIMPL when this tensor is not continuous, E_INVALIDARG when the count of elements is different.
Tensor Tensor::reshape3d( uint32_t ne0, uint32_t ne1, uint32_t ne2 ) const
{
	if( !isContinuous() )
		throw E_NOTIMPL;

	// NOTE(review): the product is computed in 32-bit arithmetic and wraps around for very large sizes;
	// whether that matters depends on the return type of countElements() - confirm
	const uint32_t requested = ne0 * ne1 * ne2;
	if( countElements() != requested )
		throw E_INVALIDARG;

	// Copying the tensor only makes another reference to the data
	Tensor res( *this );
	res.ne[ 0 ] = ne0;
	res.ne[ 1 ] = ne1;
	res.ne[ 2 ] = ne2;
	res.ne[ 3 ] = 1;
	res.setDenseStrides();
	return res;
}

#if TENSOR_GGML_COMPAT
// Masks for isAlignedProperly(), the same value in both 64-bit lanes.
// The lowest bit must be clear for a multiple of 2, the 2 lowest bits must be clear for a multiple of 4
static const __m128i s_maskAlignment16 = _mm_set1_epi64x( 1 );
static const __m128i s_maskAlignment32 = _mm_set1_epi64x( 3 );

// True when none of the bits selected by the mask is set in either of the two vectors
bool isAlignedProperly( __m128i r0, __m128i r1, __m128i mask )
{
	// A bit set in any of the inputs survives the bitwise OR, a single test covers both vectors
	const __m128i combined = _mm_or_si128( r0, r1 );
	return 0 != _mm_testz_si128( combined, mask );
}

// Make a tensor which refers to the data of a GGML tensor; the new tensor does not own that memory.
// The size is copied as is, the strides are converted from bytes into elements.
// Only F16, F32 and I32 tensors are supported, the constructor throws E_INVALIDARG for any other type.
Tensor::Tensor( const ggml_tensor* ggml )
{
	store( ne, load16( ggml->ne ) );

	// Each vector holds two of the four strides: nb[ 0 ], nb[ 1 ] in r0, and nb[ 2 ], nb[ 3 ] in r1
	__m128i r0 = load16( (const int*)&ggml->nb[ 0 ] );
	__m128i r1 = load16( (const int*)&ggml->nb[ 2 ] );
	// Divide from bytes into elements by right-shifting the 64-bit integers in these vectors
	switch( ggml->type )
	{
	case GGML_TYPE_F16:
		// 2 bytes per element
		assert( isAlignedProperly( r0, r1, s_maskAlignment16 ) );
		r0 = _mm_srli_epi64( r0, 1 );
		r1 = _mm_srli_epi64( r1, 1 );
		m_type = eDataType::FP16;
		break;

	case GGML_TYPE_F32:
		// 4 bytes per element
		assert( isAlignedProperly( r0, r1, s_maskAlignment32 ) );
		r0 = _mm_srli_epi64( r0, 2 );
		r1 = _mm_srli_epi64( r1, 2 );
		m_type = eDataType::FP32;
		break;

	case GGML_TYPE_I32:
		// 4 bytes per element
		assert( isAlignedProperly( r0, r1, s_maskAlignment32 ) );
		r0 = _mm_srli_epi64( r0, 2 );
		r1 = _mm_srli_epi64( r1, 2 );
		m_type = eDataType::U32;
		break;

	default:
		throw E_INVALIDARG;
	}
	// downcast uint64_t into uint32_t in a single vector
	// The shuffles move the low halves of the 64-bit strides into lanes [ 0, 1 ] of r0 and lanes [ 2, 3 ] of r1,
	// the blend takes the lower 2 lanes from r0 and the upper 2 lanes from r1
	r0 = _mm_shuffle_epi32( r0, _MM_SHUFFLE( 3, 3, 2, 0 ) );
	r1 = _mm_shuffle_epi32( r1, _MM_SHUFFLE( 2, 0, 3, 3 ) );
	store( nb, _mm_blend_epi16( r0, r1, 0b11110000 ) );

	m_data = ggml->data;
#if TENSOR_INTERNAL_ALLOC
	// The memory belongs to GGML. Every other constructor sets this flag explicitly;
	// without it deallocate() could treat the GGML memory as an internally-allocated block
	ownsMemory = false;
#endif
}

// Make a GGML tensor structure which refers to the data of this tensor.
// Only the size, count of dimensions, type, strides and data pointer are set, the rest of the structure is zeroed.
// Throws OLE_E_BLANK when the type of this tensor is not one of FP16, FP32 or U32.
ggml_tensor Tensor::ggml() const
{
	ggml_tensor res;
	memset( &res, 0, sizeof( res ) );
	res.data = m_data;

	// Size of the tensor
	const __m128i dims = sizeVec();
	store16( res.ne, dims );

	// Count of dimensions is the index of the last dimension with size different from 1, plus one;
	// zero when all 4 sizes are equal to 1
	const __m128i sizeIsOne = _mm_cmpeq_epi32( dims, _mm_set1_epi32( 1 ) );
	const uint32_t bitsOnes = (uint32_t)_mm_movemask_ps( _mm_castsi128_ps( sizeIsOne ) );
	const uint32_t bitsOthers = bitsOnes ^ 0b1111;
	unsigned long lastIndex;
	res.n_dims = _BitScanReverse( &lastIndex, bitsOthers ) ? (int)lastIndex + 1 : 0;

	// Upcast strides from u32 to u64 by interleaving them with zeros
	const __m128i strides32 = stridesVec();
	const __m128i zeros = _mm_setzero_si128();
	__m128i stridesLow = _mm_unpacklo_epi32( strides32, zeros );
	__m128i stridesHigh = _mm_unpackhi_epi32( strides32, zeros );

	// Scale from elements into bytes with left shift vector instructions
	switch( m_type )
	{
	case eDataType::FP16:
		res.type = GGML_TYPE_F16;
		stridesLow = _mm_slli_epi64( stridesLow, 1 );
		stridesHigh = _mm_slli_epi64( stridesHigh, 1 );
		break;
	case eDataType::FP32:
		res.type = GGML_TYPE_F32;
		stridesLow = _mm_slli_epi64( stridesLow, 2 );
		stridesHigh = _mm_slli_epi64( stridesHigh, 2 );
		break;
	case eDataType::U32:
		res.type = GGML_TYPE_I32;
		stridesLow = _mm_slli_epi64( stridesLow, 2 );
		stridesHigh = _mm_slli_epi64( stridesHigh, 2 );
		break;
	default:
		throw OLE_E_BLANK;
	}

	store16( &res.nb[ 0 ], stridesLow );
	store16( &res.nb[ 2 ], stridesHigh );
	return res;
}

// The view keeps the GGML structure produced by Tensor::ggml() for the source tensor
GgmlTensorView::GgmlTensorView( const Tensor& t ) :
	tensor( t.ggml() )
{
}
#endif