// Whisper/CPU/DecoderTensors.h

#pragma once
#include <vector>
#include "Tensor.h"
#include "LargeBuffer.h"
#if TENSOR_GGML_COMPAT
#include "../source/ggml.h"
#endif

namespace CpuCompute
{
	// A set of tensors for one decoder's layer.
	// The comment above each field is the name pattern of the model tensor(s) it corresponds to;
	// the `*` presumably stands for the layer index - TODO confirm against the loader.
	// Every field declared as TensorPair has both a `_w` and a `_b` pointer in the GGML compatibility
	// section at the bottom of the structure, while attnKey, the only plain Tensor, has just a `_w` one.
	struct LayerDecoder
	{
		// decoder.blocks.*.attn_ln
		TensorPair attnLn0;
		// decoder.blocks.*.attn.out
		TensorPair attnLn1;
		// decoder.blocks.*.attn.query
		TensorPair attnQuery;
		// decoder.blocks.*.attn.key
		Tensor attnKey;
		// decoder.blocks.*.attn.value
		TensorPair attnValue;
		// decoder.blocks.*.cross_attn_ln
		TensorPair crossAttnLn0;
		// decoder.blocks.*.cross_attn.out
		TensorPair crossAttnLn1;
		// decoder.blocks.*.cross_attn.query
		TensorPair crossAttnQuery;

		// The two cross-attention fields below are intentionally commented out,
		// this structure does not store them. The reason is not visible in this header.
		// decoder.blocks.*.cross_attn.key
		// Tensor crossAttnKey;
		// decoder.blocks.*.cross_attn.value
		// TensorPair crossAttnValue;

		// decoder.blocks.*.mlp_ln
		TensorPair mlpLn;
		// decoder.blocks.*.mlp.0
		TensorPair mlp0;
		// decoder.blocks.*.mlp.2
		TensorPair mlp1;

#if TENSOR_GGML_COMPAT
		// Raw ggml_tensor pointers mirroring the fields above, only compiled when TENSOR_GGML_COMPAT is non-zero.
		// The code which assigns them is not in this header.
		// They default to nullptr so a freshly constructed layer never holds indeterminate pointers.

		// decoder.blocks.*.attn_ln
		ggml_tensor* attn_ln_0_w = nullptr;
		ggml_tensor* attn_ln_0_b = nullptr;

		// decoder.blocks.*.attn.out
		ggml_tensor* attn_ln_1_w = nullptr;
		ggml_tensor* attn_ln_1_b = nullptr;

		// decoder.blocks.*.attn.query
		ggml_tensor* attn_q_w = nullptr;
		ggml_tensor* attn_q_b = nullptr;

		// decoder.blocks.*.attn.key
		ggml_tensor* attn_k_w = nullptr;

		// decoder.blocks.*.attn.value
		ggml_tensor* attn_v_w = nullptr;
		ggml_tensor* attn_v_b = nullptr;

		// decoder.blocks.*.cross_attn_ln
		ggml_tensor* cross_attn_ln_0_w = nullptr;
		ggml_tensor* cross_attn_ln_0_b = nullptr;

		// decoder.blocks.*.cross_attn.out
		ggml_tensor* cross_attn_ln_1_w = nullptr;
		ggml_tensor* cross_attn_ln_1_b = nullptr;

		// decoder.blocks.*.cross_attn.query
		ggml_tensor* cross_attn_q_w = nullptr;
		ggml_tensor* cross_attn_q_b = nullptr;

		// decoder.blocks.*.mlp_ln
		ggml_tensor* mlp_ln_w = nullptr;
		ggml_tensor* mlp_ln_b = nullptr;

		// decoder.blocks.*.mlp.0
		ggml_tensor* mlp_0_w = nullptr;
		ggml_tensor* mlp_0_b = nullptr;

		// decoder.blocks.*.mlp.2
		ggml_tensor* mlp_1_w = nullptr;
		ggml_tensor* mlp_1_b = nullptr;
#endif
	};

	// All tensors of the decoder: two embedding tensors, the `decoder.ln` pair, and a vector with per-layer tensors.
	// The structure also keeps the LargeBuffer passed to setMemoryBuffer() in a private field.
	struct DecoderTensors
	{
		// decoder.positional_embedding
		Tensor positionalEmbedding;

		// decoder.token_embedding
		Tensor tokenEmbedding;

		// decoder.ln
		TensorPair ln;
		// A vector of layers
		std::vector<LayerDecoder> layers;

		// Move the supplied buffer into the private `memory` field; the argument is left in a moved-from state.
		// When TENSOR_GGML_COMPAT is enabled, makeCompatTensors() is called right after that.
		// NOTE(review): this method is noexcept; if makeCompatTensors() is able to throw, e.g. because
		// it allocates, an exception would terminate the process. Its body is not visible here - confirm.
		void setMemoryBuffer( LargeBuffer&& mem ) noexcept
		{
			memory = std::move( mem );
#if TENSOR_GGML_COMPAT
			makeCompatTensors();
#endif
		}

#if TENSOR_GGML_COMPAT
		// Defined elsewhere. Presumably fills the ggml_tensor pointers of this structure and of the layers - TODO confirm
		void makeCompatTensors();

		// The pointers below default to nullptr, so they have a well-defined value before anything assigns them.

		// decoder.positional_embedding
		ggml_tensor* d_pe = nullptr; // DD

		// decoder.token_embedding
		ggml_tensor* d_te = nullptr; // DD

		// decoder.ln
		ggml_tensor* d_ln_w = nullptr; // DD
		ggml_tensor* d_ln_b = nullptr; // DD
#endif

	private:
		// A smart pointer which owns the memory for all the above tensors
		LargeBuffer memory;
#if TENSOR_GGML_COMPAT
		// Storage for ggml_tensor structures, by value.
		// Presumably the compatibility pointers point to elements of this vector - TODO confirm in makeCompatTensors()
		std::vector<ggml_tensor> ggml;
#endif
	};
}