1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
#pragma once
#include <stdint.h>
#include <assert.h>
namespace Whisper
{
// Identifies an implementation of the model.
// The underlying type is uint32_t, and the named values start at 1: zero has no named value.
enum struct eModelImplementation : uint32_t
{
// GPGPU implementation based on Direct3D 11.0 compute shaders
GPU = 1,
// A hybrid implementation which uses DirectCompute for encode, and decodes on CPU.
// Not implemented in the published builds of the DLL. To enable, change BUILD_HYBRID_VERSION macro to 1
Hybrid = 2,
// A reference implementation which uses the original GGML CPU-running code.
// Not implemented in the published builds of the DLL. To enable, change BUILD_BOTH_VERSIONS macro to 1
Reference = 3,
};
// A time span broken down into days, hours, minutes, seconds, and the leftover sub-second ticks
struct sTimeSpanFields
{
	// Count of complete days
	uint32_t days;
	// Hours in [ 0 .. 23 ], minutes in [ 0 .. 59 ], seconds in [ 0 .. 59 ]
	uint8_t hours, minutes, seconds;
	// Remainder below one second, in [ 0 .. 9'999'999 ]; one second is 10'000'000 of these units
	uint32_t ticks;

	// Decompose the total count of ticks into the fields.
	// The constructor is implicit, so a uint64_t converts to this structure without a cast.
	sTimeSpanFields( uint64_t tt )
	{
		constexpr uint64_t ticksPerSecond = 10'000'000;

		// Running totals of the progressively larger units
		const uint64_t wholeSeconds = tt / ticksPerSecond;
		const uint64_t wholeMinutes = wholeSeconds / 60;
		const uint64_t wholeHours = wholeMinutes / 60;
		const uint64_t wholeDays = wholeHours / 24;

		// Each field is the remainder of its total within the next larger unit
		ticks = (uint32_t)( tt % ticksPerSecond );
		seconds = (uint8_t)( wholeSeconds % 60 );
		minutes = (uint8_t)( wholeMinutes % 60 );
		hours = (uint8_t)( wholeHours % 24 );
		// Even for the maximum uint64_t input this is about 21 million days, which fits in 32 bits
		days = (uint32_t)wholeDays;
	}
};
// C++ equivalent of System.TimeSpan C# structure
struct sTimeSpan
{
	// The value is expressed in 100-nanosecond ticks: compatible with System.TimeSpan, FILETIME, and many other things
	uint64_t ticks;

	// Implicit conversion which splits the value into days, hours, minutes, seconds, and remaining ticks
	operator sTimeSpanFields() const
	{
		return sTimeSpanFields{ ticks };
	}

	// Assign from an unsigned count of ticks.
	// Returns a reference to this object, so the assignment can be chained or used as an expression.
	sTimeSpan& operator=( uint64_t tt )
	{
		ticks = tt;
		return *this;
	}

	// Assign from a signed count of ticks.
	// Negative input is a programming error caught by the assert; the check is compiled out when NDEBUG is defined,
	// in which case a negative value wraps around to a huge unsigned one.
	// Returns a reference to this object, so the assignment can be chained or used as an expression.
	sTimeSpan& operator=( int64_t tt )
	{
		assert( tt >= 0 );
		ticks = (uint64_t)tt;
		return *this;
	}
};
// Start and end times of the segment or token, expressed in 100-nanosecond ticks
struct sTimeInterval
{
// `begin` is the start time of the interval, `end` is the end time
sTimeSpan begin, end;
};
// Segment data: the text, the time interval, and the slice of tokens which belong to the segment
struct sSegment
{
// Segment text, null-terminated, and probably UTF-8 encoded
// NOTE(review): ownership and lifetime of the string are not specified here; presumably owned by the result object, confirm
const char* text;
// Start and end times of the segment
sTimeInterval time;
// These two integers define the slice of the tokens in this segment, in the array returned by iTranscribeResult.getTokens method:
// `firstToken` is the index of the first token in that array, `countTokens` is the count of tokens in the slice
uint32_t firstToken, countTokens;
};
// Bit flags attached to a token, stored in sToken.flags field.
// This is an unscoped enum: the enumerators are visible directly in the enclosing namespace.
enum eTokenFlags : uint32_t
{
	// No flags are set
	None = 0,
	// NOTE(review): presumably marks special tokens of the model, as opposed to text; confirm
	Special = 1,
};

// True when the two values have at least one set bit in common.
// The result is bool and not eTokenFlags: this operator is a test, it does not produce a mask.
inline bool operator &( eTokenFlags a, eTokenFlags b )
{
	const uint32_t sharedBits = static_cast<uint32_t>( a ) & static_cast<uint32_t>( b );
	return sharedBits != 0;
}
// Token data: the text, the time interval, a few numbers produced by the model, and the flags
struct sToken
{
// Token text, null-terminated, and usually UTF-8 encoded.
// I think for Chinese language the model sometimes outputs invalid UTF-8 strings here, Unicode code points can be split between adjacent tokens in the same segment
// More info: https://github.com/ggerganov/whisper.cpp/issues/399
// NOTE(review): ownership and lifetime of the string are not specified here; presumably owned by the result object, confirm
const char* text;
// Start and end times of the token
sTimeInterval time;
// Probability of the token
float probability;
// Probability of the timestamp token
float probabilityTimestamp;
// Sum of probabilities of all timestamp tokens
float ptsum;
// Voice length of the token
// NOTE(review): the unit of this value is not visible here, confirm before relying on it
float vlen;
// Token id
// NOTE(review): presumably an index into the vocabulary of the model, confirm
int id;
// Bit flags of the token, see eTokenFlags
eTokenFlags flags;
};
// A pair of counters: count of segments, and count of tokens
// NOTE(review): presumably describes the size of the arrays in a transcribe result, confirm against iTranscribeResult
struct sTranscribeLength
{
uint32_t countSegments, countTokens;
};
// Bit flags which control what's included in the results, and how the results object is created.
// The values are combined with operator | and tested with operator &, both are defined below the enum.
enum struct eResultFlags : uint32_t
{
	// No flags are set
	None = 0,

	// Return individual tokens in addition to the segments
	Tokens = 1,

	// Return timestamps
	Timestamps = 2,

	// Create a new COM object for the results.
	// Without this flag, the context returns a pointer to the COM object stored in the context.
	// The content of that object is replaced every time you call iContext.getResults method
	NewObject = 0x100,
};

// Combine two sets of flags: the result has every bit which is set in either argument
inline eResultFlags operator |( eResultFlags a, eResultFlags b )
{
	const uint32_t merged = static_cast<uint32_t>( a ) | static_cast<uint32_t>( b );
	return static_cast<eResultFlags>( merged );
}

// True when the two values have at least one set bit in common.
// The result is bool and not eResultFlags: this operator is a test, it does not produce a mask.
inline bool operator &( eResultFlags a, eResultFlags b )
{
	const uint32_t sharedBits = static_cast<uint32_t>( a ) & static_cast<uint32_t>( b );
	return sharedBits != 0;
}
// Output value for iContext.detectSpeaker method
// The underlying type is uint8_t, i.e. a value of this type takes a single byte
enum struct eSpeakerChannel : uint8_t
{
// NOTE(review): presumably returned when the method could not decide between the two channels, confirm
Unsure = 0,
// NOTE(review): presumably the speaker was detected in the left channel of the stereo audio, confirm
Left = 1,
// NOTE(review): presumably the speaker was detected in the right channel of the stereo audio, confirm
Right = 2,
// NOTE(review): presumably returned when the audio has no stereo data to compare, e.g. a mono input; confirm
NoStereoData = 0xFF,
};
}
|