summary refs log tree commit diff stats
path: root/Whisper/API/TranscribeStructs.h
blob: 98ed4c0a98cfb900adf2d337ac6d71b0e53ea5e8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#pragma once
#include <stdint.h>
#include <assert.h>

namespace Whisper
{
	// Identifies one of the implementations of the model.
	// The underlying type is uint32_t, and every enumerator has an explicit value.
	enum struct eModelImplementation : uint32_t
	{
		// GPGPU implementation based on Direct3D 11.0 compute shaders
		GPU = 1,

		// A hybrid implementation which uses DirectCompute for encoding, and decodes on the CPU.
		// Not implemented in the published builds of the DLL. To enable, change the BUILD_HYBRID_VERSION macro to 1
		Hybrid = 2,

		// A reference implementation which uses the original GGML code running on the CPU.
		// Not implemented in the published builds of the DLL. To enable, change the BUILD_BOTH_VERSIONS macro to 1
		Reference = 3,
	};

	// Broken-down form of a time span: whole days, then hours, minutes and seconds,
	// then the 100-nanosecond ticks left over within the last second
	struct sTimeSpanFields
	{
		uint32_t days;
		uint8_t hours, minutes, seconds;
		uint32_t ticks;

		// Split a total tick count into the fields above; 10'000'000 ticks make one second
		sTimeSpanFields( uint64_t tt )
		{
			// Running totals, each one derived from the previous by an integer division
			const uint64_t totalSeconds = tt / 10'000'000;
			const uint64_t totalMinutes = totalSeconds / 60;
			const uint64_t totalHours = totalMinutes / 60;

			// Every field except days is the remainder of the corresponding total
			days = static_cast<uint32_t>( totalHours / 24 );
			hours = static_cast<uint8_t>( totalHours % 24 );
			minutes = static_cast<uint8_t>( totalMinutes % 60 );
			seconds = static_cast<uint8_t>( totalSeconds % 60 );
			ticks = static_cast<uint32_t>( tt % 10'000'000 );
		}
	};

	// C++ equivalent of the System.TimeSpan C# structure
	struct sTimeSpan
	{
		// The value is expressed in 100-nanosecond ticks: compatible with System.TimeSpan, FILETIME, and many other things
		uint64_t ticks;

		// Decompose the tick count into days, hours, minutes, seconds and the remaining ticks
		operator sTimeSpanFields() const
		{
			return sTimeSpanFields{ ticks };
		}

		// Store an unsigned tick count.
		// Returns a reference to this object, like the built-in assignment does, so the expression can be chained.
		sTimeSpan& operator=( uint64_t tt )
		{
			ticks = tt;
			return *this;
		}

		// Store a signed tick count, which must not be negative.
		// The check is an assert(), so it disappears when NDEBUG is defined; a negative value then wraps around to a huge unsigned one.
		// Returns a reference to this object, like the built-in assignment does.
		sTimeSpan& operator=( int64_t tt )
		{
			assert( tt >= 0 );
			ticks = (uint64_t)tt;
			return *this;
		}
	};

	// A time range of a segment or a token: where it starts and where it ends, both in 100-nanosecond ticks
	struct sTimeInterval
	{
		sTimeSpan begin;
		sTimeSpan end;
	};

	// Describes a single segment
	struct sSegment
	{
		// Null-terminated text of the segment; the encoding is probably UTF-8
		const char* text;
		// When the segment starts and ends
		sTimeInterval time;
		// Together with countTokens, defines the slice of the tokens in this segment,
		// in the array returned by iTranscribeResult.getTokens method: this is where the slice starts
		uint32_t firstToken;
		// Length of that slice
		uint32_t countTokens;
	};

	// Bit flags kept in the sToken.flags field.
	// This is an unscoped enum, so the enumerators are visible directly in the enclosing namespace.
	enum eTokenFlags : uint32_t
	{
		None = 0,
		Special = 1u << 0,
	};
	// Test whether the two values have at least one set bit in common.
	// The result is a bool, not a masked eTokenFlags value.
	inline bool operator &( eTokenFlags a, eTokenFlags b )
	{
		const uint32_t common = static_cast<uint32_t>( a ) & static_cast<uint32_t>( b );
		return common != 0;
	}

	// Token data
	struct sToken
	{
		// Token text, null-terminated, and usually UTF-8 encoded.
		// For the Chinese language the model apparently sometimes outputs invalid UTF-8 strings here: a Unicode code point can be split between adjacent tokens in the same segment.
		// More info: https://github.com/ggerganov/whisper.cpp/issues/399
		const char* text;
		// Start and end times of the token
		sTimeInterval time;
		// Probability of the token
		float probability;
		// Probability of the timestamp token
		float probabilityTimestamp;
		// Sum of probabilities of all timestamp tokens
		float ptsum;
		// Voice length of the token
		float vlen;
		// Token id
		int id;
		// Bit flags of the token, a combination of eTokenFlags values
		eTokenFlags flags;
	};

	// A pair of counters: the number of segments, and the number of tokens
	struct sTranscribeLength
	{
		uint32_t countSegments;
		uint32_t countTokens;
	};

	// Bit flags which select what is included in the results, and how the result object is delivered
	enum struct eResultFlags : uint32_t
	{
		None = 0,

		// Include the individual tokens, not only the segments
		Tokens = 1u << 0,

		// Include the timestamps
		Timestamps = 1u << 1,

		// Make a new COM object to hold the results.
		// When this flag is not set, the context hands out a pointer to the COM object it stores internally,
		// and the content of that object is replaced every time you call iContext.getResults method
		NewObject = 1u << 8,
	};

	// Combine two sets of result flags: the result has every bit which is set in either argument
	inline eResultFlags operator |( eResultFlags a, eResultFlags b )
	{
		const uint32_t merged = static_cast<uint32_t>( a ) | static_cast<uint32_t>( b );
		return static_cast<eResultFlags>( merged );
	}

	// Test whether the two values have at least one set bit in common.
	// The result is a bool, not a masked eResultFlags value.
	inline bool operator &( eResultFlags a, eResultFlags b )
	{
		const uint32_t common = static_cast<uint32_t>( a ) & static_cast<uint32_t>( b );
		return common != 0;
	}

	// Output value for iContext.detectSpeaker method.
	// The underlying type is uint8_t, and every enumerator has an explicit value.
	enum struct eSpeakerChannel : uint8_t
	{
		Unsure = 0,
		Left = 1,
		Right = 2,
		// 0xFF is the largest value the uint8_t underlying type can hold
		NoStereoData = 0xFF,
	};
}