summaryrefslogtreecommitdiffstats
path: root/Whisper/API/TranscribeStructs.h
blob: ac28357b1f0545708f6b2c01f5f0c759f1627864 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#pragma once
#include <stdint.h>
#include <assert.h>

namespace Whisper
{
	// Identifies a model implementation. Stored as uint32_t; the values start at 1, there is no zero enumerator.
	enum class eModelImplementation : uint32_t
	{
		// NOTE(review): presumably the implementation which runs on the GPU — confirm
		GPU = 1,
		// NOTE(review): presumably a mix of GPU and CPU code — confirm
		Hybrid = 2,
		// NOTE(review): presumably a reference implementation used for comparison — confirm
		Reference = 3,
	};

	// A time span broken down into whole days, hours, minutes and seconds,
	// plus the remaining fraction of a second expressed in ticks.
	// There are 10'000'000 ticks in a second, i.e. one tick is 100 nanoseconds.
	struct sTimeSpanFields
	{
		uint32_t days;
		uint8_t hours, minutes, seconds;
		uint32_t ticks;

		// Split the total count of ticks into the individual fields.
		// hours is in [ 0 .. 23 ], minutes and seconds are in [ 0 .. 59 ], ticks is in [ 0 .. 9'999'999 ]
		sTimeSpanFields( uint64_t tt )
		{
			constexpr uint64_t ticksPerSecond = 10'000'000;

			// Running totals, each one is the previous one converted to the larger unit
			const uint64_t totalSeconds = tt / ticksPerSecond;
			const uint64_t totalMinutes = totalSeconds / 60;
			const uint64_t totalHours = totalMinutes / 60;

			days = static_cast<uint32_t>( totalHours / 24 );
			hours = static_cast<uint8_t>( totalHours % 24 );
			minutes = static_cast<uint8_t>( totalMinutes % 60 );
			seconds = static_cast<uint8_t>( totalSeconds % 60 );
			ticks = static_cast<uint32_t>( tt % ticksPerSecond );
		}
	};

	// A time span stored as a single unsigned 64-bit count of ticks.
	// sTimeSpanFields divides the value by 10'000'000 ticks per second, i.e. one tick is 100 nanoseconds.
	struct sTimeSpan
	{
		uint64_t ticks;

		// Break the tick count down into days, hours, minutes, seconds and remaining ticks
		operator sTimeSpanFields() const
		{
			return sTimeSpanFields{ ticks };
		}

		// Assign an unsigned tick count.
		// Returns a reference to this object, like the built-in assignment does, so the expression can be chained.
		sTimeSpan& operator=( uint64_t tt )
		{
			ticks = tt;
			return *this;
		}

		// Assign a signed tick count, which must not be negative.
		// The check is an assert, it's only active when NDEBUG is not defined;
		// with the check compiled out, a negative value is converted to a large unsigned one.
		// Returns a reference to this object.
		sTimeSpan& operator=( int64_t tt )
		{
			assert( tt >= 0 );
			ticks = (uint64_t)tt;
			return *this;
		}
	};

	// A pair of timestamps of a segment or a token, both expressed in 100-nanosecond ticks
	struct sTimeInterval
	{
		// Start time
		sTimeSpan begin;
		// End time
		sTimeSpan end;
	};

	// Data of a single segment
	struct sSegment
	{
		// Null-terminated text of the segment, probably encoded as UTF-8
		const char* text;
		// When the segment starts and when it ends
		sTimeInterval time;
		// NOTE(review): presumably the index of the first token which belongs to this segment — confirm
		uint32_t firstToken;
		// NOTE(review): presumably the count of tokens which belong to this segment — confirm
		uint32_t countTokens;
	};

	// Bit flags of a token, stored as uint32_t
	enum eTokenFlags : uint32_t
	{
		// No flags are set
		None = 0x0,
		// NOTE(review): presumably marks special tokens as opposed to the text ones — confirm
		Special = 0x1,
	};
	// Test the flags: true when the two arguments have at least one set bit in common.
	// Note the result is a bool, not the intersection of the flags.
	inline bool operator &( eTokenFlags a, eTokenFlags b )
	{
		const uint32_t common = static_cast<uint32_t>( a ) & static_cast<uint32_t>( b );
		return common != 0;
	}

	// Data of a single token: the text, the time interval, a few probabilities, the ID and the flags
	struct sToken
	{
		// Token text, null-terminated, and probably UTF-8 encoded
		const char* text;
		// Start and end times of the token, expressed in 100-nanosecond ticks
		sTimeInterval time;
		// Probability of the token
		float probability;
		// Probability of the timestamp token
		float probabilityTimestamp;
		// Sum of probabilities of all timestamp tokens
		float ptsum;
		// Voice length of the token
		// NOTE(review): the unit is not visible in this header — confirm
		float vlen;
		// Token id
		// NOTE(review): presumably the index of the token in the vocabulary of the model — confirm
		int id;
		// Bit flags of the token, a combination of eTokenFlags values
		eTokenFlags flags;
	};

	// A pair of counters, the count of segments and the count of tokens
	struct sTranscribeLength
	{
		// Count of segments
		uint32_t countSegments;
		// Count of tokens
		uint32_t countTokens;
	};

	// Bit flags which select what to include in the results, and which object receives them
	enum class eResultFlags : uint32_t
	{
		// No flags are set
		None = 0x0,
		// In addition to the segments, also return the individual tokens
		Tokens = 0x1,
		// Return the timestamps
		Timestamps = 0x2,

		// Make a new COM object for the results.
		// When this flag is not set, the context returns a pointer to the COM object stored in the context,
		// and the content of that object is replaced on every call of the iContext.getResults method
		NewObject = 0x100,
	};

	// Combine two sets of flags: the result has every bit which is set in either of the arguments
	inline eResultFlags operator |( eResultFlags a, eResultFlags b )
	{
		const uint32_t merged = static_cast<uint32_t>( a ) | static_cast<uint32_t>( b );
		return static_cast<eResultFlags>( merged );
	}

	// Test the flags: true when the two arguments have at least one set bit in common.
	// Note the result is a bool, not the intersection of the flags.
	inline bool operator &( eResultFlags a, eResultFlags b )
	{
		const uint32_t common = static_cast<uint32_t>( a ) & static_cast<uint32_t>( b );
		return common != 0;
	}
}