summaryrefslogtreecommitdiffstats
path: root/WhisperNet/API/Parameters.cs
blob: d2b53f97ea278614c785f36ae5e0cb42823d2827 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// CS1591: Missing XML comment for publicly visible type or member.
// TODO: document the remaining public members, then remove the pragma below.
#pragma warning disable CS1591

namespace Whisper
{
	/// <summary>Token sampling strategies selectable through <see cref="Parameters.strategy" /></summary>
	public enum eSamplingStrategy: int
	{
		/// <summary>Picks the single most probable token</summary>
		Greedy = 0,

		/// <summary>Beam search. TODO: not implemented yet!</summary>
		BeamSearch = 1,
	}

	/// <summary>Bit flags stored in <see cref="Parameters.flags" />; values can be combined with bitwise OR</summary>
	/// <remarks>Use <see cref="Parameters.setFlag" /> to set or clear individual bits.</remarks>
	// NOTE(review): the meaning of each flag is not documented in this file.
	// The names presumably mirror boolean options of the native API - confirm before documenting them.
	[Flags]
	public enum eFullParamsFlags: uint
	{
		/// <summary>No bits set</summary>
		None = 0,

		Translate = 1u << 0,
		NoContext = 1u << 1,
		SingleSegment = 1u << 2,
		PrintSpecial = 1u << 3,
		PrintProgress = 1u << 4,
		PrintRealtime = 1u << 5,
		PrintTimestamps = 1u << 6,

		// Bit 7 ( 0x80 ) is not assigned in this enum

		// Experimental
		TokenTimestamps = 1u << 8,
		SpeedupAudio = 1u << 9,
	}

	/// <summary>Transcribe parameters</summary>
	/// <remarks>
	/// This is a mutable value type with public fields: assigning it to another variable makes an independent copy,
	/// so <see cref="setFlag" /> only affects the instance it is called on.
	/// </remarks>
	// NOTE(review): fields are kept in their original declaration order. If this structure is marshalled
	// to native code the order is significant - confirm before reordering or inserting fields.
	public struct Parameters
	{
		/// <summary>Sampling strategy</summary>
		public eSamplingStrategy strategy;

		/// <summary>Count of CPU worker threads to use</summary>
		/// <remarks>So far, the GPU model only uses CPU threads for MEL spectrograms</remarks>
		public int cpuThreads;

		// NOTE(review): undocumented in the original; presumably an upper bound on the text context size, in tokens - confirm
		public int n_max_text_ctx;
		/// <summary>start offset in ms</summary>
		public int offset_ms;
		/// <summary>audio duration to process in ms</summary>
		public int duration_ms;
		/// <summary>Bitwise combination of <see cref="eFullParamsFlags" /> values; see <see cref="setFlag" /></summary>
		public eFullParamsFlags flags;

		/// <summary>Set or clear the specified flag in the <see cref="flags" /> field of this structure</summary>
		/// <param name="flag">Bit or combination of bits to change; must not be <see cref="eFullParamsFlags.None" /></param>
		/// <param name="set">True to set the bits, false to clear them</param>
		/// <exception cref="ArgumentException">Thrown when <paramref name="flag" /> is <see cref="eFullParamsFlags.None" /></exception>
		public void setFlag( eFullParamsFlags flag, bool set )
		{
			// None carries no bits, so setting or clearing it would silently do nothing; reject it instead
			if( flag == eFullParamsFlags.None )
				throw new ArgumentException( "The flag to set or clear must not be eFullParamsFlags.None", nameof( flag ) );

			if( set )
				flags |= flag;
			else
				flags &= ~flag;
		}

		/// <summary>Language</summary>
		public eLanguage language;

		// [EXPERIMENTAL] token-level timestamps
		/// <summary>timestamp token probability threshold (~0.01)</summary>
		public float thold_pt;
		/// <summary>timestamp token sum probability threshold (~0.01)</summary>
		public float thold_ptsum;
		/// <summary>max segment length in characters</summary>
		public int max_len;
		/// <summary>max tokens per segment (0 = no limit)</summary>
		public int max_tokens;

		// NOTE(review): presumably only consulted when strategy is eSamplingStrategy.Greedy - confirm
		public struct sGreedy
		{
			public int n_past;
		}
		/// <summary>Value of the nested <see cref="sGreedy" /> structure</summary>
		public sGreedy greedy;

		// NOTE(review): presumably only consulted when strategy is eSamplingStrategy.BeamSearch - confirm
		public struct sBeamSearch
		{
			public int n_past;
			public int beam_width;
			public int n_best;
		}
		/// <summary>Value of the nested <see cref="sBeamSearch" /> structure</summary>
		public sBeamSearch beamSearch;

		// [EXPERIMENTAL] speed-up techniques
		/// <summary>overwrite the audio context size (0 = use default)</summary>
		public int audioContextSize;
	}
}