@@ -24,8 +24,19 @@ module Whisper
2424
2525 class Context
2626 def self.new : (path | ::URI::HTTP) -> instance
27+
28+ # Transcribes a single audio file.
29+ # If a block is given, the transcribed text is yielded to it.
30+ #
31+ # params = Whisper::Params.new
32+ # params.duration = 60_000
33+ # whisper.transcribe "path/to/audio.wav", params do |text|
34+ # puts text
35+ # end
36+ #
2737 def transcribe : (string, Params) -> self
2838 | (string, Params) { (String) -> void } -> self
39+
2940 def model_n_vocab : () -> Integer
3041 def model_n_audio_ctx : () -> Integer
3142 def model_n_audio_state : () -> Integer
@@ -34,19 +45,72 @@ module Whisper
3445 def model_n_mels : () -> Integer
3546 def model_ftype : () -> Integer
3647 def model_type : () -> String
48+
49+ # Yields each Whisper::Segment:
50+ #
51+ # whisper.transcribe("path/to/audio.wav", params)
52+ # whisper.each_segment do |segment|
53+ # puts segment.text
54+ # end
55+ #
56+ # Returns an Enumerator if no block given:
57+ #
58+ # whisper.transcribe("path/to/audio.wav", params)
59+ # enum = whisper.each_segment
60+ # enum.to_a # => [#<Whisper::Segment>, ...]
61+ #
3762 def each_segment : { (Segment) -> void } -> void
3863 | () -> Enumerator[Segment]
64+
3965 def model : () -> Model
4066 def full_get_segment : (Integer nth) -> Segment
4167 def full_n_segments : () -> Integer
68+
69+ # Language ID, which can be converted to string by Whisper.lang_str and Whisper.lang_str_full.
70+ #
4271 def full_lang_id : () -> Integer
72+
73+ # Start time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
74+ #
75+ # full_get_segment_t0(3) # => 1668 (16680 ms)
76+ #
4377 def full_get_segment_t0 : (Integer) -> Integer
78+
79+ # End time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
80+ #
81+ # full_get_segment_t1(3) # => 1668 (16680 ms)
82+ #
4483 def full_get_segment_t1 : (Integer) -> Integer
84+
85+ # Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
86+ #
87+ # full_get_segment_speaker_turn_next(3) # => true
88+ #
4589 def full_get_segment_speaker_turn_next : (Integer) -> (true | false)
90+
91+ # Text of a segment indexed by +segment_index+.
92+ #
93+ # full_get_segment_text(3) # => "ask not what your country can do for you, ..."
94+ #
4695 def full_get_segment_text : (Integer) -> String
96+
4797 def full_get_segment_no_speech_prob : (Integer) -> Float
98+
99+ # Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
100+ # Not thread safe for same context
101+ # Uses the specified decoding strategy to obtain the text.
102+ #
103+ # The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
104+ #
48105 def full : (Params, Array[Float] samples, ?Integer n_samples) -> self
49106 | (Params, _Samples, ?Integer n_samples) -> self
107+
108+ # Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
109+ # Result is stored in the default state of the context
110+ # Not thread safe if executed in parallel on the same context.
111+ # It seems this approach can offer some speedup in some cases.
112+ # However, the transcription accuracy can be worse at the beginning and end of each chunk.
113+ #
50114 def full_parallel : (Params, Array[Float], ?Integer n_samples) -> self
51115 | (Params, _Samples, ?Integer n_samples) -> self
52116 | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
@@ -85,68 +149,202 @@ module Whisper
85149 ?abort_callback: abort_callback,
86150 ?abort_callback_user_data: Object
87151 ) -> instance
152+
153+ # params.language = "auto" | "en", etc...
154+ #
88155 def language= : (String) -> String # TODO: Enumerate lang names
156+
89157 def language : () -> String
90158 def translate= : (boolish) -> boolish
91159 def translate : () -> (true | false)
92160 def no_context= : (boolish) -> boolish
161+
162+ # If true, does not use past transcription (if any) as initial prompt for the decoder.
163+ #
93164 def no_context : () -> (true | false)
165+
94166 def single_segment= : (boolish) -> boolish
167+
168+ # If true, forces single segment output (useful for streaming).
169+ #
95170 def single_segment : () -> (true | false)
171+
96172 def print_special= : (boolish) -> boolish
173+
174+ # If true, prints special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.).
175+ #
97176 def print_special : () -> (true | false)
177+
98178 def print_progress= : (boolish) -> boolish
179+
180+ # If true, prints progress information.
181+ #
99182 def print_progress : () -> (true | false)
183+
100184 def print_realtime= : (boolish) -> boolish
185+
186+ # If true, prints results from within whisper.cpp. (avoid it, use callback instead)
187+ #
101188 def print_realtime : () -> (true | false)
189+
190+ # If true, prints timestamps for each text segment when printing realtime.
191+ #
102192 def print_timestamps= : (boolish) -> boolish
193+
103194 def print_timestamps : () -> (true | false)
195+
104196 def suppress_blank= : (boolish) -> boolish
197+
198+ # If true, suppresses blank outputs.
199+ #
105200 def suppress_blank : () -> (true | false)
201+
106202 def suppress_nst= : (boolish) -> boolish
203+
204+ # If true, suppresses non-speech-tokens.
205+ #
107206 def suppress_nst : () -> (true | false)
207+
108208 def token_timestamps= : (boolish) -> boolish
209+
210+ # If true, enables token-level timestamps.
211+ #
109212 def token_timestamps : () -> (true | false)
213+
110214 def split_on_word= : (boolish) -> boolish
215+
216+ # If true, split on word rather than on token (when used with max_len).
217+ #
111218 def split_on_word : () -> (true | false)
219+
112220 def initial_prompt= : (_ToS) -> _ToS
221+
222+ # Tokens to provide to the whisper decoder as initial prompt
223+ # these are prepended to any existing text context from a previous call
224+ # use whisper_tokenize() to convert text to tokens.
225+ # Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224).
226+ #
113227 def initial_prompt : () -> (String | nil )
228+
114229 def diarize= : (boolish) -> boolish
230+
231+ # If true, enables diarization.
232+ #
115233 def diarize : () -> (true | false)
234+
116235 def offset= : (Integer) -> Integer
236+
237+ # Start offset in ms.
238+ #
117239 def offset : () -> Integer
240+
118241 def duration= : (Integer) -> Integer
242+
243+ # Audio duration to process in ms.
244+ #
119245 def duration : () -> Integer
246+
120247 def max_text_tokens= : (Integer) -> Integer
248+
249+ # Max tokens to use from past text as prompt for the decoder.
250+ #
121251 def max_text_tokens : () -> Integer
252+
122253 def temperature= : (Float) -> Float
123254 def temperature : () -> Float
124255 def max_initial_ts= : (Float) -> Float
256+
257+ # See https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
258+ #
125259 def max_initial_ts : () -> Float
260+
126261 def length_penalty= : (Float) -> Float
127262 def length_penalty : () -> Float
128263 def temperature_inc= : (Float) -> Float
129264 def temperature_inc : () -> Float
130265 def entropy_thold= : (Float) -> Float
266+
267+ # Similar to OpenAI's "compression_ratio_threshold"
268+ #
131269 def entropy_thold : () -> Float
270+
132271 def logprob_thold= : (Float) -> Float
133272 def logprob_thold : () -> Float
134273 def no_speech_thold= : (Float) -> Float
135274 def no_speech_thold : () -> Float
275+
276+ # Sets new segment callback, called for every newly generated text segment.
277+ #
278+ # params.new_segment_callback = ->(context, _, n_new, user_data) {
279+ # # ...
280+ # }
281+ #
136282 def new_segment_callback= : (new_segment_callback) -> new_segment_callback
137283 def new_segment_callback : () -> (new_segment_callback | nil )
284+
285+ # Sets user data passed to the last argument of new segment callback.
286+ #
138287 def new_segment_callback_user_data= : (Object) -> Object
288+
139289 def new_segment_callback_user_data : () -> Object
290+
291+ # Sets progress callback, called on each progress update.
292+ #
293+ # params.progress_callback = ->(context, _, progress, user_data) {
294+ # # ...
295+ # }
296+ #
297+ # +progress+ is an Integer between 0 and 100.
298+ #
140299 def progress_callback= : (progress_callback) -> progress_callback
300+
141301 def progress_callback : () -> (progress_callback | nil )
302+
303+ # Sets user data passed to the last argument of progress callback.
304+ #
142305 def progress_callback_user_data= : (Object) -> Object
306+
143307 def progress_callback_user_data : () -> Object
308+
309+ # Sets abort callback, called to check if the process should be aborted.
310+ #
311+ # params.abort_callback = ->(user_data) {
312+ # # ...
313+ # }
314+ #
315+ #
144316 def abort_callback= : (abort_callback) -> abort_callback
317+
145318 def abort_callback : () -> (abort_callback | nil )
319+
320+ # Sets user data passed to the last argument of abort callback.
321+ #
146322 def abort_callback_user_data= : (Object) -> Object
323+
147324 def abort_callback_user_data : () -> Object
325+
326+ # Hook called on new segment. Yields each Whisper::Segment.
327+ #
328+ # params.on_new_segment do |segment|
329+ # # ...
330+ # end
331+ #
148332 def on_new_segment : { (Segment) -> void } -> void
333+
334+ # Hook called on progress update. Yields each progress Integer between 0 and 100.
335+ #
149336 def on_progress : { (Integer progress) -> void } -> void
337+
338+ # Calls the block to determine whether to abort. Return +true+ when you want to abort.
339+ #
340+ # params.abort_on do
341+ # if some_condition
342+ # true # abort
343+ # else
344+ # false # continue
345+ # end
346+ # end
347+ #
150348 def abort_on : { (Object user_data) -> boolish } -> void
151349 end
152350
@@ -174,9 +372,17 @@ module Whisper
174372 end
175373
176374 class Segment
375+ # Start time in milliseconds.
376+ #
177377 def start_time : () -> Integer
378+
379+ # End time in milliseconds.
380+ #
178381 def end_time : () -> Integer
382+
383+ # Whether the next segment is predicted as a speaker turn.
179384 def speaker_next_turn? : () -> (true | false)
385+
180386 def text : () -> String
181387 def no_speech_prob : () -> Float
182388 end
0 commit comments