Skip to content

Commit 4314bec

Browse files
committed
Add document comment to RBS
1 parent 97f9975 commit 4314bec

File tree

1 file changed

+206
-0
lines changed

1 file changed

+206
-0
lines changed

bindings/ruby/sig/whisper.rbs

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,19 @@ module Whisper
2424

2525
class Context
2626
def self.new: (path | ::URI::HTTP) -> instance
27+
28+
# transcribe a single file
29+
# can emit results to a block
30+
#
31+
# params = Whisper::Params.new
32+
# params.duration = 60_000
33+
# whisper.transcribe "path/to/audio.wav", params do |text|
34+
# puts text
35+
# end
36+
#
2737
def transcribe: (string, Params) -> self
2838
| (string, Params) { (String) -> void } -> self
39+
2940
def model_n_vocab: () -> Integer
3041
def model_n_audio_ctx: () -> Integer
3142
def model_n_audio_state: () -> Integer
@@ -34,19 +45,72 @@ module Whisper
3445
def model_n_mels: () -> Integer
3546
def model_ftype: () -> Integer
3647
def model_type: () -> String
48+
49+
# Yields each Whisper::Segment:
50+
#
51+
# whisper.transcribe("path/to/audio.wav", params)
52+
# whisper.each_segment do |segment|
53+
# puts segment.text
54+
# end
55+
#
56+
# Returns an Enumerator if no block given:
57+
#
58+
# whisper.transcribe("path/to/audio.wav", params)
59+
# enum = whisper.each_segment
60+
# enum.to_a # => [#<Whisper::Segment>, ...]
61+
#
3762
def each_segment: { (Segment) -> void } -> void
3863
| () -> Enumerator[Segment]
64+
3965
def model: () -> Model
4066
def full_get_segment: (Integer nth) -> Segment
4167
def full_n_segments: () -> Integer
68+
69+
# Language ID, which can be converted to string by Whisper.lang_str and Whisper.lang_str_full.
70+
#
4271
def full_lang_id: () -> Integer
72+
73+
# Start time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
74+
#
75+
# full_get_segment_t0(3) # => 1668 (16680 ms)
76+
#
4377
def full_get_segment_t0: (Integer) -> Integer
78+
79+
# End time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
80+
#
81+
# full_get_segment_t1(3) # => 1668 (16680 ms)
82+
#
4483
def full_get_segment_t1: (Integer) -> Integer
84+
85+
# Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
86+
#
87+
# full_get_segment_speaker_turn_next(3) # => true
88+
#
4589
def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
90+
91+
# Text of a segment indexed by +segment_index+.
92+
#
93+
# full_get_segment_text(3) # => "ask not what your country can do for you, ..."
94+
#
4695
def full_get_segment_text: (Integer) -> String
96+
4797
def full_get_segment_no_speech_prob: (Integer) -> Float
98+
99+
# Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
100+
# Not thread safe for same context
101+
# Uses the specified decoding strategy to obtain the text.
102+
#
103+
# The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
104+
#
48105
def full: (Params, Array[Float] samples, ?Integer n_samples) -> self
49106
| (Params, _Samples, ?Integer n_samples) -> self
107+
108+
# Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
109+
# Result is stored in the default state of the context
110+
# Not thread safe if executed in parallel on the same context.
111+
# It seems this approach can offer some speedup in some cases.
112+
# However, the transcription accuracy can be worse at the beginning and end of each chunk.
113+
#
50114
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
51115
| (Params, _Samples, ?Integer n_samples) -> self
52116
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
@@ -85,68 +149,202 @@ module Whisper
85149
?abort_callback: abort_callback,
86150
?abort_callback_user_data: Object
87151
) -> instance
152+
153+
# params.language = "auto" | "en", etc...
154+
#
88155
def language=: (String) -> String # TODO: Enumerate lang names
156+
89157
def language: () -> String
90158
def translate=: (boolish) -> boolish
91159
def translate: () -> (true | false)
92160
def no_context=: (boolish) -> boolish
161+
162+
# If true, does not use past transcription (if any) as initial prompt for the decoder.
163+
#
93164
def no_context: () -> (true | false)
165+
94166
def single_segment=: (boolish) -> boolish
167+
168+
# If true, forces single segment output (useful for streaming).
169+
#
95170
def single_segment: () -> (true | false)
171+
96172
def print_special=: (boolish) -> boolish
173+
174+
# If true, prints special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.).
175+
#
97176
def print_special: () -> (true | false)
177+
98178
def print_progress=: (boolish) -> boolish
179+
180+
# If true, prints progress information.
181+
#
99182
def print_progress: () -> (true | false)
183+
100184
def print_realtime=: (boolish) -> boolish
185+
186+
# If true, prints results from within whisper.cpp. (avoid it, use callback instead)
187+
#
101188
def print_realtime: () -> (true | false)
189+
190+
# If true, prints timestamps for each text segment when printing realtime.
191+
#
102192
def print_timestamps=: (boolish) -> boolish
193+
103194
def print_timestamps: () -> (true | false)
195+
104196
def suppress_blank=: (boolish) -> boolish
197+
198+
# If true, suppresses blank outputs.
199+
#
105200
def suppress_blank: () -> (true | false)
201+
106202
def suppress_nst=: (boolish) -> boolish
203+
204+
# If true, suppresses non-speech-tokens.
205+
#
107206
def suppress_nst: () -> (true | false)
207+
108208
def token_timestamps=: (boolish) -> boolish
209+
210+
# If true, enables token-level timestamps.
211+
#
109212
def token_timestamps: () -> (true | false)
213+
110214
def split_on_word=: (boolish) -> boolish
215+
216+
# If true, split on word rather than on token (when used with max_len).
217+
#
111218
def split_on_word: () -> (true | false)
219+
112220
def initial_prompt=: (_ToS) -> _ToS
221+
222+
# Tokens to provide to the whisper decoder as initial prompt
223+
# these are prepended to any existing text context from a previous call
224+
# use whisper_tokenize() to convert text to tokens.
225+
# Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224).
226+
#
113227
def initial_prompt: () -> (String | nil)
228+
114229
def diarize=: (boolish) -> boolish
230+
231+
# If true, enables diarization.
232+
#
115233
def diarize: () -> (true | false)
234+
116235
def offset=: (Integer) -> Integer
236+
237+
# Start offset in ms.
238+
#
117239
def offset: () -> Integer
240+
118241
def duration=: (Integer) -> Integer
242+
243+
# Audio duration to process in ms.
244+
#
119245
def duration: () -> Integer
246+
120247
def max_text_tokens=: (Integer) -> Integer
248+
249+
# Max tokens to use from past text as prompt for the decoder.
250+
#
121251
def max_text_tokens: () -> Integer
252+
122253
def temperature=: (Float) -> Float
123254
def temperature: () -> Float
124255
def max_initial_ts=: (Float) -> Float
256+
257+
# See https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
258+
#
125259
def max_initial_ts: () -> Float
260+
126261
def length_penalty=: (Float) -> Float
127262
def length_penalty: () -> Float
128263
def temperature_inc=: (Float) -> Float
129264
def temperature_inc: () -> Float
130265
def entropy_thold=: (Float) -> Float
266+
267+
# Similar to OpenAI's "compression_ratio_threshold"
268+
#
131269
def entropy_thold: () -> Float
270+
132271
def logprob_thold=: (Float) -> Float
133272
def logprob_thold: () -> Float
134273
def no_speech_thold=: (Float) -> Float
135274
def no_speech_thold: () -> Float
275+
276+
# Sets new segment callback, called for every newly generated text segment.
277+
#
278+
# params.new_segment_callback = ->(context, _, n_new, user_data) {
279+
# # ...
280+
# }
281+
#
136282
def new_segment_callback=: (new_segment_callback) -> new_segment_callback
137283
def new_segment_callback: () -> (new_segment_callback | nil)
284+
285+
# Sets user data passed to the last argument of new segment callback.
286+
#
138287
def new_segment_callback_user_data=: (Object) -> Object
288+
139289
def new_segment_callback_user_data: () -> Object
290+
291+
# Sets progress callback, called on each progress update.
292+
#
293+
# params.progress_callback = ->(context, _, progress, user_data) {
294+
# # ...
295+
# }
296+
#
297+
# +progress+ is an Integer between 0 and 100.
298+
#
140299
def progress_callback=: (progress_callback) -> progress_callback
300+
141301
def progress_callback: () -> (progress_callback | nil)
302+
303+
# Sets user data passed to the last argument of progress callback.
304+
#
142305
def progress_callback_user_data=: (Object) -> Object
306+
143307
def progress_callback_user_data: () -> Object
308+
309+
# Sets abort callback, called to check if the process should be aborted.
310+
#
311+
# params.abort_callback = ->(user_data) {
312+
# # ...
313+
# }
314+
#
315+
#
144316
def abort_callback=: (abort_callback) -> abort_callback
317+
145318
def abort_callback: () -> (abort_callback | nil)
319+
320+
# Sets user data passed to the last argument of abort callback.
321+
#
146322
def abort_callback_user_data=: (Object) -> Object
323+
147324
def abort_callback_user_data: () -> Object
325+
326+
# Hook called on new segment. Yields each Whisper::Segment.
327+
#
328+
# params.on_new_segment do |segment|
329+
# # ...
330+
# end
331+
#
148332
def on_new_segment: { (Segment) -> void } -> void
333+
334+
# Hook called on progress update. Yields each progress Integer between 0 and 100.
335+
#
149336
def on_progress: { (Integer progress) -> void } -> void
337+
338+
# Calls the block to determine whether to abort. Return +true+ from the block when you want to abort.
339+
#
340+
# params.abort_on do
341+
# if some_condition
342+
# true # abort
343+
# else
344+
# false # continue
345+
# end
346+
# end
347+
#
150348
def abort_on: { (Object user_data) -> boolish } -> void
151349
end
152350

@@ -174,9 +372,17 @@ module Whisper
174372
end
175373

176374
class Segment
375+
# Start time in milliseconds.
376+
#
177377
def start_time: () -> Integer
378+
379+
# End time in milliseconds.
380+
#
178381
def end_time: () -> Integer
382+
383+
# Whether the next segment is predicted as a speaker turn.
179384
def speaker_next_turn?: () -> (true | false)
385+
180386
def text: () -> String
181387
def no_speech_prob: () -> Float
182388
end

0 commit comments

Comments
 (0)