Add document comment to RBS

KitaitiMakoto · KitaitiMakoto · commit 4314becc8531 · 2025-04-16T22:04:06.000+09:00
diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs
@@ -24,8 +24,19 @@ module Whisper
 
   class Context
     def self.new: (path | ::URI::HTTP) -> instance
+
+    # Transcribes a single audio file.
+    # Results can also be yielded to a block:
+    #
+    #   params = Whisper::Params.new
+    #   params.duration = 60_000
+    #   whisper.transcribe "path/to/audio.wav", params do |text|
+    #     puts text
+    #   end
+    #
     def transcribe: (string, Params) -> self
                   | (string, Params) { (String) -> void } -> self
+
     def model_n_vocab: () -> Integer
     def model_n_audio_ctx: () -> Integer
     def model_n_audio_state: () -> Integer
@@ -34,19 +45,72 @@ module Whisper
     def model_n_mels: () -> Integer
     def model_ftype: () -> Integer
     def model_type: () -> String
+
+    # Yields each Whisper::Segment:
+    #
+    #   whisper.transcribe("path/to/audio.wav", params)
+    #   whisper.each_segment do |segment|
+    #     puts segment.text
+    #   end
+    #
+    # Returns an Enumerator if no block given:
+    #
+    #   whisper.transcribe("path/to/audio.wav", params)
+    #   enum = whisper.each_segment
+    #   enum.to_a # => [#<Whisper::Segment>, ...]
+    #
     def each_segment: { (Segment) -> void } -> void
                     | () -> Enumerator[Segment]
+
     def model: () -> Model
     def full_get_segment: (Integer nth) -> Segment
     def full_n_segments: () -> Integer
+
+    # Language ID, which can be converted to string by Whisper.lang_str and Whisper.lang_str_full.
+    #
     def full_lang_id: () -> Integer
+
+    # Start time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
+    #
+    #   full_get_segment_t0(3) # => 1668 (16680 ms)
+    #
     def full_get_segment_t0: (Integer) -> Integer
+
+    # End time of a segment indexed by +segment_index+ in centiseconds (units of 10 milliseconds).
+    #
+    #   full_get_segment_t1(3) # => 1668 (16680 ms)
+    #
     def full_get_segment_t1: (Integer) -> Integer
+
+    # Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
+    #
+    #   full_get_segment_speaker_turn_next(3) # => true
+    #
     def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
+
+    # Text of a segment indexed by +segment_index+.
+    #
+    #   full_get_segment_text(3) # => "ask not what your country can do for you, ..."
+    #
     def full_get_segment_text: (Integer) -> String
+
     def full_get_segment_no_speech_prob: (Integer) -> Float
+
+    # Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
+    # Not thread safe for same context
+    # Uses the specified decoding strategy to obtain the text.
+    #
+    # The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
+    #
     def full: (Params, Array[Float] samples, ?Integer n_samples) -> self
             | (Params, _Samples, ?Integer n_samples) -> self
+
+    # Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
+    # Result is stored in the default state of the context
+    # Not thread safe if executed in parallel on the same context.
+    # It seems this approach can offer some speedup in some cases.
+    # However, the transcription accuracy can be worse at the beginning and end of each chunk.
+    #
     def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer n_samples) -> self
                      | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
@@ -85,68 +149,202 @@ module Whisper
       ?abort_callback: abort_callback,
       ?abort_callback_user_data: Object
     ) -> instance
+
+    # params.language = "auto" | "en", etc...
+    #
     def language=: (String) -> String # TODO: Enumerate lang names
+
     def language: () -> String
     def translate=: (boolish) -> boolish
     def translate: () -> (true | false)
     def no_context=: (boolish) -> boolish
+
+    # If true, does not use past transcription (if any) as initial prompt for the decoder.
+    #
     def no_context: () -> (true | false)
+
     def single_segment=: (boolish) -> boolish
+
+    # If true, forces single segment output (useful for streaming).
+    #
     def single_segment: () -> (true | false)
+
     def print_special=: (boolish) -> boolish
+
+    # If true, prints special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.).
+    #
     def print_special: () -> (true | false)
+
     def print_progress=: (boolish) -> boolish
+
+    # If true, prints progress information.
+    #
     def print_progress: () -> (true | false)
+
     def print_realtime=: (boolish) -> boolish
+
+    # If true, prints results from within whisper.cpp. (avoid it, use callback instead)
+    #
     def print_realtime: () -> (true | false)
+
+    # If true, prints timestamps for each text segment when printing realtime.
+    #
     def print_timestamps=: (boolish) -> boolish
+
     def print_timestamps: () -> (true | false)
+
     def suppress_blank=: (boolish) -> boolish
+
+    # If true, suppresses blank outputs.
+    #
     def suppress_blank: () -> (true | false)
+
     def suppress_nst=: (boolish) -> boolish
+
+    # If true, suppresses non-speech-tokens.
+    #
     def suppress_nst: () -> (true | false)
+
     def token_timestamps=: (boolish) -> boolish
+
+    # If true, enables token-level timestamps.
+    #
     def token_timestamps: () -> (true | false)
+
     def split_on_word=: (boolish) -> boolish
+
+    # If true, split on word rather than on token (when used with max_len).
+    #
     def split_on_word: () -> (true | false)
+
     def initial_prompt=: (_ToS) -> _ToS
+
+    # Tokens to provide to the whisper decoder as initial prompt
+    # these are prepended to any existing text context from a previous call
+    # use whisper_tokenize() to convert text to tokens.
+    # Maximum of whisper_n_text_ctx()/2 tokens are used (typically 224).
+    #
     def initial_prompt: () -> (String | nil)
+
     def diarize=: (boolish) -> boolish
+
+    # If true, enables diarization.
+    #
     def diarize: () -> (true | false)
+
     def offset=: (Integer) -> Integer
+
+    # Start offset in ms.
+    #
     def offset: () -> Integer
+
     def duration=: (Integer) -> Integer
+
+    # Audio duration to process in ms.
+    #
     def duration: () -> Integer
+
     def max_text_tokens=: (Integer) -> Integer
+
+    # Max tokens to use from past text as prompt for the decoder.
+    #
     def max_text_tokens: () -> Integer
+
     def temperature=: (Float) -> Float
     def temperature: () -> Float
     def max_initial_ts=: (Float) -> Float
+
+    # See https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
+    #
     def max_initial_ts: () -> Float
+
     def length_penalty=: (Float) -> Float
     def length_penalty: () -> Float
     def temperature_inc=: (Float) -> Float
     def temperature_inc: () -> Float
     def entropy_thold=: (Float) -> Float
+
+    # Similar to OpenAI's "compression_ratio_threshold"
+    #
     def entropy_thold: () -> Float
+
     def logprob_thold=: (Float) -> Float
     def logprob_thold: () -> Float
     def no_speech_thold=: (Float) -> Float
     def no_speech_thold: () -> Float
+
+    # Sets new segment callback, called for every newly generated text segment.
+    #
+    #   params.new_segment_callback = ->(context, _, n_new, user_data) {
+    #     # ...
+    #   }
+    #
     def new_segment_callback=: (new_segment_callback) -> new_segment_callback
     def new_segment_callback: () -> (new_segment_callback | nil)
+
+    # Sets user data passed to the last argument of new segment callback.
+    #
     def new_segment_callback_user_data=: (Object) -> Object
+
     def new_segment_callback_user_data: () -> Object
+
+    # Sets progress callback, called on each progress update.
+    #
+    #   params.progress_callback = ->(context, _, progress, user_data) {
+    #     # ...
+    #   }
+    #
+    # +progress+ is an Integer between 0 and 100.
+    #
     def progress_callback=: (progress_callback) -> progress_callback
+
     def progress_callback: () -> (progress_callback | nil)
+
+    # Sets user data passed to the last argument of progress callback.
+    #
     def progress_callback_user_data=: (Object) -> Object
+
     def progress_callback_user_data: () -> Object
+
+    # Sets abort callback, called to check if the process should be aborted.
+    #
+    #   params.abort_callback = ->(user_data) {
+    #     # ...
+    #   }
+    #
+    #
     def abort_callback=: (abort_callback) -> abort_callback
+
     def abort_callback: () -> (abort_callback | nil)
+
+    # Sets user data passed to the last argument of abort callback.
+    #
     def abort_callback_user_data=: (Object) -> Object
+
     def abort_callback_user_data: () -> Object
+
+    # Hook called on new segment. Yields each Whisper::Segment.
+    #
+    #   params.on_new_segment do |segment|
+    #     # ...
+    #   end
+    #
     def on_new_segment: { (Segment) -> void } -> void
+
+    # Hook called on progress update. Yields each progress Integer between 0 and 100.
+    #
     def on_progress: { (Integer progress) -> void } -> void
+
+    # Calls the block to determine whether to abort or not. Return +true+ from the block when you want to abort.
+    #
+    #   params.abort_on do
+    #     if some_condition
+    #       true # abort
+    #     else
+    #       false # continue
+    #     end
+    #   end
+    #
     def abort_on: { (Object user_data) -> boolish } -> void
   end
 
@@ -174,9 +372,17 @@ module Whisper
   end
 
   class Segment
+    # Start time in milliseconds.
+    #
     def start_time: () -> Integer
+
+    # End time in milliseconds.
+    #
     def end_time: () -> Integer
+
+    # Whether the next segment is predicted as a speaker turn.
     def speaker_next_turn?: () -> (true | false)
+
     def text: () -> String
     def no_speech_prob: () -> Float
   end