Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 28 additions & 11 deletions bindings/ruby/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,6 @@ end

Some models are prepared up-front:

```ruby
base_en = Whisper::Model.pre_converted_models["base.en"]
whisper = Whisper::Context.new(base_en)
```

The first time you use a model, it is downloaded automatically. After that, the cached file is used. To clear the cache, call `#clear_cache`:

```ruby
Whisper::Model.pre_converted_models["base"].clear_cache
```

You can also use a shorthand for pre-converted models:

```ruby
Expand All @@ -105,6 +94,19 @@ puts Whisper::Model.pre_converted_models.keys
# :
```

You can also retrieve each model:

```ruby
base_en = Whisper::Model.pre_converted_models["base.en"]
whisper = Whisper::Context.new(base_en)
```

The first time you use a model, it is downloaded automatically. After that, the cached file is used. To clear the cache, call `#clear_cache`:

```ruby
Whisper::Model.pre_converted_models["base"].clear_cache
```

You can also use local model files you prepared:

```ruby
Expand Down Expand Up @@ -163,6 +165,16 @@ For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisp
API
---

### Transcription ###

By default, `Whisper::Context#transcribe` runs in a single thread. You can make it run in parallel by passing the `n_processors` option:

```ruby
whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
```

Note that transcription accuracy may occasionally be lower when it runs in parallel.

### Segments ###

Once `Whisper::Context#transcribe` has been called, you can retrieve segments with `#each_segment`:
Expand Down Expand Up @@ -297,6 +309,11 @@ First call of `rake test` builds an extension and downloads a model for testing.

If something seems wrong with the build, running `rake clean` resolves it in some cases.

### Need help ###

* Windows support
* Refinement of C/C++ code, especially memory management

License
-------

Expand Down
12 changes: 5 additions & 7 deletions bindings/ruby/Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,17 +67,15 @@ file LIB_FILE => [SO_FILE, "lib"] do |t|
end
CLEAN.include LIB_FILE

Rake::TestTask.new do |t|
t.test_files = FileList["tests/test_*.rb"]
end
Rake::TestTask.new

TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
chdir "tests/jfk_reader" do
TEST_MEMORY_VIEW = "test/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
file TEST_MEMORY_VIEW => "test/jfk_reader/jfk_reader.c" do |t|
chdir "test/jfk_reader" do
ruby "extconf.rb"
sh "make"
end
end
CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
CLEAN.include "test/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"

task test: [LIB_FILE, TEST_MEMORY_VIEW]
2 changes: 2 additions & 0 deletions bindings/ruby/ext/ruby_whisper.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ ID id_URI;
ID id_pre_converted_models;
ID id_coreml_compiled_models;
ID id_cache;
ID id_n_processors;

static bool is_log_callback_finalized = false;

Expand Down Expand Up @@ -142,6 +143,7 @@ void Init_whisper() {
id_pre_converted_models = rb_intern("pre_converted_models");
id_coreml_compiled_models = rb_intern("coreml_compiled_models");
id_cache = rb_intern("cache");
id_n_processors = rb_intern("n_processors");

mWhisper = rb_define_module("Whisper");
mVAD = rb_define_module_under(mWhisper, "VAD");
Expand Down
5 changes: 5 additions & 0 deletions bindings/ruby/ext/ruby_whisper_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ extern ID id_URI;
extern ID id_pre_converted_models;
extern ID id_coreml_compiled_models;
extern ID id_cache;
extern ID id_n_processors;

extern VALUE cContext;
extern VALUE eError;
Expand All @@ -24,6 +25,8 @@ extern VALUE rb_whisper_model_s_new(VALUE context);
extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);

ID transcribe_option_names[1];

static void
ruby_whisper_free(ruby_whisper *rw)
{
Expand Down Expand Up @@ -633,6 +636,8 @@ init_ruby_whisper_context(VALUE *mWhisper)
{
cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);

transcribe_option_names[0] = id_n_processors;

rb_define_alloc_func(cContext, ruby_whisper_allocate);
rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);

Expand Down
15 changes: 10 additions & 5 deletions bindings/ruby/ext/ruby_whisper_transcribe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ extern const rb_data_type_t ruby_whisper_params_type;

extern ID id_to_s;
extern ID id_call;
extern ID transcribe_option_names[1];

extern void
prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
Expand All @@ -34,9 +35,14 @@ VALUE
ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
ruby_whisper *rw;
ruby_whisper_params *rwp;
VALUE wave_file_path, blk, params;
VALUE wave_file_path, blk, params, kws;
VALUE opts[1];

rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, &params, &kws, &blk);
rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts);

int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);

rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);

Expand Down Expand Up @@ -66,7 +72,7 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {

prepare_transcription(rwp, &self);

if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
fprintf(stderr, "failed to process audio\n");
return self;
}
Expand All @@ -76,9 +82,8 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
const char * text = whisper_full_get_segment_text(rw->context, i);
output = rb_str_concat(output, rb_str_new2(text));
}
VALUE idCall = id_call;
if (blk != Qnil) {
rb_funcall(blk, idCall, 1, output);
rb_funcall(blk, id_call, 1, output);
}
return self;
}
Expand Down
74 changes: 37 additions & 37 deletions bindings/ruby/sig/whisper.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,19 @@ module Whisper
def self.system_info_str: () -> String

class Context
def self.new: (path | ::URI::HTTP) -> instance
def self.new: (String | path | ::URI::HTTP) -> instance

# transcribe a single file
# can emit to a block results
#
# params = Whisper::Params.new
# params.duration = 60_000
# whisper.transcribe "path/to/audio.wav", params do |text|
# puts text
# end
# params = Whisper::Params.new
# params.duration = 60_000
# whisper.transcribe "path/to/audio.wav", params do |text|
# puts text
# end
#
def transcribe: (string, Params) -> self
| (string, Params) { (String) -> void } -> self
def transcribe: (string, Params, ?n_processors: Integer) -> self
| (string, Params, ?n_processors: Integer) { (String) -> void } -> self

def model_n_vocab: () -> Integer
def model_n_audio_ctx: () -> Integer
Expand All @@ -50,16 +50,16 @@ module Whisper

# Yields each Whisper::Segment:
#
# whisper.transcribe("path/to/audio.wav", params)
# whisper.each_segment do |segment|
# puts segment.text
# end
# whisper.transcribe("path/to/audio.wav", params)
# whisper.each_segment do |segment|
# puts segment.text
# end
#
# Returns an Enumerator if no block given:
#
# whisper.transcribe("path/to/audio.wav", params)
# enum = whisper.each_segment
# enum.to_a # => [#<Whisper::Segment>, ...]
# whisper.transcribe("path/to/audio.wav", params)
# enum = whisper.each_segment
# enum.to_a # => [#<Whisper::Segment>, ...]
#
def each_segment: { (Segment) -> void } -> void
| () -> Enumerator[Segment]
Expand All @@ -74,25 +74,25 @@ module Whisper

# Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
#
# full_get_segment_t0(3) # => 1668 (16680 ms)
# full_get_segment_t0(3) # => 1668 (16680 ms)
#
def full_get_segment_t0: (Integer) -> Integer

# End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
#
# full_get_segment_t1(3) # => 1668 (16680 ms)
# full_get_segment_t1(3) # => 1668 (16680 ms)
#
def full_get_segment_t1: (Integer) -> Integer

# Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
#
# full_get_segment_speaker_turn_next(3) # => true
# full_get_segment_speaker_turn_next(3) # => true
#
def full_get_segment_speaker_turn_next: (Integer) -> (true | false)

# Text of a segment indexed by +segment_index+.
#
# full_get_segment_text(3) # => "ask not what your country can do for you, ..."
# full_get_segment_text(3) # => "ask not what your country can do for you, ..."
#
def full_get_segment_text: (Integer) -> String

Expand Down Expand Up @@ -282,9 +282,9 @@ module Whisper

# Sets new segment callback, called for every newly generated text segment.
#
# params.new_segment_callback = ->(context, _, n_new, user_data) {
# # ...
# }
# params.new_segment_callback = ->(context, _, n_new, user_data) {
# # ...
# }
#
def new_segment_callback=: (new_segment_callback) -> new_segment_callback
def new_segment_callback: () -> (new_segment_callback | nil)
Expand All @@ -297,9 +297,9 @@ module Whisper

# Sets progress callback, called on each progress update.
#
# params.progress_callback = ->(context, _, progress, user_data) {
# # ...
# }
# params.progress_callback = ->(context, _, progress, user_data) {
# # ...
# }
#
# +progress+ is an Integer between 0 and 100.
#
Expand Down Expand Up @@ -327,9 +327,9 @@ module Whisper

# Sets abort callback, called to check if the process should be aborted.
#
# params.abort_callback = ->(user_data) {
# # ...
# }
# params.abort_callback = ->(user_data) {
# # ...
# }
#
#
def abort_callback=: (abort_callback) -> abort_callback
Expand Down Expand Up @@ -358,9 +358,9 @@ module Whisper

# Hook called on new segment. Yields each Whisper::Segment.
#
# whisper.on_new_segment do |segment|
# # ...
# end
# whisper.on_new_segment do |segment|
# # ...
# end
#
def on_new_segment: { (Segment) -> void } -> void

Expand All @@ -374,13 +374,13 @@ module Whisper

# Call block to determine whether abort or not. Return +true+ when you want to abort.
#
# params.abort_on do
# if some_condition
# true # abort
# else
# false # continue
# params.abort_on do
# if some_condition
# true # abort
# else
# false # continue
# end
# end
# end
#
def abort_on: { (Object user_data) -> boolish } -> void
end
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,24 @@ def test_whisper
}
end

# Transcribes AUDIO with an explicit `n_processors: 1` and checks the text.
#
# The text yielded to the block is captured and asserted on after the call.
# With the assertion inside the block, the test would pass without checking
# anything whenever `transcribe` returns before invoking the block.
def test_transcribe_non_parallel
  @whisper = Whisper::Context.new("base.en")
  params = Whisper::Params.new

  transcribed = nil
  @whisper.transcribe(AUDIO, params, n_processors: 1) {|text| transcribed = text}

  assert_not_nil transcribed, "transcribe did not yield any text to the block"
  assert_match(/ask not what your country can do for you, ask what you can do for your country/, transcribed)
end

# Transcribes AUDIO with `n_processors: 4` and checks the text.
#
# The pattern accepts either "," or "." between the two clauses and ignores
# case -- presumably because parallel processing can change punctuation and
# capitalization slightly (TODO confirm).
#
# The text yielded to the block is captured and asserted on after the call.
# With the assertion inside the block, the test would pass without checking
# anything whenever `transcribe` returns before invoking the block.
def test_transcribe_n_processors
  @whisper = Whisper::Context.new("base.en")
  params = Whisper::Params.new

  transcribed = nil
  @whisper.transcribe(AUDIO, params, n_processors: 4) {|text| transcribed = text}

  assert_not_nil transcribed, "transcribe did not yield any text to the block"
  assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, transcribed)
end

sub_test_case "After transcription" do
def test_full_n_segments
assert_equal 1, whisper.full_n_segments
Expand Down
4 changes: 2 additions & 2 deletions bindings/ruby/whispercpp.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Gem::Specification.new do |s|
s.name = "whispercpp"
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
s.version = '1.3.3'
s.date = '2025-06-01'
s.date = '2025-06-03'
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
s.email = '[email protected]'
s.extra_rdoc_files = ['LICENSE', 'README.md']
Expand All @@ -21,7 +21,7 @@ Gem::Specification.new do |s|
}

s.summary = %q{Ruby whisper.cpp bindings}
s.test_files = s.files.select {|file| file.start_with? "tests/"}
s.test_files = s.files.select {|file| file.start_with? "test/"}

s.extensions << 'ext/extconf.rb'
s.required_ruby_version = '>= 3.1.0'
Expand Down
Loading