Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
ec83e47
Add Whisper::VAD::Context
KitaitiMakoto Nov 10, 2025
3e18da0
Add test for Whisper::VAD::Context
KitaitiMakoto Nov 10, 2025
9a7572d
Add Whisper::VAD::Segment
KitaitiMakoto Nov 11, 2025
775a014
Add Whisper::VAD::Segments
KitaitiMakoto Nov 11, 2025
b9407de
Add Whisper::VAD::Context#detect
KitaitiMakoto Nov 11, 2025
b938ea6
Define Whisper::VAD::Segments#each
KitaitiMakoto Nov 11, 2025
494104a
Define Whisper::VAD::Segment#start_time and #end_time
KitaitiMakoto Nov 11, 2025
006bc60
Define Whisper::VAD::Segment#deconstruct_keys
KitaitiMakoto Nov 11, 2025
de57dd9
Add tests for Whisper::VAD family
KitaitiMakoto Nov 11, 2025
09eba1c
Add signatures for VAD family
KitaitiMakoto Nov 11, 2025
a581048
Add document on VAD in README
KitaitiMakoto Nov 11, 2025
ae40ff4
Define Whisper::VAD::Segments#length
KitaitiMakoto Nov 11, 2025
1330b52
Add test for Whisper::VAD::Segments#length
KitaitiMakoto Nov 11, 2025
d7dffb7
Add signature of Segments#length
KitaitiMakoto Nov 11, 2025
c0c8f0b
Make vad_segments responsible to initialize VAD::Segments
KitaitiMakoto Nov 12, 2025
a8ce4ee
Remove meaningless argument check
KitaitiMakoto Nov 12, 2025
e0d1b0b
Check NULL of segments member
KitaitiMakoto Nov 12, 2025
42441d7
Add tests for Whisper::VAD::Segments
KitaitiMakoto Nov 12, 2025
a625718
Initialize Whisper::VAD::Segment on .allocate
KitaitiMakoto Nov 12, 2025
f69336b
Add tests for Whisper::VAD::Segment
KitaitiMakoto Nov 12, 2025
b89944e
Check NULL of context member
KitaitiMakoto Nov 12, 2025
781a3ac
Add test for Whisper::VAD::Context.allocate
KitaitiMakoto Nov 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions bindings/ruby/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,22 @@ whisper

The second argument `samples` may be an array, an object with `length` and `each` methods, or a MemoryView. If you can prepare the audio data as a C array and export it as a MemoryView, whispercpp accepts it and works with it without copying.

Using VAD separately from ASR
-----------------------------

The VAD feature is useful on its own, so you can use it separately from ASR:

```ruby
vad = Whisper::VAD::Context.new("silero-v5.1.2")
vad
.detect("path/to/audio.wav", Whisper::VAD::Params.new)
.each_with_index do |segment, index|
segment => {start_time: st, end_time: ed} # `Segment` responds to `#deconstruct_keys`

puts "[%{nth}: %{st} --> %{ed}]" % {nth: index + 1, st:, ed:}
end
```

Development
-----------

Expand Down
9 changes: 9 additions & 0 deletions bindings/ruby/ext/ruby_whisper.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ VALUE mWhisper;
/* Module under which the VAD classes are defined; passed to the init_ruby_whisper_vad_* functions. */
VALUE mVAD;
VALUE cContext;
VALUE cParams;
/* Class object for "Context" under mVAD; assigned in init_ruby_whisper_vad_context(). */
VALUE cVADContext;
VALUE cVADParams;
/* Presumably the "Segments" class under mVAD; its assignment is not visible here -- confirm. */
VALUE cVADSegments;
/* Class object for "Segment" under mVAD; assigned in init_ruby_whisper_vad_segment(). */
VALUE cVADSegment;
VALUE eError;

VALUE cSegment;
Expand Down Expand Up @@ -37,6 +40,9 @@ extern void init_ruby_whisper_error(VALUE *mWhisper);
extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
extern void init_ruby_whisper_model(VALUE *mWhisper);
extern void init_ruby_whisper_vad_params(VALUE *mVAD);
extern void init_ruby_whisper_vad_context(VALUE *mVAD);
extern void init_ruby_whisper_vad_segment(VALUE *mVAD);
extern void init_ruby_whisper_vad_segments(VALUE *mVAD);
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);

/*
Expand Down Expand Up @@ -170,6 +176,9 @@ void Init_whisper() {
init_ruby_whisper_segment(&mWhisper, &cContext);
init_ruby_whisper_model(&mWhisper);
init_ruby_whisper_vad_params(&mVAD);
init_ruby_whisper_vad_segment(&mVAD);
init_ruby_whisper_vad_segments(&mVAD);
init_ruby_whisper_vad_context(&mVAD);

rb_require("whisper/context");
rb_require("whisper/segment");
Expand Down
13 changes: 13 additions & 0 deletions bindings/ruby/ext/ruby_whisper.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,17 @@ typedef struct {
VALUE context;
} ruby_whisper_model;

/*
 * Data wrapped by a VAD segments object.
 * `segments` is the native segment list handed to the
 * whisper_vad_segments_get_segment_t0()/t1() accessors.
 */
typedef struct {
struct whisper_vad_segments *segments;
} ruby_whisper_vad_segments;

/*
 * Data wrapped by a single VAD segment object.
 * The segment does not copy any timing data; it refers back to the owning
 * segments object and looks its values up by position.
 */
typedef struct {
/* Ruby object wrapping ruby_whisper_vad_segments; Qnil right after allocation. */
VALUE segments;
/* Position inside `segments`; -1 right after allocation. */
int index;
} ruby_whisper_vad_segment;

/*
 * Data wrapped by a VAD context object.
 * `context` is NULL after allocation, set by #initialize, and released
 * with whisper_vad_free() when the Ruby object is freed.
 */
typedef struct {
struct whisper_vad_context *context;
} ruby_whisper_vad_context;

#endif
3 changes: 3 additions & 0 deletions bindings/ruby/ext/ruby_whisper_segment.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ ruby_whisper_segment_memsize(const void *p)
if (!rws) {
return 0;
}
if (rws->index) {
size += sizeof(rws->index);
}
return size;
}

Expand Down
75 changes: 75 additions & 0 deletions bindings/ruby/ext/ruby_whisper_vad_context.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#include <ruby.h>
#include "ruby_whisper.h"

extern ID id_to_s;

extern VALUE cVADContext;

extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params);
extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);

/*
 * dsize callback of ruby_whisper_vad_context_type.
 *
 * Reports the number of bytes occupied by the wrapper struct. The native
 * whisper_vad_context is an opaque type here, so its own footprint cannot be
 * measured and is not included.
 *
 * p: pointer to the wrapped ruby_whisper_vad_context (may be NULL).
 * Returns 0 for NULL, otherwise the size of the wrapper struct.
 */
static size_t
ruby_whisper_vad_context_memsize(const void *p)
{
  const ruby_whisper_vad_context *rwvc = p;

  if (!rwvc) {
    return 0;
  }
  /* sizeof(*rwvc), not sizeof(rwvc): measure the struct, not the pointer to it.
   * The `context` pointer member is already part of the struct size. */
  return sizeof(*rwvc);
}

/*
 * dfree callback of ruby_whisper_vad_context_type.
 * Releases the native VAD context (when one was loaded) and then the
 * wrapper struct itself.
 */
static void
ruby_whisper_vad_context_free(void *p)
{
  ruby_whisper_vad_context *wrapper = (ruby_whisper_vad_context *)p;
  struct whisper_vad_context *native = wrapper->context;

  if (native != NULL) {
    whisper_vad_free(native);
    wrapper->context = NULL;
  }
  xfree(wrapper);
}

/*
 * TypedData descriptor for objects wrapping ruby_whisper_vad_context.
 * It has external linkage because ruby_whisper_vad_context_detect.cpp
 * declares it `extern` to unwrap the receiver.
 */
const rb_data_type_t ruby_whisper_vad_context_type = {
"ruby_whisper_vad_context",
/* No mark callback (0): the struct holds no Ruby VALUEs. Free and memsize callbacks follow. */
{0, ruby_whisper_vad_context_free, ruby_whisper_vad_context_memsize,},
0, 0,
0
};

/*
 * Allocator for the VAD Context class.
 * Creates the wrapper object with no native context attached yet;
 * #initialize fills it in.
 */
static VALUE
ruby_whisper_vad_context_s_allocate(VALUE klass)
{
  ruby_whisper_vad_context *wrapper;
  const VALUE instance = TypedData_Make_Struct(klass, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, wrapper);

  wrapper->context = NULL;
  return instance;
}

/*
 * Context#initialize(model_path)
 *
 * Normalizes model_path with ruby_whisper_normalize_model_path(), loads the
 * VAD model with the default context parameters and stores the native
 * context in the receiver.
 *
 * Raises RuntimeError when the native context cannot be created. In that
 * case a context loaded by an earlier #initialize call is left untouched.
 */
static VALUE
ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path)
{
  ruby_whisper_vad_context *rwvc;
  struct whisper_vad_context *context;

  /* Unwrap first so nothing native is allocated if self is not the expected type. */
  TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc);

  model_path = ruby_whisper_normalize_model_path(model_path);
  context = whisper_vad_init_from_file_with_params(StringValueCStr(model_path), whisper_vad_default_context_params());
  if (context == NULL) {
    rb_raise(rb_eRuntimeError, "Failed to initialize whisper VAD context");
  }

  /* #initialize can be invoked again on a live object; release the previous
   * native context instead of leaking it. */
  if (rwvc->context != NULL) {
    whisper_vad_free(rwvc->context);
  }
  rwvc->context = context;

  return Qnil;
}

/*
 * Defines the "Context" class under the given VAD module and registers its
 * allocator plus the #initialize and #detect methods.
 */
void
init_ruby_whisper_vad_context(VALUE *mVAD)
{
  const VALUE vad_module = *mVAD;

  cVADContext = rb_define_class_under(vad_module, "Context", rb_cObject);

  rb_define_alloc_func(cVADContext, ruby_whisper_vad_context_s_allocate);

  rb_define_method(cVADContext, "initialize", ruby_whisper_vad_context_initialize, 1);
  rb_define_method(cVADContext, "detect", ruby_whisper_vad_detect, 2);
}
50 changes: 50 additions & 0 deletions bindings/ruby/ext/ruby_whisper_vad_context_detect.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#include <ruby.h>
#include "ruby_whisper.h"
#include "common-whisper.h"
#include <string>
#include <vector>

#ifdef __cplusplus
extern "C" {
#endif

extern VALUE cVADSegments;

extern const rb_data_type_t ruby_whisper_vad_context_type;
extern const rb_data_type_t ruby_whisper_vad_params_type;
extern const rb_data_type_t ruby_whisper_vad_segments_type;

extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments);

/*
 * Context#detect(file_path, params)
 *
 * Reads the audio file at file_path, runs voice activity detection on its
 * samples with the receiver's VAD context and the given VAD params, and
 * returns the object built by ruby_whisper_vad_segments_s_init() from the
 * detected segments.
 *
 * Raises RuntimeError when the receiver has no native context, when the
 * file cannot be read as audio, or when detection fails.
 */
VALUE
ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params) {
  ruby_whisper_vad_context *rwvc;
  ruby_whisper_vad_params *rwvp;
  const char *c_file_path;
  whisper_vad_segments *segments = nullptr;
  bool audio_loaded = false;

  TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc);
  if (rwvc->context == NULL) {
    rb_raise(rb_eRuntimeError, "Doesn't have reference to context internally");
  }
  TypedData_Get_Struct(params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);

  /* May raise; done before any C++ object with a destructor exists. */
  c_file_path = StringValueCStr(file_path);

  /* rb_raise() leaves the function via longjmp, which skips C++ destructors.
   * Keep every C++ object inside this scope and raise only after it has
   * closed, so the sample buffers are always released. */
  {
    std::string cpp_file_path(c_file_path);
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;

    audio_loaded = read_audio_data(cpp_file_path, pcmf32, pcmf32s, false);
    if (audio_loaded) {
      segments = whisper_vad_segments_from_samples(rwvc->context, rwvp->params, pcmf32.data(), pcmf32.size());
    }
  }

  if (!audio_loaded) {
    /* file_path is a String at this point (StringValueCStr succeeded); format
     * the VALUE itself because the std::string copy is already destroyed. */
    rb_raise(rb_eRuntimeError, "Failed to open '%" PRIsVALUE "' as WAV file\n", file_path);
  }
  if (segments == nullptr) {
    rb_raise(rb_eRuntimeError, "Failed to process audio\n");
  }

  return ruby_whisper_vad_segments_s_init(segments);
}

#ifdef __cplusplus
}
#endif
141 changes: 141 additions & 0 deletions bindings/ruby/ext/ruby_whisper_vad_segment.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#include <ruby.h>
#include "ruby_whisper.h"

#define N_KEY_NAMES 2

extern VALUE cVADSegment;

extern const rb_data_type_t ruby_whisper_vad_segments_type;

/* Symbols :start_time and :end_time, created once in init_ruby_whisper_vad_segment(). */
static VALUE sym_start_time;
static VALUE sym_end_time;
/* Array of the N_KEY_NAMES symbols above; used by #deconstruct_keys when it is called with nil. */
static VALUE key_names;

/*
 * dmark callback of ruby_whisper_vad_segment_type.
 * Marks the owning segments object so it stays alive as long as this
 * segment refers to it.
 */
static void
rb_whisper_vad_segment_mark(void *p)
{
  const ruby_whisper_vad_segment *segment_data = (const ruby_whisper_vad_segment *)p;

  rb_gc_mark(segment_data->segments);
}

/*
 * dsize callback of ruby_whisper_vad_segment_type.
 *
 * p: pointer to the wrapped ruby_whisper_vad_segment (may be NULL).
 * Returns 0 for NULL, otherwise the size of the wrapper struct. The struct
 * owns no further allocations: `segments` is a reference to another Ruby
 * object and `index` is stored inline.
 */
static size_t
ruby_whisper_vad_segment_memsize(const void *p)
{
  const ruby_whisper_vad_segment *rwvs = p;

  if (!rwvs) {
    return 0;
  }
  /* sizeof(*rwvs), not sizeof(rwvs): measure the struct, not the pointer.
   * `index` is a member of the struct, so it needs no separate accounting. */
  return sizeof(*rwvs);
}

/*
 * TypedData descriptor for objects wrapping ruby_whisper_vad_segment.
 * File-local: segments are created and unwrapped only in this file.
 */
static const rb_data_type_t ruby_whisper_vad_segment_type = {
"ruby_whisper_vad_segment",
/* The mark callback keeps the referenced segments object alive; RUBY_DEFAULT_FREE is used as the free callback. */
{rb_whisper_vad_segment_mark, RUBY_DEFAULT_FREE, ruby_whisper_vad_segment_memsize,},
0, 0,
0
};

/*
 * Allocator for the VAD Segment class.
 * The new object refers to no segments object (Qnil) and has index -1
 * until rb_whisper_vad_segment_s_new() fills those fields in.
 */
static VALUE
ruby_whisper_vad_segment_s_allocate(VALUE klass)
{
  ruby_whisper_vad_segment *segment_data;
  const VALUE instance = TypedData_Make_Struct(klass, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, segment_data);

  segment_data->index = -1;
  segment_data->segments = Qnil;
  return instance;
}

/*
 * Builds a Segment object that refers to entry `index` of `segments`.
 *
 * segments: Ruby object wrapping ruby_whisper_vad_segments.
 * index:    position of the entry inside that object.
 * Returns the new Segment.
 */
VALUE
rb_whisper_vad_segment_s_new(VALUE segments, int index)
{
  ruby_whisper_vad_segment *segment_data;
  const VALUE instance = ruby_whisper_vad_segment_s_allocate(cVADSegment);

  TypedData_Get_Struct(instance, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, segment_data);
  segment_data->index = index;
  segment_data->segments = segments;

  return instance;
}

/*
 * Segment#start_time
 *
 * Looks up the t0 value of this segment in the owning segments object and
 * returns it multiplied by 10 as a Float.
 * NOTE(review): the factor 10 presumably converts the native unit to
 * milliseconds -- confirm against the whisper.h documentation.
 */
static VALUE
ruby_whisper_vad_segment_get_start_time(VALUE self)
{
  ruby_whisper_vad_segment *segment_data;
  ruby_whisper_vad_segments *owner;
  float start;

  TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, segment_data);
  /* Raises when the segment has no valid segments object attached. */
  TypedData_Get_Struct(segment_data->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, owner);

  start = whisper_vad_segments_get_segment_t0(owner->segments, segment_data->index);

  return DBL2NUM(start * 10);
}

/*
 * Segment#end_time
 *
 * Looks up the t1 value of this segment in the owning segments object and
 * returns it multiplied by 10 as a Float.
 * NOTE(review): the factor 10 presumably converts the native unit to
 * milliseconds -- confirm against the whisper.h documentation.
 */
static VALUE
ruby_whisper_vad_segment_get_end_time(VALUE self)
{
  ruby_whisper_vad_segment *segment_data;
  ruby_whisper_vad_segments *owner;
  float finish;

  TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, segment_data);
  /* Raises when the segment has no valid segments object attached. */
  TypedData_Get_Struct(segment_data->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, owner);

  finish = whisper_vad_segments_get_segment_t1(owner->segments, segment_data->index);

  return DBL2NUM(finish * 10);
}

/*
 * Segment#deconstruct_keys(keys)
 *
 * Pattern-matching support. Returns a Hash holding the requested entries
 * among :start_time and :end_time.
 *
 * keys: nil to request every supported key, or an Array of keys. Unknown
 *       keys are ignored. When the Array has more than N_KEY_NAMES
 *       elements an empty Hash is returned.
 *
 * Raises TypeError when keys is neither nil nor an Array, or when the
 * segment has no valid segments object attached.
 */
static VALUE
ruby_whisper_vad_segment_deconstruct_keys(VALUE self, VALUE keys)
{
  ruby_whisper_vad_segment *rwvs;
  ruby_whisper_vad_segments *rwvss;
  VALUE hash, key;
  long n_keys;
  long i;

  TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs);
  /* Only validates that a segments object is attached; the getters below unwrap it again. */
  TypedData_Get_Struct(rwvs->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);

  hash = rb_hash_new();
  if (NIL_P(keys)) {
    keys = key_names;
    n_keys = N_KEY_NAMES;
  } else {
    /* RARRAY_LEN() on anything but an Array reads invalid memory, so reject
     * other argument types with a TypeError first. */
    Check_Type(keys, T_ARRAY);
    n_keys = RARRAY_LEN(keys);
    if (n_keys > N_KEY_NAMES) {
      return hash;
    }
  }
  for (i = 0; i < n_keys; i++) {
    key = rb_ary_entry(keys, i);
    if (key == sym_start_time) {
      rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_start_time(self));
    } else if (key == sym_end_time) {
      rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_end_time(self));
    }
  }

  return hash;
}

/*
 * Defines the "Segment" class under the given VAD module, creates the
 * symbols and key list used by #deconstruct_keys, and registers the
 * allocator and the instance methods.
 */
void
init_ruby_whisper_vad_segment(VALUE *mVAD)
{
  cVADSegment = rb_define_class_under(*mVAD, "Segment", rb_cObject);

  sym_start_time = ID2SYM(rb_intern("start_time"));
  sym_end_time = ID2SYM(rb_intern("end_time"));

  /* key_names lives in a C static that the GC does not scan. Register its
   * address so the Array is marked and never collected while the extension
   * is loaded. */
  rb_global_variable(&key_names);
  key_names = rb_ary_new3(
    N_KEY_NAMES,
    sym_start_time,
    sym_end_time
  );

  rb_define_alloc_func(cVADSegment, ruby_whisper_vad_segment_s_allocate);
  rb_define_method(cVADSegment, "start_time", ruby_whisper_vad_segment_get_start_time, 0);
  rb_define_method(cVADSegment, "end_time", ruby_whisper_vad_segment_get_end_time, 0);
  rb_define_method(cVADSegment, "deconstruct_keys", ruby_whisper_vad_segment_deconstruct_keys, 1);
}
Loading
Loading