Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions bindings/ruby/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,32 @@ Whisper::Params.new(

For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).

### Output ###

whispercpp supports SRT and WebVTT output:

```ruby
puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
# =>
WEBVTT

1
00:00:00.000 --> 00:00:03.860
My thought I have nobody by a beauty and will as you poured.

2
00:00:03.860 --> 00:00:09.840
Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in

3
00:00:09.840 --> 00:00:09.940
a

```

You can also call `#to_srt` to get the same transcription in SRT format.


API
---

Expand Down Expand Up @@ -196,7 +222,7 @@ whisper
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
line << " (speaker turned)" if segment.speaker_turn_next?
puts line
end

Expand All @@ -212,7 +238,7 @@ params.on_new_segment do |segment|
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
line << " (speaker turned)" if segment.speaker_turn_next?
puts line
end

Expand Down
2 changes: 2 additions & 0 deletions bindings/ruby/ext/ruby_whisper.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,5 +170,7 @@ void Init_whisper() {
init_ruby_whisper_model(&mWhisper);
init_ruby_whisper_vad_params(&mVAD);

rb_require("whisper/context");
rb_require("whisper/segment");
rb_require("whisper/model/uri");
}
2 changes: 1 addition & 1 deletion bindings/ruby/ext/ruby_whisper_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
rb_define_method(cContext, "full", ruby_whisper_full, -1);
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);

// High leve
// High level
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);

Expand Down
79 changes: 78 additions & 1 deletion bindings/ruby/ext/ruby_whisper_segment.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
#include <ruby.h>
#include "ruby_whisper.h"

#define N_KEY_NAMES 5

static VALUE sym_start_time;
static VALUE sym_end_time;
static VALUE sym_text;
static VALUE sym_no_speech_prob;
static VALUE sym_speaker_turn_next;
static VALUE key_names;

extern const rb_data_type_t ruby_whisper_type;

extern VALUE cSegment;
Expand Down Expand Up @@ -129,15 +138,83 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self)
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
}

/*
 * call-seq:
 *   deconstruct_keys(keys) -> hash
 *
 * Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
 *
 *   whisper.each_segment do |segment|
 *     segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
 *
 *     puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
 *   end
 *
 * When +keys+ is +nil+, every supported key is returned. When +keys+ holds
 * more than N_KEY_NAMES entries an empty hash is returned. Entries that are
 * not one of the supported symbols are ignored. Raises TypeError when +keys+
 * is neither +nil+ nor an Array.
 */
static VALUE
ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
{
  ruby_whisper_segment *rws;
  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
  /* rw is not referenced below; the lookup is retained from the original. */
  ruby_whisper *rw;
  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);

  VALUE hash = rb_hash_new();
  long n_keys;
  if (NIL_P(keys)) {
    keys = key_names;
    n_keys = N_KEY_NAMES;
  } else {
    /* Pattern matching always passes an Array or nil, but the method can
     * also be called directly; reject anything else before RARRAY_LEN
     * touches it. */
    Check_Type(keys, T_ARRAY);
    n_keys = RARRAY_LEN(keys);
    if (n_keys > N_KEY_NAMES) {
      return hash;
    }
  }

  for (long i = 0; i < n_keys; i++) {
    VALUE key = rb_ary_entry(keys, i);
    if (key == sym_start_time) {
      rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
    } else if (key == sym_end_time) {
      rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
    } else if (key == sym_text) {
      rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
    } else if (key == sym_no_speech_prob) {
      rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
    } else if (key == sym_speaker_turn_next) {
      rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
    }
  }

  return hash;
}

/*
 * Defines Whisper::Segment under *mWhisper, caches the symbols used by
 * deconstruct_keys, and registers the segment's instance methods.
 *
 * cContext is accepted but not used in this function.
 */
void
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
{
  cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);

  sym_start_time = ID2SYM(rb_intern("start_time"));
  sym_end_time = ID2SYM(rb_intern("end_time"));
  sym_text = ID2SYM(rb_intern("text"));
  sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
  sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));

  /* key_names is an Array referenced only from a C static, which the GC does
   * not scan by itself. Register the variable's address before assigning so
   * the array cannot be collected while deconstruct_keys still needs it. */
  rb_global_variable(&key_names);
  key_names = rb_ary_new3(
    N_KEY_NAMES,
    sym_start_time,
    sym_end_time,
    sym_text,
    sym_no_speech_prob,
    sym_speaker_turn_next
  );

  rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
  rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
  rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
  /* Both spellings are bound to the same getter. */
  rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
  rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
  rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
  rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
  rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
}
7 changes: 4 additions & 3 deletions bindings/ruby/ext/ruby_whisper_transcribe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,16 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
fprintf(stderr, "failed to process audio\n");
return self;
}
if (NIL_P(blk)) {
return self;
}
const int n_segments = whisper_full_n_segments(rw->context);
VALUE output = rb_str_new2("");
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(rw->context, i);
output = rb_str_concat(output, rb_str_new2(text));
}
if (blk != Qnil) {
rb_funcall(blk, id_call, 1, output);
}
rb_funcall(blk, id_call, 1, output);
return self;
}
#ifdef __cplusplus
Expand Down
15 changes: 15 additions & 0 deletions bindings/ruby/lib/whisper/context.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
module Whisper
  class Context
    # Builds an SRT document from every segment yielded by #each_segment.
    #
    # Each entry is the 1-based cue number on its own line, followed by
    # Segment#to_srt_cue and one extra newline. Returns an empty String when
    # there are no segments.
    def to_srt
      # `+""` yields an explicitly mutable buffer, so appending works even
      # when string literals are frozen.
      each_segment.with_index(1).each_with_object(+"") do |(segment, number), srt|
        srt << "#{number}\n#{segment.to_srt_cue}\n"
      end
    end

    # Builds a WebVTT document from every segment yielded by #each_segment.
    #
    # The result starts with the "WEBVTT" header and a blank line; each entry
    # is the 1-based cue number on its own line, followed by
    # Segment#to_webvtt_cue and one extra newline.
    def to_webvtt
      each_segment.with_index(1).each_with_object(+"WEBVTT\n\n") do |(segment, number), webvtt|
        webvtt << "#{number}\n#{segment.to_webvtt_cue}\n"
      end
    end
  end
end
58 changes: 58 additions & 0 deletions bindings/ruby/lib/whisper/segment.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
module Whisper
  class Segment
    # Characters replaced by character references in cue text.
    SRT_ESCAPES = {
      "&" => "&amp;",
      "<" => "&lt;",
      ">" => "&gt;",
    }.freeze
    SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys).freeze
    private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE

    # Returns this segment as an SRT cue without its sequence number:
    # a "start --> end" timing line followed by the escaped text, each
    # terminated by a newline. Times use a comma before the milliseconds.
    def to_srt_cue
      "#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
    end

    # Returns this segment as a WebVTT cue without its identifier:
    # a "start --> end" timing line followed by the escaped text, each
    # terminated by a newline. Times use a dot before the milliseconds.
    def to_webvtt_cue
      "#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
    end

    private

    # Splits a duration in milliseconds into [hours, minutes, seconds, milliseconds].
    def time_to_a(time)
      sec, decimal_part = time.divmod(1000)
      min, sec = sec.divmod(60)
      hour, min = min.divmod(60)
      [hour, min, sec, decimal_part]
    end

    # Formats milliseconds as "HH:MM:SS,mmm".
    def srt_time(time)
      "%02d:%02d:%02d,%03d" % time_to_a(time)
    end

    def srt_start_time
      srt_time(start_time)
    end

    def srt_end_time
      srt_time(end_time)
    end

    # Segment text with "&", "<" and ">" replaced according to SRT_ESCAPES.
    def srt_text
      text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
    end

    # Formats milliseconds as "HH:MM:SS.mmm".
    def webvtt_time(time)
      "%02d:%02d:%02d.%03d" % time_to_a(time)
    end

    def webvtt_start_time
      webvtt_time(start_time)
    end

    def webvtt_end_time
      webvtt_time(end_time)
    end

    # WebVTT cue text uses the same escaping as SRT.
    alias webvtt_text srt_text
  end
end
24 changes: 23 additions & 1 deletion bindings/ruby/sig/whisper.rbs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ module Whisper
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
| (Params, _Samples, ?Integer n_samples) -> self
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self

def to_srt: () -> String
def to_webvtt: () -> String
end

class Params
Expand Down Expand Up @@ -415,6 +418,14 @@ module Whisper
end

class Segment
type deconstructed_keys = {
start_time: (Integer | nil),
end_time: (Integer | nil),
text: (String | nil),
no_speech_prob: (Float | nil),
speaker_turn_next: (true | false | nil)
}

# Start time in milliseconds.
#
def start_time: () -> Integer
Expand All @@ -424,10 +435,21 @@ module Whisper
def end_time: () -> Integer

# Whether the next segment is predicted as a speaker turn.
def speaker_next_turn?: () -> (true | false)
def speaker_turn_next?: () -> (true | false)

def text: () -> String
def no_speech_prob: () -> Float
def to_srt_cue: () -> String
def to_webvtt_cue: () -> String

# Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
#
# whisper.each_segment do |segment|
# segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
#
# puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
# end
def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
end

module VAD
Expand Down
62 changes: 62 additions & 0 deletions bindings/ruby/test/test_segment.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,66 @@ def test_on_new_segment_twice
end
whisper.transcribe(AUDIO, params)
end

# Destructures the first segment with rightward pattern matching on all five
# keys and checks every bound value against the matching reader method.
def test_pattern_matching
  first_segment = whisper.each_segment.first
  first_segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}

  [
    [first_segment.start_time, start_time],
    [first_segment.end_time, end_time],
    [first_segment.text, text],
    [first_segment.no_speech_prob, no_speech_prob],
    [first_segment.speaker_turn_next?, speaker_turn_next]
  ].each do |expected, actual|
    assert_equal expected, actual
  end
end

# Destructures the first segment with only a subset of the supported keys and
# checks each bound value against the matching reader method.
def test_pattern_matching_partial
  first_segment = whisper.each_segment.first
  first_segment => {start_time:, end_time:, text:}

  [
    [first_segment.start_time, start_time],
    [first_segment.end_time, end_time],
    [first_segment.text, text]
  ].each do |expected, actual|
    assert_equal expected, actual
  end
end

# Calls deconstruct_keys directly with all five supported keys and expects a
# hash whose values equal what the reader methods return.
def test_deconstruct_keys
  first_segment = whisper.each_segment.first
  from_readers = {
    start_time: first_segment.start_time,
    end_time: first_segment.end_time,
    text: first_segment.text,
    no_speech_prob: first_segment.no_speech_prob,
    speaker_turn_next: first_segment.speaker_turn_next?
  }
  requested_keys = [:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next]

  assert_equal from_readers, first_segment.deconstruct_keys(requested_keys)
end

# Expectation: requesting only an unsupported key yields an empty hash.
# The test is currently disabled through `omit` with the message
# "Undefined behavior", i.e. this result is not treated as a guaranteed contract.
def test_deconstruct_keys_non_existent
omit "Undefined behavior"

segment = whisper.each_segment.first

assert_equal({}, segment.deconstruct_keys([:non_existent]))
end

# Expectation: requesting six keys (the five supported ones plus :extra_key)
# yields an empty hash rather than a partial result.
# The test is currently disabled through `omit` with the message
# "Undefined behavior", i.e. this result is not treated as a guaranteed contract.
def test_deconstruct_keys_too_many_keys
omit "Undefined behavior"

segment = whisper.each_segment.first

assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
end

# Expectation: when five keys are requested and one of them is unsupported,
# the supported four are returned and the unknown key is left out.
# The test is currently disabled through `omit` with the message
# "Undefined behavior", i.e. this result is not treated as a guaranteed contract.
def test_deconstruct_keys_includes_non_existent_keys_not_too_many
omit "Undefined behavior"

segment = whisper.each_segment.first

# :speaker_turn_next is deliberately replaced by :non_existent in the request.
expected = {
start_time: segment.start_time,
end_time: segment.end_time,
text: segment.text,
no_speech_prob: segment.no_speech_prob
}
assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
end
end
Loading
Loading