Skip to content

Commit fbead67

Browse files
ruby : output format (#3237)
* Fix a typo * Don't allocate output string unless needed * Add methods to output SRT and WebVTT * Add tests for output methods * Make constants for output private * Add signatures for output methods * Add documentation on output methods * Fix method name: Segment#speaker_next_turn? -> #speaker_turn_next? * Add Whisper::Segment#deconstruct_keys * Add test for Whisper::Segment#deconstruct_keys * Add signature of Whisper::Segment#deconstruct_keys * Use parentheses to suppress warning * Update date
1 parent d78f081 commit fbead67

File tree

11 files changed

+317
-10
lines changed

11 files changed

+317
-10
lines changed

bindings/ruby/README.md

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,32 @@ Whisper::Params.new(
162162

163163
For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
164164

165+
### Output ###
166+
167+
whispercpp supports SRT and WebVTT output:
168+
169+
```ruby
170+
puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
171+
# =>
172+
WEBVTT
173+
174+
1
175+
00:00:00.000 --> 00:00:03.860
176+
My thought I have nobody by a beauty and will as you poured.
177+
178+
2
179+
00:00:03.860 --> 00:00:09.840
180+
Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
181+
182+
3
183+
00:00:09.840 --> 00:00:09.940
184+
a
185+
186+
```
187+
188+
You may also call `#to_srt` to get the same result in SRT format.
189+
190+
165191
API
166192
---
167193
@@ -196,7 +222,7 @@ whisper
196222
ed: format_time(segment.end_time),
197223
text: segment.text
198224
}
199-
line << " (speaker turned)" if segment.speaker_next_turn?
225+
line << " (speaker turned)" if segment.speaker_turn_next?
200226
puts line
201227
end
202228
@@ -212,7 +238,7 @@ params.on_new_segment do |segment|
212238
ed: format_time(segment.end_time),
213239
text: segment.text
214240
}
215-
line << " (speaker turned)" if segment.speaker_next_turn?
241+
line << " (speaker turned)" if segment.speaker_turn_next?
216242
puts line
217243
end
218244

bindings/ruby/ext/ruby_whisper.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,5 +170,7 @@ void Init_whisper() {
170170
init_ruby_whisper_model(&mWhisper);
171171
init_ruby_whisper_vad_params(&mVAD);
172172

173+
rb_require("whisper/context");
174+
rb_require("whisper/segment");
173175
rb_require("whisper/model/uri");
174176
}

bindings/ruby/ext/ruby_whisper_context.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -664,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
664664
rb_define_method(cContext, "full", ruby_whisper_full, -1);
665665
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
666666

667-
// High leve
667+
// High level
668668
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
669669
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
670670

bindings/ruby/ext/ruby_whisper_segment.c

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
#include <ruby.h>
22
#include "ruby_whisper.h"
33

4+
#define N_KEY_NAMES 5
5+
6+
static VALUE sym_start_time;
7+
static VALUE sym_end_time;
8+
static VALUE sym_text;
9+
static VALUE sym_no_speech_prob;
10+
static VALUE sym_speaker_turn_next;
11+
static VALUE key_names;
12+
413
extern const rb_data_type_t ruby_whisper_type;
514

615
extern VALUE cSegment;
@@ -129,15 +138,83 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self)
129138
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
130139
}
131140

141+
/*
142+
* call-seq:
143+
* deconstruct_keys(keys) -> hash
144+
*
145+
* Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
146+
*
147+
* whisper.each_segment do |segment|
148+
* segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
149+
*
150+
* puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
151+
* end
152+
*/
153+
static VALUE
154+
ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
155+
{
156+
ruby_whisper_segment *rws;
157+
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
158+
ruby_whisper *rw;
159+
TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
160+
161+
VALUE hash = rb_hash_new();
162+
long n_keys;
163+
if (NIL_P(keys)) {
164+
keys = key_names;
165+
n_keys = N_KEY_NAMES;
166+
} else {
167+
n_keys = RARRAY_LEN(keys);
168+
if (n_keys > N_KEY_NAMES) {
169+
return hash;
170+
}
171+
}
172+
for (int i = 0; i < n_keys; i++) {
173+
VALUE key = rb_ary_entry(keys, i);
174+
if (key == sym_start_time) {
175+
rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
176+
}
177+
if (key == sym_end_time) {
178+
rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
179+
}
180+
if (key == sym_text) {
181+
rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
182+
}
183+
if (key == sym_no_speech_prob) {
184+
rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
185+
}
186+
if (key == sym_speaker_turn_next) {
187+
rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
188+
}
189+
}
190+
191+
return hash;
192+
}
193+
132194
void
133195
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
134196
{
135197
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
136198

199+
sym_start_time = ID2SYM(rb_intern("start_time"));
200+
sym_end_time = ID2SYM(rb_intern("end_time"));
201+
sym_text = ID2SYM(rb_intern("text"));
202+
sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
203+
sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
204+
key_names = rb_ary_new3(
205+
N_KEY_NAMES,
206+
sym_start_time,
207+
sym_end_time,
208+
sym_text,
209+
sym_no_speech_prob,
210+
sym_speaker_turn_next
211+
);
212+
137213
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
138214
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
139215
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
140-
rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
216+
rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
141217
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
142218
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
219+
rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
143220
}

bindings/ruby/ext/ruby_whisper_transcribe.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,15 +76,16 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
7676
fprintf(stderr, "failed to process audio\n");
7777
return self;
7878
}
79+
if (NIL_P(blk)) {
80+
return self;
81+
}
7982
const int n_segments = whisper_full_n_segments(rw->context);
8083
VALUE output = rb_str_new2("");
8184
for (int i = 0; i < n_segments; ++i) {
8285
const char * text = whisper_full_get_segment_text(rw->context, i);
8386
output = rb_str_concat(output, rb_str_new2(text));
8487
}
85-
if (blk != Qnil) {
86-
rb_funcall(blk, id_call, 1, output);
87-
}
88+
rb_funcall(blk, id_call, 1, output);
8889
return self;
8990
}
9091
#ifdef __cplusplus

bindings/ruby/lib/whisper/context.rb

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# frozen_string_literal: true

module Whisper
  class Context
    # Renders the segments yielded by +each_segment+ as SRT text.
    #
    # Cues are numbered from 1 in iteration order; each entry is the cue
    # number, the segment's +to_srt_cue+ output, and a trailing newline.
    #
    # @return [String] a new mutable string; empty when there are no segments
    def to_srt
      # +"" yields an unfrozen buffer, so appending is safe with frozen string literals.
      each_segment.with_index(1).reduce(+"") do |srt, (segment, number)|
        srt << "#{number}\n#{segment.to_srt_cue}\n"
      end
    end

    # Renders the segments yielded by +each_segment+ as WebVTT text.
    #
    # The output starts with the "WEBVTT" header and a blank line, followed by
    # one entry per segment: the cue number (from 1), the segment's
    # +to_webvtt_cue+ output, and a trailing newline.
    #
    # @return [String] a new mutable string; just the header when there are no segments
    def to_webvtt
      each_segment.with_index(1).reduce(+"WEBVTT\n\n") do |webvtt, (segment, number)|
        webvtt << "#{number}\n#{segment.to_webvtt_cue}\n"
      end
    end
  end
end

bindings/ruby/lib/whisper/segment.rb

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# frozen_string_literal: true

module Whisper
  class Segment
    # Characters replaced in cue text, mapped to their escaped form.
    # Frozen so the constants cannot be mutated at runtime.
    SRT_ESCAPES = {
      "&" => "&amp;",
      "<" => "&lt;",
      ">" => "&gt;",
    }.freeze
    SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys).freeze
    private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE

    # Formats this segment as an SRT cue body (without the cue number).
    #
    # @return [String] "HH:MM:SS,mmm --> HH:MM:SS,mmm\n<escaped text>\n"
    def to_srt_cue
      "#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
    end

    # Formats this segment as a WebVTT cue body (without the cue number).
    #
    # @return [String] "HH:MM:SS.mmm --> HH:MM:SS.mmm\n<escaped text>\n"
    def to_webvtt_cue
      "#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
    end

    private

    # Splits a time into clock components, treating the last three decimal
    # digits as the fractional part of a second (i.e. +time+ in milliseconds).
    #
    # @param time [Integer]
    # @return [Array<Integer>] [hours, minutes, seconds, milliseconds]
    def time_to_a(time)
      sec, decimal_part = time.divmod(1000)
      min, sec = sec.divmod(60)
      hour, min = min.divmod(60)
      [hour, min, sec, decimal_part]
    end

    # SRT timestamps use a comma before the milliseconds.
    def srt_time(time)
      "%02d:%02d:%02d,%03d" % time_to_a(time)
    end

    def srt_start_time
      srt_time(start_time)
    end

    def srt_end_time
      srt_time(end_time)
    end

    # Segment text with "&", "<" and ">" replaced by their entities.
    def srt_text
      text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
    end

    # WebVTT timestamps use a period before the milliseconds.
    def webvtt_time(time)
      "%02d:%02d:%02d.%03d" % time_to_a(time)
    end

    def webvtt_start_time
      webvtt_time(start_time)
    end

    def webvtt_end_time
      webvtt_time(end_time)
    end

    # WebVTT cue text is escaped the same way as SRT cue text.
    alias webvtt_text srt_text
  end
end

bindings/ruby/sig/whisper.rbs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ module Whisper
116116
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
117117
| (Params, _Samples, ?Integer n_samples) -> self
118118
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
119+
120+
def to_srt: () -> String
121+
def to_webvtt: () -> String
119122
end
120123

121124
class Params
@@ -415,6 +418,14 @@ module Whisper
415418
end
416419

417420
class Segment
421+
type deconstructed_keys = {
422+
start_time: (Integer | nil),
423+
end_time: (Integer | nil),
424+
text: (String | nil),
425+
no_speech_prob: (Float | nil),
426+
speaker_turn_next: (true | false | nil)
427+
}
428+
418429
# Start time in milliseconds.
419430
#
420431
def start_time: () -> Integer
@@ -424,10 +435,21 @@ module Whisper
424435
def end_time: () -> Integer
425436

426437
# Whether the next segment is predicted as a speaker turn.
427-
def speaker_next_turn?: () -> (true | false)
438+
def speaker_turn_next?: () -> (true | false)
428439

429440
def text: () -> String
430441
def no_speech_prob: () -> Float
442+
def to_srt_cue: () -> String
443+
def to_webvtt_cue: () -> String
444+
445+
# Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
446+
#
447+
# whisper.each_segment do |segment|
448+
# segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
449+
#
450+
# puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
451+
# end
452+
def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
431453
end
432454

433455
module VAD

bindings/ruby/test/test_segment.rb

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,4 +71,66 @@ def test_on_new_segment_twice
7171
end
7272
whisper.transcribe(AUDIO, params)
7373
end
74+
75+
# Rightward pattern matching on a segment binds every supported key, and each
# bound value equals what the corresponding reader method returns.
def test_pattern_matching
  first_segment = whisper.each_segment.first
  first_segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}

  readers = %i[start_time end_time text no_speech_prob speaker_turn_next?]
  bound = [start_time, end_time, text, no_speech_prob, speaker_turn_next]
  readers.zip(bound) do |reader, value|
    assert_equal first_segment.public_send(reader), value
  end
end
85+
86+
# Pattern matching with only a subset of the supported keys still binds those
# keys to the values of the corresponding reader methods.
def test_pattern_matching_partial
  first_segment = whisper.each_segment.first
  first_segment => {start_time:, end_time:, text:}

  {start_time: start_time, end_time: end_time, text: text}.each do |reader, value|
    assert_equal first_segment.public_send(reader), value
  end
end
94+
95+
# Calling deconstruct_keys with all five supported keys returns a hash holding
# the value of every corresponding reader.
def test_deconstruct_keys
  first_segment = whisper.each_segment.first
  all_keys = [:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next]

  expected = {
    start_time: first_segment.start_time,
    end_time: first_segment.end_time,
    text: first_segment.text,
    no_speech_prob: first_segment.no_speech_prob,
    speaker_turn_next: first_segment.speaker_turn_next?
  }
  actual = first_segment.deconstruct_keys(all_keys)

  assert_equal expected, actual
end
106+
107+
# Omitted: the result for an unsupported key is treated as undefined behavior.
# The assertion below records the currently expected empty hash.
def test_deconstruct_keys_non_existent
  omit "Undefined behavior"

  first_segment = whisper.each_segment.first
  unknown_keys = [:non_existent]

  assert_equal({}, first_segment.deconstruct_keys(unknown_keys))
end
114+
115+
# Omitted: the result when more keys are requested than are supported is
# treated as undefined behavior. The assertion below records the currently
# expected empty hash.
def test_deconstruct_keys_too_many_keys
  omit "Undefined behavior"

  first_segment = whisper.each_segment.first
  requested = [:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]

  assert_equal({}, first_segment.deconstruct_keys(requested))
end
122+
123+
# Omitted: mixing supported and unsupported keys (without exceeding the number
# of supported keys) is treated as undefined behavior. The assertion below
# records the expectation that only the supported keys appear in the result.
def test_deconstruct_keys_includes_non_existent_keys_not_too_many
  omit "Undefined behavior"

  first_segment = whisper.each_segment.first
  requested = [:start_time, :end_time, :text, :no_speech_prob, :non_existent]

  expected = {
    start_time: first_segment.start_time,
    end_time: first_segment.end_time,
    text: first_segment.text,
    no_speech_prob: first_segment.no_speech_prob
  }
  actual = first_segment.deconstruct_keys(requested)

  assert_equal(expected, actual)
end
74136
end

0 commit comments

Comments
 (0)