
Commit fe416b4

misc sync improvements (see HISTORY.rst for 0.4.11)

1 parent e33f117 commit fe416b4

File tree

6 files changed: +62 -39 lines changed

HISTORY.rst

Lines changed: 7 additions & 0 deletions

@@ -179,3 +179,10 @@ History
 * Filter out metadata in subtitles when extracting speech;
 * Add experimental --golden-section-search over framerate ratio (off by default);
 * Try to improve sync by inferring framerate ratio based on relative duration of synced vs unsynced;
+
+0.4.11 (2021-01-29)
+-------------------
+* Misc sync improvements:
+    * Have webrtcvad use '0' as the non speech label instead of 0.5;
+    * Allow the vad non speech label to be specified via the --non-speech-label command line parameter;
+    * Don't try to infer framerate ratio based on length between first and last speech frames for non-subtitle speech detection;
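In short: the speech arrays that ffsubsync aligns hold 1.0 for frames detected as speech, and the non-speech label fills the rest. A toy sketch of the old vs. new webrtcvad defaults (values illustrative, not real VAD output):

import numpy as np

is_speech = np.array([False, True, True, False])
old_track = np.where(is_speech, 1., 0.5)  # pre-0.4.11 default: 0.5 meant "not sure"
new_track = np.where(is_speech, 1., 0.)   # 0.4.11 default: plain 0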

ffsubsync/constants.py

Lines changed: 1 addition & 0 deletions

@@ -6,6 +6,7 @@
 FRAMERATE_RATIOS = [24./23.976, 25./23.976, 25./24.]

 DEFAULT_FRAME_RATE = 48000
+DEFAULT_NON_SPEECH_LABEL = 0.
 DEFAULT_ENCODING = 'infer'
 DEFAULT_MAX_SUBTITLE_SECONDS = 10
 DEFAULT_START_SECONDS = 0

ffsubsync/ffsubsync.py

Lines changed: 15 additions & 10 deletions

@@ -116,7 +116,7 @@ def try_sync(args, reference_pipe, result):
                 continue
             else:
                 srt_pipe.fit(srtin)
-        if not args.skip_infer_framerate_ratio:
+        if not args.skip_infer_framerate_ratio and hasattr(reference_pipe[-1], 'num_frames'):
             inferred_framerate_ratio_from_length = float(reference_pipe[-1].num_frames) / srt_pipes[0][-1].num_frames
             logger.info('inferred frameratio ratio: %.3f' % inferred_framerate_ratio_from_length)
             srt_pipes.append(srt_pipe_maker(inferred_framerate_ratio_from_length).fit(srtin))
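For context, this heuristic divides the reference pipeline's num_frames (frames spanned between first and last detected speech) by the subtitle pipeline's. A hypothetical sketch with made-up frame counts:

# Hypothetical numbers, for illustration only.
ref_num_frames = 90000    # speech span of the reference, in frames
srt_num_frames = 86313    # speech span of the unsynced subtitles
inferred_ratio = float(ref_num_frames) / srt_num_frames
print('%.3f' % inferred_ratio)  # 1.043, close to 25/23.976 -- worth trying as a scale factor

The new hasattr guard is needed because, after this commit, only subtitle-based pipelines still compute speech-frame boundaries (see the ComputeSpeechFrameBoundariesMixin removals in speech_transformers.py below).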
@@ -185,7 +185,7 @@ def make_reference_pipe(args):
         if args.vad is not None:
             logger.warning('Vad specified, but reference was not a movie')
         return Pipeline([
-            ('deserialize', DeserializeSpeechTransformer())
+            ('deserialize', DeserializeSpeechTransformer(args.non_speech_label))
         ])
     else:
         vad = args.vad or DEFAULT_VAD
@@ -195,14 +195,17 @@ def make_reference_pipe(args):
         if ref_stream is not None and not ref_stream.startswith('0:'):
             ref_stream = '0:' + ref_stream
         return Pipeline([
-            ('speech_extract', VideoSpeechTransformer(vad=vad,
-                                                      sample_rate=SAMPLE_RATE,
-                                                      frame_rate=args.frame_rate,
-                                                      start_seconds=args.start_seconds,
-                                                      ffmpeg_path=args.ffmpeg_path,
-                                                      ref_stream=ref_stream,
-                                                      vlc_mode=args.vlc_mode,
-                                                      gui_mode=args.gui_mode))
+            ('speech_extract', VideoSpeechTransformer(
+                vad=vad,
+                sample_rate=SAMPLE_RATE,
+                frame_rate=args.frame_rate,
+                non_speech_label=args.non_speech_label,
+                start_seconds=args.start_seconds,
+                ffmpeg_path=args.ffmpeg_path,
+                ref_stream=ref_stream,
+                vlc_mode=args.vlc_mode,
+                gui_mode=args.gui_mode
+            )),
         ])

@@ -392,6 +395,8 @@ def add_cli_only_args(parser):
                         help='Frame rate for audio extraction (default=%d).' % DEFAULT_FRAME_RATE)
     parser.add_argument('--skip-infer-framerate-ratio', action='store_true',
                         help='If set, do not try to infer framerate ratio based on duration ratio.')
+    parser.add_argument('--non-speech-label', type=float, default=DEFAULT_NON_SPEECH_LABEL,
+                        help='Label to use for frames detected as non-speech (default=%f)' % DEFAULT_NON_SPEECH_LABEL)
     parser.add_argument('--output-encoding', default='utf-8',
                         help='What encoding to use for writing output subtitles '
                              '(default=utf-8). Can indicate "same" to use same '
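A typical invocation with the new flag, for reference (file names are placeholders; the positional reference / -i / -o form follows ffsubsync's documented usage):

ffsubsync reference.mp4 -i unsynced.srt -o synced.srt --non-speech-label 0.5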

ffsubsync/speech_transformers.py

Lines changed: 37 additions & 28 deletions

@@ -59,7 +59,7 @@ def subpipe_maker(framerate_ratio):
     return subpipe_maker(scale_factor)


-def _make_auditok_detector(sample_rate, frame_rate):
+def _make_auditok_detector(sample_rate, frame_rate, non_speech_label):
     try:
         from auditok import \
             BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer
@@ -76,31 +76,37 @@ def _make_auditok_detector(sample_rate, frame_rate):
     bytes_per_frame = 2
     frames_per_window = frame_rate // sample_rate
     validator = AudioEnergyValidator(
-        sample_width=bytes_per_frame, energy_threshold=50)
+        sample_width=bytes_per_frame, energy_threshold=50
+    )
     tokenizer = StreamTokenizer(
-        validator=validator, min_length=0.2*sample_rate,
-        max_length=int(5*sample_rate),
-        max_continuous_silence=0.25*sample_rate)
+        validator=validator,
+        min_length=0.2 * sample_rate,
+        max_length=int(5 * sample_rate),
+        max_continuous_silence=0.25 * sample_rate
+    )

     def _detect(asegment):
-        asource = BufferAudioSource(data_buffer=asegment,
-                                    sampling_rate=frame_rate,
-                                    sample_width=bytes_per_frame,
-                                    channels=1)
+        asource = BufferAudioSource(
+            data_buffer=asegment,
+            sampling_rate=frame_rate,
+            sample_width=bytes_per_frame,
+            channels=1
+        )
         ads = ADSFactory.ads(audio_source=asource, block_dur=1./sample_rate)
         ads.open()
         tokens = tokenizer.tokenize(ads)
-        length = (len(asegment)//bytes_per_frame
-                  + frames_per_window - 1)//frames_per_window
-        media_bstring = np.zeros(length+1, dtype=int)
+        length = (
+            len(asegment)//bytes_per_frame + frames_per_window - 1
+        ) // frames_per_window
+        media_bstring = np.zeros(length + 1)
         for token in tokens:
-            media_bstring[token[1]] += 1
-            media_bstring[token[2]+1] -= 1
-        return (np.cumsum(media_bstring)[:-1] > 0).astype(float)
+            media_bstring[token[1]] = 1.
+            media_bstring[token[2] + 1] = non_speech_label - 1.
+        return np.clip(np.cumsum(media_bstring)[:-1], 0., 1.)
     return _detect


-def _make_webrtcvad_detector(sample_rate, frame_rate):
+def _make_webrtcvad_detector(sample_rate, frame_rate, non_speech_label):
     import webrtcvad
     vad = webrtcvad.Vad()
     vad.set_mode(3)  # set non-speech pruning aggressiveness from 0 to 3
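The rewritten token loop is worth unpacking: writing 1. at each token's start and non_speech_label - 1. just past its end makes the cumulative sum equal 1 inside tokens and non_speech_label after them, with the clip handling overlapping markers. A minimal standalone sketch of the trick (not ffsubsync's API; one token spanning frames 3..5, label 0.5):

import numpy as np

non_speech_label = 0.5
length = 10
media_bstring = np.zeros(length + 1)
start, end = 3, 5                               # inclusive frames of one detected token
media_bstring[start] = 1.
media_bstring[end + 1] = non_speech_label - 1.
print(np.clip(np.cumsum(media_bstring)[:-1], 0., 1.))
# -> [0.  0.  0.  1.  1.  1.  0.5 0.5 0.5 0.5]

Note that frames before the first token stay 0 rather than non_speech_label, since nothing has been accumulated yet.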
@@ -123,7 +129,7 @@ def _detect(asegment):
                 is_speech = False
                 failures += 1
             # webrtcvad has low recall on mode 3, so treat non-speech as "not sure"
-            media_bstring.append(1. if is_speech else 0.5)
+            media_bstring.append(1. if is_speech else non_speech_label)
         return np.array(media_bstring)

     return _detect
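Why the default label matters: ffsubsync scores candidate offsets roughly by correlating the reference track against subtitle speech, so a 0.5 filler gives silent stretches partial credit at every offset, while 0 makes only true speech overlap count. A toy dot-product stand-in for the real FFT-based scoring (illustrative only; not the library's code):

import numpy as np

def score(ref, sub, offset):
    return float(np.dot(ref, np.roll(sub, offset)))

sub = np.array([0., 1., 1., 0., 0., 0.])            # subtitle speech mask
speech = np.array([0, 0, 1, 1, 0, 0], dtype=bool)   # true speech in the reference
for label in (0.5, 0.0):
    ref = np.where(speech, 1., label)
    print(label, [score(ref, sub, k) for k in range(3)])
# 0.5 -> [1.5, 2.0, 1.5]   (shallower peak at the correct offset)
# 0.0 -> [1.0, 2.0, 1.0]   (sharper peak)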
@@ -141,20 +147,23 @@ def num_frames(self):
         return self.end_frame_ - self.start_frame_

     def fit_boundaries(self, speech_frames):
-        nz = np.nonzero(speech_frames)[0]
+        nz = np.nonzero(speech_frames > 0.5)[0]
         if len(nz) > 0:
             self.start_frame_ = np.min(nz)
             self.end_frame_ = np.max(nz)
         return self


-class VideoSpeechTransformer(TransformerMixin, ComputeSpeechFrameBoundariesMixin):
-    def __init__(self, vad, sample_rate, frame_rate, start_seconds=0,
-                 ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False):
+class VideoSpeechTransformer(TransformerMixin):
+    def __init__(
+            self, vad, sample_rate, frame_rate, non_speech_label, start_seconds=0,
+            ffmpeg_path=None, ref_stream=None, vlc_mode=False, gui_mode=False
+    ):
         super(VideoSpeechTransformer, self).__init__()
         self.vad = vad
         self.sample_rate = sample_rate
         self.frame_rate = frame_rate
+        self._non_speech_label = non_speech_label
         self.start_seconds = start_seconds
         self.ffmpeg_path = ffmpeg_path
         self.ref_stream = ref_stream
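The new > 0.5 threshold in fit_boundaries is needed because speech frames may now carry a nonzero non-speech label; plain np.nonzero would count those frames as speech:

import numpy as np

frames = np.array([0.5, 0.5, 1., 1., 0.5])
print(np.nonzero(frames)[0])         # [0 1 2 3 4] -- every frame looks "on"
print(np.nonzero(frames > 0.5)[0])   # [2 3] -- only the actual speech frames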
@@ -197,7 +206,6 @@ def try_fit_using_embedded_subs(self, fname):
         # use longest set of embedded subs
         subs_to_use = embedded_subs[int(np.argmax(embedded_subs_times))]
         self.video_speech_results_ = subs_to_use.subtitle_speech_results_
-        self.fit_boundaries(self.video_speech_results_)

     def fit(self, fname, *_):
         if 'subs' in self.vad and (self.ref_stream is None or self.ref_stream.startswith('0:s:')):
@@ -216,9 +224,9 @@ def fit(self, fname, *_):
             logger.warning(e)
             total_duration = None
         if 'webrtc' in self.vad:
-            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate)
+            detector = _make_webrtcvad_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         elif 'auditok' in self.vad:
-            detector = _make_auditok_detector(self.sample_rate, self.frame_rate)
+            detector = _make_auditok_detector(self.sample_rate, self.frame_rate, self._non_speech_label)
         else:
             raise ValueError('unknown vad: %s' % self.vad)
         media_bstring = []
@@ -284,7 +292,6 @@ def redirect_stderr(enter_result=None):
                 'Unable to detect speech. Perhaps try specifying a different stream / track, or a different vad.'
             )
         self.video_speech_results_ = np.concatenate(media_bstring)
-        self.fit_boundaries(self.video_speech_results_)
         return self

     def transform(self, *_):
@@ -300,6 +307,7 @@ def transform(self, *_):
         }


+# TODO: need way better metadata detector
 def _is_metadata(content, is_beginning_or_end):
     content = content.strip()
     if len(content) == 0:
@@ -348,9 +356,10 @@ def transform(self, *_):
         return self.subtitle_speech_results_


-class DeserializeSpeechTransformer(TransformerMixin, ComputeSpeechFrameBoundariesMixin):
-    def __init__(self):
+class DeserializeSpeechTransformer(TransformerMixin):
+    def __init__(self, non_speech_label):
         super(DeserializeSpeechTransformer, self).__init__()
+        self._non_speech_label = non_speech_label
         self.deserialized_speech_results_ = None

     def fit(self, fname, *_):
@@ -361,8 +370,8 @@ def fit(self, fname, *_):
         else:
             raise ValueError('could not find "speech" array in '
                              'serialized file; only contains: %s' % speech.files)
+        speech[speech < 1.] = self._non_speech_label
         self.deserialized_speech_results_ = speech
-        self.fit_boundaries(self.deserialized_speech_results_)
         return self

     def transform(self, *_):
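The added relabeling line makes serialized speech tracks follow the same convention as fresh VAD output: any saved score below 1.0 becomes the configured non-speech label. A minimal sketch (array values illustrative):

import numpy as np

non_speech_label = 0.                     # DEFAULT_NON_SPEECH_LABEL
speech = np.array([0., 0.5, 1., 1., 0.])  # as if deserialized from an .npz file
speech[speech < 1.] = non_speech_label
print(speech)                             # [0. 0. 1. 1. 0.]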

requirements-dev.txt

Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 # testing
 flake8
 pytest
+pytest-cov
 pyyaml;python_version!="3.4"
 twine;python_version!="3.4"
 versioneer
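pytest-cov adds coverage reporting to the test run; a typical invocation (not necessarily this repo's CI command):

pytest --cov=ffsubsync --cov-report=term-missing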

test-data

Lines changed: 1 addition & 1 deletion (submodule pointer update; diff not rendered)
