Skip to content

Commit 6975d36

Browse files
committed
experimental golden section search; change max offset seconds to 60; misc bugfixes for 0.4.9 release
1 parent e17077d commit 6975d36

File tree

9 files changed

+190
-50
lines changed

9 files changed

+190
-50
lines changed

HISTORY.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,3 +165,10 @@ History
165165
0.4.8 (2020-09-22)
166166
------------------
167167
* Use webrtcvad-wheels on Windows to eliminate dependency on compiler;
168+
169+
0.4.9 (2020-10-11)
170+
------------------
171+
* Make the default max offset seconds 60 and enforce it during alignment, instead of discarding alignments whose offset exceeds max_offset_seconds;
172+
* Add experimental support for using golden section search to find the framerate ratio;
173+
* Restore ability to read stdin and write stdout after buggy permissions check;
174+
* Exceptions that occur during syncing were mistakenly suppressed; this is now fixed;

ffsubsync/aligners.py

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
import math
44

55
import numpy as np
6+
7+
from .constants import FRAMERATE_RATIOS
8+
from .golden_section_search import gss
69
from .sklearn_shim import TransformerMixin
710

811
logging.basicConfig(level=logging.INFO)
@@ -14,11 +17,25 @@ class FailedToFindAlignmentException(Exception):
1417

1518

1619
class FFTAligner(TransformerMixin):
17-
def __init__(self):
20+
def __init__(self, max_offset_samples=None):
21+
self.max_offset_samples = max_offset_samples
1822
self.best_offset_ = None
1923
self.best_score_ = None
2024
self.get_score_ = False
2125

26+
def _zero_out_extreme_offsets(self, convolve, substring):
27+
convolve = np.copy(convolve)
28+
if self.max_offset_samples is None:
29+
return convolve
30+
offset_to_index = lambda offset: len(convolve) - 1 + offset - len(substring)
31+
convolve[:offset_to_index(-self.max_offset_samples)] = convolve[offset_to_index(self.max_offset_samples):] = 0
32+
return convolve
33+
34+
def _compute_argmax(self, convolve, substring):
35+
best_idx = np.argmax(convolve)
36+
self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
37+
self.best_score_ = convolve[best_idx]
38+
2239
def fit(self, refstring, substring, get_score=False):
2340
refstring, substring = [
2441
list(map(int, s))
@@ -33,9 +50,9 @@ def fit(self, refstring, substring, get_score=False):
3350
subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(refstring)), substring))
3451
refft = np.fft.fft(np.flip(np.append(refstring, np.zeros(len(substring) + extra_zeros)), 0))
3552
convolve = np.real(np.fft.ifft(subft * refft))
36-
best_idx = np.argmax(convolve)
37-
self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
38-
self.best_score_ = convolve[best_idx]
53+
self._compute_argmax(self._zero_out_extreme_offsets(convolve, substring), substring)
54+
if self.best_score_ == 0.:
55+
self._compute_argmax(convolve, substring)
3956
self.get_score_ = get_score
4057
return self
4158

@@ -47,24 +64,40 @@ def transform(self, *_):
4764

4865

4966
class MaxScoreAligner(TransformerMixin):
50-
def __init__(self, base_aligner, sample_rate=None, max_offset_seconds=None):
67+
def __init__(self, base_aligner, srtin=None, sample_rate=None, max_offset_seconds=None):
68+
self.srtin = srtin
69+
if sample_rate is None or max_offset_seconds is None:
70+
self.max_offset_samples = None
71+
else:
72+
self.max_offset_samples = abs(int(max_offset_seconds * sample_rate))
5173
if isinstance(base_aligner, type):
52-
self.base_aligner = base_aligner()
74+
self.base_aligner = base_aligner(max_offset_samples=self.max_offset_samples)
5375
else:
5476
self.base_aligner = base_aligner
5577
self.max_offset_seconds = max_offset_seconds
56-
if sample_rate is None or max_offset_seconds is None:
57-
self.max_offset_samples = None
58-
else:
59-
self.max_offset_samples = abs(max_offset_seconds * sample_rate)
6078
self._scores = []
6179

80+
def fit_gss(self, refstring, subpipe_maker):
81+
def opt_func(framerate_ratio, is_last_iter):
82+
subpipe = subpipe_maker(framerate_ratio)
83+
substring = subpipe.fit_transform(self.srtin)
84+
score = self.base_aligner.fit_transform(refstring, substring, get_score=True)
85+
logger.info('got score %.0f (offset %d) for ratio %.3f', score[0], score[1], framerate_ratio)
86+
if is_last_iter:
87+
self._scores.append((score, subpipe))
88+
return -score[0]
89+
gss(opt_func, 0.9, 1.1)
90+
return self
91+
6292
def fit(self, refstring, subpipes):
6393
if not isinstance(subpipes, list):
6494
subpipes = [subpipes]
6595
for subpipe in subpipes:
66-
if hasattr(subpipe, 'transform'):
67-
substring = subpipe.transform(None)
96+
if callable(subpipe):
97+
self.fit_gss(refstring, subpipe)
98+
continue
99+
elif hasattr(subpipe, 'transform'):
100+
substring = subpipe.transform(self.srtin)
68101
else:
69102
substring = subpipe
70103
self._scores.append((
@@ -84,4 +117,4 @@ def transform(self, *_):
84117
'--max-offset-seconds with a number larger than '
85118
'{}'.format(self.max_offset_seconds))
86119
(score, offset), subpipe = max(scores, key=lambda x: x[0][0])
87-
return offset, subpipe
120+
return (score, offset), subpipe

ffsubsync/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
DEFAULT_START_SECONDS = 0
1212
DEFAULT_SCALE_FACTOR = 1
1313
DEFAULT_VAD = 'subs_then_webrtc'
14-
DEFAULT_MAX_OFFSET_SECONDS = 600
14+
DEFAULT_MAX_OFFSET_SECONDS = 60
1515

1616
SUBTITLE_EXTENSIONS = ('srt', 'ass', 'ssa', 'sub')
1717

ffsubsync/ffsubsync.py

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -77,21 +77,34 @@ def make_test_case(args, npy_savename, sync_was_successful):
7777

7878
def try_sync(args, reference_pipe, srt_pipes, result):
7979
sync_was_successful = True
80+
exc = None
8081
try:
81-
logger.info('extracting speech segments from subtitles file %s...', args.srtin)
82+
logger.info('extracting speech segments from %s...',
83+
'stdin' if args.srtin is None else 'subtitles file {}'.format(args.srtin))
8284
for srt_pipe in srt_pipes:
83-
srt_pipe.fit(args.srtin)
85+
if callable(srt_pipe):
86+
continue
87+
else:
88+
srt_pipe.fit(args.srtin)
8489
logger.info('...done')
8590
logger.info('computing alignments...')
86-
offset_samples, best_srt_pipe = MaxScoreAligner(
87-
FFTAligner, SAMPLE_RATE, args.max_offset_seconds
88-
).fit_transform(
89-
reference_pipe.transform(args.reference),
90-
srt_pipes,
91-
)
91+
if args.skip_sync:
92+
best_score = 0.
93+
best_srt_pipe = srt_pipes[0]
94+
if callable(best_srt_pipe):
95+
best_srt_pipe = best_srt_pipe(1.0).fit(args.srtin)
96+
offset_samples = 0
97+
else:
98+
(best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
99+
FFTAligner, args.srtin, SAMPLE_RATE, args.max_offset_seconds
100+
).fit_transform(
101+
reference_pipe.transform(args.reference),
102+
srt_pipes,
103+
)
92104
logger.info('...done')
93105
offset_seconds = offset_samples / float(SAMPLE_RATE)
94106
scale_step = best_srt_pipe.named_steps['scale']
107+
logger.info('score: %.3f', best_score)
95108
logger.info('offset seconds: %.3f', offset_seconds)
96109
logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
97110
output_steps = [('shift', SubtitleShifter(offset_seconds))]
@@ -109,10 +122,16 @@ def try_sync(args, reference_pipe, srt_pipes, result):
109122
except FailedToFindAlignmentException as e:
110123
sync_was_successful = False
111124
logger.error(e)
125+
except Exception as e:
126+
exc = e
127+
sync_was_successful = False
128+
logger.error(e)
112129
else:
113130
result['offset_seconds'] = offset_seconds
114131
result['framerate_scale_factor'] = scale_step.scale_factor
115132
finally:
133+
if exc is not None:
134+
raise exc
116135
result['sync_was_successful'] = sync_was_successful
117136
return sync_was_successful
118137

@@ -158,10 +177,16 @@ def make_srt_pipes(args):
158177
if args.no_fix_framerate:
159178
framerate_ratios = [1.]
160179
else:
161-
framerate_ratios = np.concatenate([
180+
framerate_ratios = list(np.concatenate([
162181
[1.], np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
163-
])
164-
parser = make_subtitle_parser(fmt=os.path.splitext(args.srtin)[-1][1:], caching=True, **args.__dict__)
182+
]))
183+
if args.gss:
184+
framerate_ratios.append(None)
185+
if args.srtin is None:
186+
srtin_format = 'srt'
187+
else:
188+
srtin_format = os.path.splitext(args.srtin)[-1][1:]
189+
parser = make_subtitle_parser(fmt=srtin_format, caching=True, **args.__dict__)
165190
srt_pipes = [
166191
make_subtitle_speech_pipeline(
167192
**override(args, scale_factor=scale_factor, parser=parser)
@@ -226,12 +251,13 @@ def validate_args(args):
226251

227252

228253
def validate_file_permissions(args):
254+
error_string_template = 'unable to {action} {file}; try ensuring file exists and has correct permissions'
229255
if not os.access(args.reference, os.R_OK):
230-
raise ValueError('unable to read reference %s (try checking permissions)' % args.reference)
231-
if not os.access(args.srtin, os.R_OK):
232-
raise ValueError('unable to read input subtitles %s (try checking permissions)' % args.srtin)
233-
if os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
234-
raise ValueError('unable to write output subtitles %s (try checking permissions)' % args.srtout)
256+
raise ValueError(error_string_template.format(action='read reference', file=args.reference))
257+
if args.srtin is not None and not os.access(args.srtin, os.R_OK):
258+
raise ValueError(error_string_template.format(action='read input subtitles', file=args.srtin))
259+
if args.srtout is not None and os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
260+
raise ValueError(error_string_template.format(action='write output subtitles', file=args.srtout))
235261
if args.make_test_case or args.serialize_speech:
236262
npy_savename = os.path.splitext(args.reference)[0] + '.npz'
237263
if os.path.exists(npy_savename) and not os.access(npy_savename, os.W_OK):
@@ -340,7 +366,7 @@ def add_cli_only_args(parser):
340366
parser.add_argument('--start-seconds', type=int, default=DEFAULT_START_SECONDS,
341367
help='Start time for processing '
342368
'(default=%d seconds).' % DEFAULT_START_SECONDS)
343-
parser.add_argument('--max-offset-seconds', type=int, default=DEFAULT_MAX_OFFSET_SECONDS,
369+
parser.add_argument('--max-offset-seconds', type=float, default=DEFAULT_MAX_OFFSET_SECONDS,
344370
help='The max allowed offset seconds for any subtitle segment '
345371
'(default=%d seconds).' % DEFAULT_MAX_OFFSET_SECONDS)
346372
parser.add_argument('--frame-rate', type=int, default=DEFAULT_FRAME_RATE,
@@ -372,6 +398,8 @@ def add_cli_only_args(parser):
372398
'directory).')
373399
parser.add_argument('--vlc-mode', action='store_true', help=argparse.SUPPRESS)
374400
parser.add_argument('--gui-mode', action='store_true', help=argparse.SUPPRESS)
401+
parser.add_argument('--skip-sync', action='store_true', help=argparse.SUPPRESS)
402+
parser.add_argument('--gss', action='store_true', help=argparse.SUPPRESS)
375403

376404

377405
def make_parser():

ffsubsync/file_utils.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,11 @@ def __init__(self, filename, *args, **kwargs):
1313
if filename is None:
1414
stream = sys.stdout if 'w' in args else sys.stdin
1515
if six.PY3:
16-
self.closeable = open(stream.fileno(), *args, **kwargs)
17-
self.fh = self.closeable.buffer
16+
self.fh = open(stream.fileno(), *args, **kwargs)
1817
else:
19-
self.closeable = stream
20-
self.fh = self.closeable
18+
self.fh = stream
2119
elif isinstance(filename, six.string_types):
2220
self.fh = open(filename, *args, **kwargs)
23-
self.closeable = self.fh
2421
self.closing = True
2522
else:
2623
self.fh = filename
@@ -30,6 +27,6 @@ def __enter__(self):
3027

3128
def __exit__(self, exc_type, exc_val, exc_tb):
3229
if self.closing:
33-
self.closeable.close()
30+
self.fh.close()
3431

3532
return False

ffsubsync/golden_section_search.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""Python program for golden section search (straight-up copied from Wikipedia).
2+
This implementation reuses function evaluations, saving 1/2 of the evaluations per
3+
iteration, and returns a bounding interval."""
4+
import logging
import math

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


invphi = (math.sqrt(5) - 1) / 2  # 1 / phi
invphi2 = (3 - math.sqrt(5)) / 2  # 1 / phi^2


def gss(f, a, b, tol=1e-4):
    """Golden-section search.

    Given a function f with a single local minimum in
    the interval [a,b], gss returns a subset interval
    [c,d] that contains the minimum with d-c <= tol.

    ``f`` may take either one argument ``f(x)`` or two arguments
    ``f(x, is_last_iter)``; in the latter case ``is_last_iter`` is True
    only for the evaluation(s) made in the final iteration.

    Args:
        f: objective function to minimize (see above for call forms).
        a: one end of the search interval.
        b: the other end of the search interval (order does not matter).
        tol: maximum width of the returned bracketing interval.

    Returns:
        A ``(lo, hi)`` tuple bracketing the minimum. If the initial
        interval is already no wider than ``tol`` it is returned as-is
        and ``f`` is never called.

    Example:
    >>> f = lambda x: (x-2)**2
    >>> a = 1
    >>> b = 5
    >>> tol = 1e-5
    >>> (c,d) = gss(f, a, b, tol)
    >>> print(c, d)
    1.9999959837979107 2.0000050911830893
    """

    (a, b) = (min(a, b), max(a, b))
    h = b - a
    if h <= tol:
        return a, b

    # Required steps to achieve tolerance
    n = int(math.ceil(math.log(tol / h) / math.log(invphi)))
    logger.info('About to perform %d iterations of golden section search to find the best framerate', n)

    def f_wrapped(x, is_last_iter):
        # Prefer the two-argument form; fall back to f(x) for plain
        # single-argument objectives. NOTE: a TypeError raised *inside* a
        # two-argument f also triggers this fallback.
        try:
            return f(x, is_last_iter)
        except TypeError:
            return f(x)

    c = a + invphi2 * h
    d = a + invphi * h
    yc = f_wrapped(c, n == 1)
    yd = f_wrapped(d, n == 1)

    for k in range(n - 1):
        is_last_iter = (k == n - 2)
        if yc < yd:
            # Minimum lies in [a, d]: reuse c as the new d.
            b = d
            d = c
            yd = yc
            h = invphi * h
            c = a + invphi2 * h
            yc = f_wrapped(c, is_last_iter)
        else:
            # Minimum lies in [c, b]: reuse d as the new c.
            a = c
            c = d
            yc = yd
            h = invphi * h
            d = a + invphi * h
            # Must go through f_wrapped (not f directly) so single-argument
            # objectives work in this branch too.
            yd = f_wrapped(d, is_last_iter)

    if yc < yd:
        return a, d
    else:
        return c, b

ffsubsync/speech_transformers.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,20 @@ def make_subtitle_speech_pipeline(
4242
assert parser.encoding == encoding
4343
assert parser.max_subtitle_seconds == max_subtitle_seconds
4444
assert parser.start_seconds == start_seconds
45-
return Pipeline([
46-
('parse', parser),
47-
('scale', SubtitleScaler(scale_factor)),
48-
('speech_extract', SubtitleSpeechTransformer(
49-
sample_rate=SAMPLE_RATE,
50-
start_seconds=start_seconds,
51-
framerate_ratio=scale_factor,
52-
))
53-
])
45+
def subpipe_maker(framerate_ratio):
46+
return Pipeline([
47+
('parse', parser),
48+
('scale', SubtitleScaler(framerate_ratio)),
49+
('speech_extract', SubtitleSpeechTransformer(
50+
sample_rate=SAMPLE_RATE,
51+
start_seconds=start_seconds,
52+
framerate_ratio=framerate_ratio,
53+
))
54+
])
55+
if scale_factor is None:
56+
return subpipe_maker
57+
else:
58+
return subpipe_maker(scale_factor)
5459

5560

5661
def _make_auditok_detector(sample_rate, frame_rate):

ffsubsync/subtitle_parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def __init__(self, fmt='srt', encoding='infer', caching=False, max_subtitle_seco
7676
self.start_seconds = start_seconds
7777

7878
def fit(self, fname, *_):
79-
if self.caching and self.fit_fname == fname:
79+
if self.caching and self.fit_fname == ('<stdin>' if fname is None else fname):
8080
return self
8181
encodings_to_try = (self.encoding,)
8282
with open_file(fname, 'rb') as f:
@@ -102,7 +102,7 @@ def fit(self, fname, *_):
102102
sub_format=self.sub_format,
103103
encoding=encoding
104104
)
105-
self.fit_fname = fname
105+
self.fit_fname = '<stdin>' if fname is None else fname
106106
if len(encodings_to_try) > 1:
107107
self.detected_encoding_ = encoding
108108
logger.info('detected encoding: %s' % self.detected_encoding_)

tests/test_alignment.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@
1010
])
1111
def test_fft_alignment(s1, s2, true_offset):
1212
assert FFTAligner().fit_transform(s2, s1) == true_offset
13-
assert MaxScoreAligner(FFTAligner).fit_transform(s2, s1)[0] == true_offset
14-
assert MaxScoreAligner(FFTAligner()).fit_transform(s2, s1)[0] == true_offset
13+
assert MaxScoreAligner(FFTAligner).fit_transform(s2, s1)[0][1] == true_offset
14+
assert MaxScoreAligner(FFTAligner()).fit_transform(s2, s1)[0][1] == true_offset

0 commit comments

Comments
 (0)