experimental golden section search; change max offset seconds to 60; misc bugfixes for 0.4.9 release

smacke · smacke · commit 6975d36ccc67 · 2020-10-11T12:21:18.000-07:00
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -165,3 +165,10 @@ History
 0.4.8 (2020-09-22)
 ------------------
 * Use webrtcvad-wheels on Windows to eliminate dependency on compiler;
+
+0.4.9 (2020-10-11)
+------------------
+* Lower the default max offset to 60 seconds and enforce it during alignment, rather than discarding alignments whose offset exceeds max_offset_seconds;
+* Add experimental section for using golden section search to find framerate ratio;
+* Restore ability to read stdin and write stdout after buggy permissions check;
+* Exceptions that occur during syncing were mistakenly suppressed; this is now fixed;
diff --git a/ffsubsync/aligners.py b/ffsubsync/aligners.py
@@ -3,6 +3,9 @@
 import math
 
 import numpy as np
+
+from .constants import FRAMERATE_RATIOS
+from .golden_section_search import gss
 from .sklearn_shim import TransformerMixin
 
 logging.basicConfig(level=logging.INFO)
@@ -14,11 +17,25 @@ class FailedToFindAlignmentException(Exception):
 
 
 class FFTAligner(TransformerMixin):
-    def __init__(self):
+    def __init__(self, max_offset_samples=None):
+        self.max_offset_samples = max_offset_samples
         self.best_offset_ = None
         self.best_score_ = None
         self.get_score_ = False
 
+    def _zero_out_extreme_offsets(self, convolve, substring):
+        """Return a copy of convolve, zeroed outside the max_offset_samples window around offset 0 (plain copy if the limit is None)."""
+        convolve = np.copy(convolve)
+        if self.max_offset_samples is not None:
+            center = len(convolve) - 1 - len(substring)  # index of offset 0; bounds are clamped so a negative one is not read as a from-the-end index
+            convolve[:max(0, center - self.max_offset_samples)] = convolve[max(0, center + self.max_offset_samples):] = 0
+        return convolve
+
+    def _compute_argmax(self, convolve, substring):
+        peak = np.argmax(convolve)  # index of the highest correlation value
+        self.best_score_ = convolve[peak]
+        self.best_offset_ = (len(convolve) - 1 - len(substring)) - peak  # convert the peak index into an offset
+
     def fit(self, refstring, substring, get_score=False):
         refstring, substring = [
             list(map(int, s))
@@ -33,9 +50,9 @@ def fit(self, refstring, substring, get_score=False):
         subft = np.fft.fft(np.append(np.zeros(extra_zeros + len(refstring)), substring))
         refft = np.fft.fft(np.flip(np.append(refstring, np.zeros(len(substring) + extra_zeros)), 0))
         convolve = np.real(np.fft.ifft(subft * refft))
-        best_idx = np.argmax(convolve)
-        self.best_offset_ = len(convolve) - 1 - best_idx - len(substring)
-        self.best_score_ = convolve[best_idx]
+        self._compute_argmax(self._zero_out_extreme_offsets(convolve, substring), substring)
+        if self.best_score_ == 0.:
+            self._compute_argmax(convolve, substring)
         self.get_score_ = get_score
         return self
 
@@ -47,24 +64,40 @@ def transform(self, *_):
 
 
 class MaxScoreAligner(TransformerMixin):
-    def __init__(self, base_aligner, sample_rate=None, max_offset_seconds=None):
+    def __init__(self, base_aligner, srtin=None, sample_rate=None, max_offset_seconds=None):
+        self.srtin = srtin
+        if sample_rate is None or max_offset_seconds is None:
+            self.max_offset_samples = None
+        else:
+            self.max_offset_samples = abs(int(max_offset_seconds * sample_rate))
         if isinstance(base_aligner, type):
-            self.base_aligner = base_aligner()
+            self.base_aligner = base_aligner(max_offset_samples=self.max_offset_samples)
         else:
             self.base_aligner = base_aligner
         self.max_offset_seconds = max_offset_seconds
-        if sample_rate is None or max_offset_seconds is None:
-            self.max_offset_samples = None
-        else:
-            self.max_offset_samples = abs(max_offset_seconds * sample_rate)
         self._scores = []
 
+    def fit_gss(self, refstring, subpipe_maker):
+        def opt_func(framerate_ratio, is_last_iter):  # objective handed to gss: negated alignment score for this ratio
+            subpipe = subpipe_maker(framerate_ratio)
+            substring = subpipe.fit_transform(self.srtin)
+            score = self.base_aligner.fit_transform(refstring, substring, get_score=True)
+            logger.info('got score %.0f (offset %d) for ratio %.3f', score[0], score[1], framerate_ratio)
+            if is_last_iter:  # only evaluations gss flags as belonging to its last iteration become candidates
+                self._scores.append((score, subpipe))
+            return -score[0]  # gss searches for a minimum, so negate the score to maximize it
+        gss(opt_func, 0.9, 1.1)  # search framerate ratios in [0.9, 1.1]; the returned interval is not used
+        return self
+
     def fit(self, refstring, subpipes):
         if not isinstance(subpipes, list):
             subpipes = [subpipes]
         for subpipe in subpipes:
-            if hasattr(subpipe, 'transform'):
-                substring = subpipe.transform(None)
+            if callable(subpipe):
+                self.fit_gss(refstring, subpipe)
+                continue
+            elif hasattr(subpipe, 'transform'):
+                substring = subpipe.transform(self.srtin)
             else:
                 substring = subpipe
             self._scores.append((
@@ -84,4 +117,4 @@ def transform(self, *_):
                                                  '--max-offset-seconds with a number larger than '
                                                  '{}'.format(self.max_offset_seconds))
         (score, offset), subpipe = max(scores, key=lambda x: x[0][0])
-        return offset, subpipe
+        return (score, offset), subpipe
diff --git a/ffsubsync/constants.py b/ffsubsync/constants.py
@@ -11,7 +11,7 @@
 DEFAULT_START_SECONDS = 0
 DEFAULT_SCALE_FACTOR = 1
 DEFAULT_VAD = 'subs_then_webrtc'
-DEFAULT_MAX_OFFSET_SECONDS = 600
+DEFAULT_MAX_OFFSET_SECONDS = 60
 
 SUBTITLE_EXTENSIONS = ('srt', 'ass', 'ssa', 'sub')
 
diff --git a/ffsubsync/ffsubsync.py b/ffsubsync/ffsubsync.py
@@ -77,21 +77,34 @@ def make_test_case(args, npy_savename, sync_was_successful):
 
 def try_sync(args, reference_pipe, srt_pipes, result):
     sync_was_successful = True
+    exc = None
     try:
-        logger.info('extracting speech segments from subtitles file %s...', args.srtin)
+        logger.info('extracting speech segments from %s...',
+                    'stdin' if args.srtin is None else 'subtitles file {}'.format(args.srtin))
         for srt_pipe in srt_pipes:
-            srt_pipe.fit(args.srtin)
+            if callable(srt_pipe):
+                continue
+            else:
+                srt_pipe.fit(args.srtin)
         logger.info('...done')
         logger.info('computing alignments...')
-        offset_samples, best_srt_pipe = MaxScoreAligner(
-            FFTAligner, SAMPLE_RATE, args.max_offset_seconds
-        ).fit_transform(
-            reference_pipe.transform(args.reference),
-            srt_pipes,
-        )
+        if args.skip_sync:
+            best_score = 0.
+            best_srt_pipe = srt_pipes[0]
+            if callable(best_srt_pipe):
+                best_srt_pipe = best_srt_pipe(1.0).fit(args.srtin)
+            offset_samples = 0
+        else:
+            (best_score, offset_samples), best_srt_pipe = MaxScoreAligner(
+                FFTAligner, args.srtin, SAMPLE_RATE, args.max_offset_seconds
+            ).fit_transform(
+                reference_pipe.transform(args.reference),
+                srt_pipes,
+            )
         logger.info('...done')
         offset_seconds = offset_samples / float(SAMPLE_RATE)
         scale_step = best_srt_pipe.named_steps['scale']
+        logger.info('score: %.3f', best_score)
         logger.info('offset seconds: %.3f', offset_seconds)
         logger.info('framerate scale factor: %.3f', scale_step.scale_factor)
         output_steps = [('shift', SubtitleShifter(offset_seconds))]
@@ -109,10 +122,16 @@ def try_sync(args, reference_pipe, srt_pipes, result):
     except FailedToFindAlignmentException as e:
         sync_was_successful = False
         logger.error(e)
+    except Exception as e:
+        exc = e
+        sync_was_successful = False
+        logger.error(e)
     else:
         result['offset_seconds'] = offset_seconds
         result['framerate_scale_factor'] = scale_step.scale_factor
     finally:
+        if exc is not None:
+            raise exc
         result['sync_was_successful'] = sync_was_successful
         return sync_was_successful
 
@@ -158,10 +177,16 @@ def make_srt_pipes(args):
     if args.no_fix_framerate:
         framerate_ratios = [1.]
     else:
-        framerate_ratios = np.concatenate([
+        framerate_ratios = list(np.concatenate([
             [1.], np.array(FRAMERATE_RATIOS), 1./np.array(FRAMERATE_RATIOS)
-        ])
-    parser = make_subtitle_parser(fmt=os.path.splitext(args.srtin)[-1][1:], caching=True, **args.__dict__)
+        ]))
+        if args.gss:
+            framerate_ratios.append(None)
+    if args.srtin is None:
+        srtin_format = 'srt'
+    else:
+        srtin_format = os.path.splitext(args.srtin)[-1][1:]
+    parser = make_subtitle_parser(fmt=srtin_format, caching=True, **args.__dict__)
     srt_pipes = [
         make_subtitle_speech_pipeline(
             **override(args, scale_factor=scale_factor, parser=parser)
@@ -226,12 +251,13 @@ def validate_args(args):
 
 
 def validate_file_permissions(args):
+    error_string_template = 'unable to {action} {file}; try ensuring file exists and has correct permissions'
     if not os.access(args.reference, os.R_OK):
-        raise ValueError('unable to read reference %s (try checking permissions)' % args.reference)
-    if not os.access(args.srtin, os.R_OK):
-        raise ValueError('unable to read input subtitles %s (try checking permissions)' % args.srtin)
-    if os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
-        raise ValueError('unable to write output subtitles %s (try checking permissions)' % args.srtout)
+        raise ValueError(error_string_template.format(action='read reference', file=args.reference))
+    if args.srtin is not None and not os.access(args.srtin, os.R_OK):
+        raise ValueError(error_string_template.format(action='read input subtitles', file=args.srtin))
+    if args.srtout is not None and os.path.exists(args.srtout) and not os.access(args.srtout, os.W_OK):
+        raise ValueError(error_string_template.format(action='write output subtitles', file=args.srtout))
     if args.make_test_case or args.serialize_speech:
         npy_savename = os.path.splitext(args.reference)[0] + '.npz'
         if os.path.exists(npy_savename) and not os.access(npy_savename, os.W_OK):
@@ -340,7 +366,7 @@ def add_cli_only_args(parser):
     parser.add_argument('--start-seconds', type=int, default=DEFAULT_START_SECONDS,
                         help='Start time for processing '
                              '(default=%d seconds).' % DEFAULT_START_SECONDS)
-    parser.add_argument('--max-offset-seconds', type=int, default=DEFAULT_MAX_OFFSET_SECONDS,
+    parser.add_argument('--max-offset-seconds', type=float, default=DEFAULT_MAX_OFFSET_SECONDS,
                         help='The max allowed offset seconds for any subtitle segment '
                              '(default=%d seconds).' % DEFAULT_MAX_OFFSET_SECONDS)
     parser.add_argument('--frame-rate', type=int, default=DEFAULT_FRAME_RATE,
@@ -372,6 +398,8 @@ def add_cli_only_args(parser):
                         'directory).')
     parser.add_argument('--vlc-mode', action='store_true', help=argparse.SUPPRESS)
     parser.add_argument('--gui-mode', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--skip-sync', action='store_true', help=argparse.SUPPRESS)
+    parser.add_argument('--gss', action='store_true', help=argparse.SUPPRESS)
 
 
 def make_parser():
diff --git a/ffsubsync/file_utils.py b/ffsubsync/file_utils.py
@@ -13,14 +13,11 @@ def __init__(self, filename, *args, **kwargs):
         if filename is None:
             stream = sys.stdout if 'w' in args else sys.stdin
             if six.PY3:
-                self.closeable = open(stream.fileno(), *args, **kwargs)
-                self.fh = self.closeable.buffer
+                self.fh = open(stream.fileno(), *args, **kwargs)
             else:
-                self.closeable = stream
-                self.fh = self.closeable
+                self.fh = stream
         elif isinstance(filename, six.string_types):
             self.fh = open(filename, *args, **kwargs)
-            self.closeable = self.fh
             self.closing = True
         else:
             self.fh = filename
@@ -30,6 +27,6 @@ def __enter__(self):
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         if self.closing:
-            self.closeable.close()
+            self.fh.close()
 
         return False
diff --git a/ffsubsync/golden_section_search.py b/ffsubsync/golden_section_search.py
@@ -0,0 +1,70 @@
+"""Python program for golden section search (straight-up copied from Wikipedia).
+   This implementation reuses function evaluations, saving 1/2 of the evaluations per
+   iteration, and returns a bounding interval."""
+import logging
+import math
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+invphi = (math.sqrt(5) - 1) / 2  # 1 / phi
+invphi2 = (3 - math.sqrt(5)) / 2  # 1 / phi^2
+
+def gss(f, a, b, tol=1e-4):
+    """Golden-section search for the minimum of a unimodal function.
+
+    Given a function f with a single local minimum in the interval [a,b],
+    gss returns a subset interval [c,d] that contains the minimum with
+    d-c <= tol.  f is called as f(x, is_last_iter); if that raises
+    TypeError it is retried as f(x), so one-argument callables also work.
+    is_last_iter is True only for evaluations made in the final iteration.
+
+    Example:
+    >>> f = lambda x: (x-2)**2
+    >>> (c,d) = gss(f, 1, 5, 1e-5)
+    >>> print(c, d)
+    1.9999959837979107 2.0000050911830893
+    """
+
+    (a, b) = (min(a, b), max(a, b))
+    h = b - a
+    if h <= tol:
+        return a, b
+
+    # Required steps to achieve tolerance
+    n = int(math.ceil(math.log(tol / h) / math.log(invphi)))
+    logger.info('About to perform %d iterations of golden section search to find the best framerate', n)
+
+    # Every evaluation must go through this wrapper so plain f(x) objectives keep working.
+    def f_wrapped(x, is_last_iter):
+        try:
+            return f(x, is_last_iter)
+        except TypeError:
+            return f(x)
+
+    c = a + invphi2 * h
+    d = a + invphi * h
+    yc = f_wrapped(c, n==1)
+    yd = f_wrapped(d, n==1)
+
+    for k in range(n-1):
+        if yc < yd:
+            b = d
+            d = c
+            yd = yc
+            h = invphi * h
+            c = a + invphi2 * h
+            yc = f_wrapped(c, k==n-2)
+        else:
+            a = c
+            c = d
+            yc = yd
+            h = invphi * h
+            d = a + invphi * h
+            yd = f_wrapped(d, k==n-2)
+
+    if yc < yd:
+        return a, d
+    else:
+        return c, b
diff --git a/ffsubsync/speech_transformers.py b/ffsubsync/speech_transformers.py
@@ -42,15 +42,20 @@ def make_subtitle_speech_pipeline(
     assert parser.encoding == encoding
     assert parser.max_subtitle_seconds == max_subtitle_seconds
     assert parser.start_seconds == start_seconds
-    return Pipeline([
-        ('parse', parser),
-        ('scale', SubtitleScaler(scale_factor)),
-        ('speech_extract', SubtitleSpeechTransformer(
-            sample_rate=SAMPLE_RATE,
-            start_seconds=start_seconds,
-            framerate_ratio=scale_factor,
-        ))
-    ])
+    def subpipe_maker(framerate_ratio):
+        return Pipeline([
+            ('parse', parser),
+            ('scale', SubtitleScaler(framerate_ratio)),
+            ('speech_extract', SubtitleSpeechTransformer(
+                sample_rate=SAMPLE_RATE,
+                start_seconds=start_seconds,
+                framerate_ratio=framerate_ratio,
+            ))
+        ])
+    if scale_factor is None:
+        return subpipe_maker
+    else:
+        return subpipe_maker(scale_factor)
 
 
 def _make_auditok_detector(sample_rate, frame_rate):
diff --git a/ffsubsync/subtitle_parser.py b/ffsubsync/subtitle_parser.py
@@ -76,7 +76,7 @@ def __init__(self, fmt='srt', encoding='infer', caching=False, max_subtitle_seco
         self.start_seconds = start_seconds
 
     def fit(self, fname, *_):
-        if self.caching and self.fit_fname == fname:
+        if self.caching and self.fit_fname == ('<stdin>' if fname is None else fname):
             return self
         encodings_to_try = (self.encoding,)
         with open_file(fname, 'rb') as f:
@@ -102,7 +102,7 @@ def fit(self, fname, *_):
                     sub_format=self.sub_format,
                     encoding=encoding
                 )
-                self.fit_fname = fname
+                self.fit_fname = '<stdin>' if fname is None else fname
                 if len(encodings_to_try) > 1:
                     self.detected_encoding_ = encoding
                     logger.info('detected encoding: %s' % self.detected_encoding_)
diff --git a/tests/test_alignment.py b/tests/test_alignment.py
@@ -10,5 +10,5 @@
 ])
 def test_fft_alignment(s1, s2, true_offset):
     assert FFTAligner().fit_transform(s2, s1) == true_offset
-    assert MaxScoreAligner(FFTAligner).fit_transform(s2, s1)[0] == true_offset
-    assert MaxScoreAligner(FFTAligner()).fit_transform(s2, s1)[0] == true_offset
+    assert MaxScoreAligner(FFTAligner).fit_transform(s2, s1)[0][1] == true_offset
+    assert MaxScoreAligner(FFTAligner()).fit_transform(s2, s1)[0][1] == true_offset