Add best_action_window_start to optimize start times for audio-intense short scenes

artryazanov · artryazanov · commit f4bcc9f17f68 · 2025-11-22T02:13:44.000+07:00
- Implement `best_action_window_start` to determine the ideal start time for maximizing audio action intensity within a given window.
- Update `process_video` to utilize this function for improved start time selection in short generation.
- Add corresponding tests to validate the functionality with various scenarios.
diff --git a/shorts.py b/shorts.py
@@ -200,6 +200,68 @@ def scene_action_score(
     return float(segment_scores.mean())
 
 
+def best_action_window_start(
+    scene: Tuple,
+    window_length: float,
+    times: np.ndarray,
+    score: np.ndarray,
+) -> float:
+    """Return the start time inside ``scene`` whose window holds the most action.
+
+    The audio action ``score`` is summed over every run of consecutive
+    frames spanning roughly ``window_length`` seconds, and the start of the
+    highest-scoring run that still fits entirely inside the scene is
+    returned.
+
+    Args:
+        scene: Pair ``(start, end)`` whose items expose ``get_seconds()``.
+        window_length: Length of the window in seconds.
+        times: Timestamps (seconds) of the audio feature frames.
+        score: Action score per frame, aligned with ``times``.
+
+    Returns:
+        A start time in ``[scene start, scene end - window_length]``. The
+        scene start is returned whenever a reliable choice is impossible:
+        degenerate or non-finite bounds, a window that does not fit, fewer
+        than two frames inside the scene, a non-positive frame step, or
+        fewer frames than one window needs.
+    """
+
+    start_sec = float(scene[0].get_seconds())
+    end_sec = float(scene[1].get_seconds())
+    window = float(window_length)
+
+    # Degenerate or non-finite inputs: nothing sensible to optimize.
+    # (A NaN window would otherwise reach round() below and raise.)
+    bounds_ok = (
+        math.isfinite(start_sec)
+        and math.isfinite(end_sec)
+        and math.isfinite(window)
+    )
+    if not bounds_ok or end_sec <= start_sec:
+        return start_sec
+
+    # Only windows that fully fit into the scene are considered.
+    max_allowed_start = end_sec - window
+    if max_allowed_start <= start_sec:
+        # Scene is no longer than the window: the only option is its start.
+        return start_sec
+
+    times = np.asarray(times, dtype=float)
+    score = np.asarray(score, dtype=float)
+
+    # Audio feature frames located within the scene.
+    mask = (times >= start_sec) & (times <= end_sec)
+    t_seg = times[mask]
+    if t_seg.size < 2:
+        # Zero or one frame: the frame step cannot be estimated.
+        return start_sec
+
+    # Non-finite scores would poison the cumulative sum and argmax, so they
+    # contribute nothing instead.
+    s_seg = np.nan_to_num(score[mask], nan=0.0, posinf=0.0, neginf=0.0)
+
+    # Estimate the frame step; the median tolerates an occasional gap.
+    dt = float(np.median(np.diff(t_seg)))
+    if not math.isfinite(dt) or dt <= 0:
+        return start_sec
+
+    # Window length expressed in frames (at least one).
+    n_win = max(1, int(round(window / dt)))
+    if s_seg.size < n_win:
+        # Not enough frames sampled inside the scene for a single window.
+        return start_sec
+
+    # Moving sum via cumulative sums: window_sums[i] == s_seg[i:i + n_win].sum()
+    csum = np.concatenate(([0.0], np.cumsum(s_seg)))
+    window_sums = csum[n_win:] - csum[:-n_win]
+
+    # Rank only the windows that start early enough to fit in the scene.
+    # Picking the global maximum and clamping afterwards could slide the
+    # result onto a quiet window even when a louder one fits.
+    candidate_starts = t_seg[: window_sums.size]
+    feasible = np.flatnonzero(candidate_starts <= max_allowed_start + 1e-9)
+    if feasible.size == 0:
+        # Frames only begin after the last valid start; use that start.
+        return max_allowed_start
+
+    best_idx = int(feasible[np.argmax(window_sums[feasible])])
+
+    # Guard against float rounding pushing the result out of range.
+    best_start_time = float(t_seg[best_idx])
+    return max(start_sec, min(best_start_time, max_allowed_start))
+
+
 def crop_clip(
     clip: VideoFileClip,
     ratio_w: int,
@@ -541,12 +603,24 @@ def process_video(video_file: Path, config: ProcessingConfig, output_dir: Path)
                 config.min_short_length, min(config.max_short_length, duration)
             )
 
-            min_start = math.floor(scene[0].get_seconds())
-            max_start = math.floor(scene[1].get_seconds() - short_length)
+            # Pick the start time that maximizes the cumulative audio action
+            # within the chosen short_length window for this scene.
+            best_start = best_action_window_start(
+                scene,
+                float(short_length),
+                audio_times,
+                audio_score,
+            )
+            logging.info(
+                "Selected start %.2f for scene %d with window %ds",
+                best_start,
+                i,
+                short_length,
+            )
 
             final_clip = get_final_clip(
                 video_clip,
-                random.randint(min_start, max_start),
+                best_start,
                 short_length,
                 config,
             )
diff --git a/tests/test_shorts.py b/tests/test_shorts.py
@@ -30,6 +30,7 @@
     ProcessingConfig,
     render_video,
     scene_action_score,
+    best_action_window_start,
     compute_audio_action_profile,
 )
 
@@ -170,3 +171,45 @@ def frames_to_time(frames, sr=100, hop_length=512):
     assert len(times) == len(score) == 3
     # Combined score should not be constant given our stub inputs
     assert score.std() > 0
+
+
+
+def test_best_action_window_start_picks_max_window():
+    """The window covering the strongest burst of action is selected."""
+    # One frame per second over 0..19 s
+    frame_times = np.arange(0.0, 20.0, 1.0, dtype=float)
+    action = np.zeros_like(frame_times)
+    action[2:5] = 1.0   # weaker burst at 2..4 s
+    action[8:11] = 2.0  # stronger burst at 8..10 s
+
+    # The best 3 s window inside the 0..15 s scene starts at 8 s
+    chosen = best_action_window_start(
+        make_scene(0.0, 15.0), 3.0, frame_times, action
+    )
+    assert chosen == pytest.approx(8.0, rel=1e-9)
+
+
+def test_best_action_window_start_clamps_to_fit():
+    """The returned start never lets the window run past the scene end."""
+    frame_times = np.arange(0.0, 6.0, 1.0, dtype=float)
+    # Monotonically increasing scores (0,1,2,3,4,5) favour the latest window
+    action = np.arange(frame_times.size, dtype=float)
+
+    # A 4 s window fits a 0..5 s scene only when it starts in [0, 1],
+    # so the latest valid start, 1, must be returned.
+    chosen = best_action_window_start(
+        make_scene(0.0, 5.0), 4.0, frame_times, action
+    )
+    assert chosen == pytest.approx(1.0, rel=1e-9)
+
+
+def test_best_action_window_start_fallback_no_frames():
+    """With no audio frames inside the scene, the scene start is returned."""
+    # All frames (100..109 s) lie outside the 0..5 s scene
+    frame_times = np.arange(100.0, 110.0, 1.0, dtype=float)
+    action = np.ones_like(frame_times)
+
+    chosen = best_action_window_start(
+        make_scene(0.0, 5.0), 3.0, frame_times, action
+    )
+    assert chosen == pytest.approx(0.0, rel=1e-9)
+
+
+def test_best_action_window_start_short_scene():
+    """A scene shorter than the window yields the scene start."""
+    frame_times = np.arange(0.0, 50.0, 1.0, dtype=float)
+    action = np.ones_like(frame_times)
+
+    # The scene lasts 2 s (10..12 s) while the window asks for 5 s
+    chosen = best_action_window_start(
+        make_scene(10.0, 12.0), 5.0, frame_times, action
+    )
+    assert chosen == pytest.approx(10.0, rel=1e-9)