Skip to content

Commit f4bcc9f

Browse files
committed
Add best_action_window_start to optimize start times for audio-intense short scenes
- Implement `best_action_window_start` to determine the ideal start time for maximizing audio action intensity within a given window.
- Update `process_video` to use this function for improved start-time selection when generating shorts.
- Add corresponding tests to validate the functionality across various scenarios.
1 parent 9fddad8 commit f4bcc9f

File tree

2 files changed

+120
-3
lines changed

2 files changed

+120
-3
lines changed

shorts.py

Lines changed: 77 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,68 @@ def scene_action_score(
200200
return float(segment_scores.mean())
201201

202202

203+
def best_action_window_start(
    scene: Tuple,
    window_length: float,
    times: np.ndarray,
    score: np.ndarray,
) -> float:
    """Return the start time inside ``scene`` whose window has the most audio action.

    The audio action ``score`` is summed over every ``window_length``-second
    window that starts on an audio frame and fits entirely inside the scene;
    the start time of the window with the largest sum is returned.

    Args:
        scene: Pair ``(start, end)`` whose elements expose ``get_seconds()``.
        window_length: Length of the desired window, in seconds.
        times: Timestamps (seconds) of the audio feature frames.
        score: Audio action score per frame, aligned with ``times``.

    Returns:
        The chosen start time in seconds, always within
        ``[scene start, scene end - window_length]`` when such a range exists.
        Falls back to the scene start when the scene is degenerate, when the
        window does not fit, or when there are too few audio frames inside the
        scene to evaluate a full window.
    """
    start_sec = float(scene[0].get_seconds())
    end_sec = float(scene[1].get_seconds())
    window_sec = float(window_length)

    # Degenerate inputs: non-finite bounds/window or an empty scene.
    all_finite = (
        math.isfinite(start_sec)
        and math.isfinite(end_sec)
        and math.isfinite(window_sec)
    )
    if not all_finite or end_sec <= start_sec:
        return start_sec

    # Only windows that fully fit into the scene are acceptable. If the scene
    # is no longer than the window, the only sensible start is the scene start.
    max_allowed_start = end_sec - window_sec
    if max_allowed_start <= start_sec:
        return start_sec

    # Audio feature frames that fall inside the scene.
    times = np.asarray(times, dtype=float)
    score = np.asarray(score, dtype=float)
    in_scene = (times >= start_sec) & (times <= end_sec)
    t_seg = times[in_scene]
    s_seg = score[in_scene]

    # Need at least two frames to estimate the frame step.
    if t_seg.size < 2:
        return start_sec

    # Estimate the frame step; the median tolerates the odd irregular gap.
    dt = float(np.median(np.diff(t_seg)))
    if not math.isfinite(dt) or dt <= 0:
        return start_sec

    # Window length expressed in frames (at least one frame).
    n_win = max(1, int(round(window_sec / dt)))
    if s_seg.size < n_win:
        # Not enough frames sampled inside the scene for one full window.
        return start_sec

    # Moving sum via cumulative sums: window_sums[i] == s_seg[i:i + n_win].sum()
    csum = np.concatenate(([0.0], np.cumsum(s_seg)))
    window_sums = csum[n_win:] - csum[:-n_win]

    # Only rank windows whose start actually fits in the scene. Ranking
    # late-starting windows and clamping afterwards could return a start whose
    # real window is worse than another one that fits. The small tolerance
    # absorbs float noise; the final clamp still enforces the bound exactly.
    candidate_starts = t_seg[: window_sums.size]
    fits = candidate_starts <= max_allowed_start + 1e-9
    # If nothing fits, every entry is -inf and argmax picks the earliest frame,
    # which is then clamped into the valid range below.
    best_idx = int(np.argmax(np.where(fits, window_sums, -np.inf)))

    best_start_time = float(t_seg[best_idx])
    return max(start_sec, min(best_start_time, max_allowed_start))
263+
264+
203265
def crop_clip(
204266
clip: VideoFileClip,
205267
ratio_w: int,
@@ -541,12 +603,24 @@ def process_video(video_file: Path, config: ProcessingConfig, output_dir: Path)
541603
config.min_short_length, min(config.max_short_length, duration)
542604
)
543605

544-
min_start = math.floor(scene[0].get_seconds())
545-
max_start = math.floor(scene[1].get_seconds() - short_length)
606+
# Pick the start time that maximizes the cumulative audio action
607+
# within the chosen short_length window for this scene.
608+
best_start = best_action_window_start(
609+
scene,
610+
float(short_length),
611+
audio_times,
612+
audio_score,
613+
)
614+
logging.info(
615+
"Selected start %.2f for scene %d with window %ds",
616+
best_start,
617+
i,
618+
short_length,
619+
)
546620

547621
final_clip = get_final_clip(
548622
video_clip,
549-
random.randint(min_start, max_start),
623+
best_start,
550624
short_length,
551625
config,
552626
)

tests/test_shorts.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
ProcessingConfig,
3131
render_video,
3232
scene_action_score,
33+
best_action_window_start,
3334
compute_audio_action_profile,
3435
)
3536

@@ -170,3 +171,45 @@ def frames_to_time(frames, sr=100, hop_length=512):
170171
assert len(times) == len(score) == 3
171172
# Combined score should not be constant given our stub inputs
172173
assert score.std() > 0
174+
175+
176+
177+
def test_best_action_window_start_picks_max_window():
    """The window covering the strongest burst of action should be selected."""
    # One audio frame per second, covering 0..19 s.
    frame_times = np.arange(0.0, 20.0, 1.0, dtype=float)
    action = np.zeros_like(frame_times)
    action[2:5] = 1.0  # weaker burst at 2..4 s
    action[8:11] = 2.0  # stronger burst at 8..10 s; best 3 s window starts at 8

    chosen = best_action_window_start(
        make_scene(0.0, 15.0), 3.0, frame_times, action
    )

    assert chosen == pytest.approx(8.0, rel=1e-9)
189+
190+
191+
def test_best_action_window_start_clamps_to_fit():
    """A window drawn towards the scene end must still fit inside the scene."""
    # Scores rise steadily (0, 1, 2, 3, 4, 5), pulling the best window late.
    frame_times = np.arange(0.0, 6.0, 1.0, dtype=float)
    action = np.arange(len(frame_times), dtype=float)

    # In a 5 s scene a 4 s window may only start within [0, 1], so the
    # result has to end up at 1.
    chosen = best_action_window_start(
        make_scene(0.0, 5.0), 4.0, frame_times, action
    )

    assert chosen == pytest.approx(1.0, rel=1e-9)
200+
201+
202+
def test_best_action_window_start_fallback_no_frames():
    """With no audio frames inside the scene, the scene start is returned."""
    # All frames lie at 100..109 s, far outside the 0..5 s scene.
    frame_times = np.arange(100.0, 110.0, 1.0, dtype=float)
    action = np.ones_like(frame_times)

    chosen = best_action_window_start(
        make_scene(0.0, 5.0), 3.0, frame_times, action
    )

    assert chosen == pytest.approx(0.0, rel=1e-9)
208+
209+
210+
def test_best_action_window_start_short_scene():
    """A scene shorter than the window falls back to the scene start."""
    frame_times = np.arange(0.0, 50.0, 1.0, dtype=float)
    action = np.ones_like(frame_times)

    # The scene lasts 2 s but a 5 s window is requested.
    chosen = best_action_window_start(
        make_scene(10.0, 12.0), 5.0, frame_times, action
    )

    assert chosen == pytest.approx(10.0, rel=1e-9)

0 commit comments

Comments
 (0)