Fix video and audio going out of sync (#25)

rikhuijzer · web-flow · commit 8e47ea3e3aa9 · 2025-03-07T19:16:48.000+01:00
This problem might have been there since the start, but I only noticed it with a longer video. Closes #24.
diff --git a/.gitignore b/.gitignore
@@ -13,4 +13,5 @@ _out/
 tests/_cache_out/
 tests/_compatible_out/
 tests/_google_out/
+tests/_duration_matches_out/
 
diff --git a/src/video.rs b/src/video.rs
@@ -175,12 +175,19 @@ pub(crate) fn combine_video(dir: &str, slides: &Vec<Slide>, output: &str, audio_
         .arg("concat")
         .arg("-i")
         .arg(concat_list)
-        .arg("-c")
-        .arg("copy")
+        .arg("-c:v")
+        // Re-encode to ensure video can be trimmed.
+        .arg("libx264")
         .arg("-c:a")
         .arg(audio_codec)
+        // Experimental is required for opus.
         .arg("-strict")
         .arg("experimental")
+        // Use the aresample filter in an attempt to fix audio sync.
+        .arg("-af")
+        .arg("aresample=async=1")
+        // To avoid pauses at the end of the video.
+        .arg("-shortest")
         .arg(output_path)
         .output()
         .expect("Failed to run ffmpeg command");
diff --git a/tests/cli.rs b/tests/cli.rs
@@ -159,3 +159,89 @@ fn google_provider() -> Result<(), Box<dyn std::error::Error>> {
 
     Ok(())
 }
+
+/// Extracts the audio stream of `input` into an MP3 file at `output` using ffmpeg.
+///
+/// Panics when ffmpeg cannot be started or exits unsuccessfully, so that the calling
+/// test fails with ffmpeg's stderr instead of terminating the whole test process.
+fn convert_to_mp3(input: &str, output: &str) {
+    let result = std::process::Command::new("ffmpeg")
+        // Overwrite the output file if it is left over from an earlier run.
+        .arg("-y")
+        .args(["-i", input])
+        // Request audio quality level 0.
+        .args(["-q:a", "0"])
+        // Only map the audio streams into the output.
+        .args(["-map", "a"])
+        .arg(output)
+        .output()
+        .expect("Failed to run ffmpeg command");
+    let stderr = String::from_utf8_lossy(&result.stderr);
+    assert!(result.status.success(), "Failed to convert to mp3: {stderr}");
+    tracing::info!("Converted to mp3");
+}
+
+/// Returns the `Duration:` value (e.g. `00:01:02.50`) that ffprobe reports for `path`.
+///
+/// Returns `None` when ffprobe exits unsuccessfully or its output has no duration;
+/// panics only when ffprobe itself cannot be started.
+fn probe_duration(path: &str) -> Option<String> {
+    let output = std::process::Command::new("ffprobe")
+        .args(["-i", path])
+        .output()
+        .expect("Failed to run ffprobe command");
+    // ffprobe writes its report, including the duration, to stderr.
+    let stderr = String::from_utf8_lossy(&output.stderr);
+    if !output.status.success() {
+        println!("Failed to probe duration: {stderr}");
+        return None;
+    }
+    // The report has a line of the form `Duration: HH:MM:SS.ss, ...`. Take the text
+    // between the label and the next comma; `?` yields `None` instead of panicking
+    // when ffprobe printed no such label.
+    let duration = stderr.split("Duration: ").nth(1)?.split(',').next()?;
+    Some(duration.to_string())
+}
+
+/// Converts an `HH:MM:SS.ss` duration string into seconds; panics on malformed input.
+fn duration_as_seconds(duration: &str) -> f32 {
+    let mut fields = duration.split(':').map(|field| field.parse::<f32>().unwrap());
+    let (hours, minutes) = (fields.next().unwrap(), fields.next().unwrap());
+    let seconds = fields.next().unwrap();
+    hours * 3600.0 + minutes * 60.0 + seconds
+}
+
+/// Builds a multi-slide video and checks that the video container and its extracted
+/// audio track have (nearly) the same duration, guarding against audio/video drift.
+#[test]
+fn test_duration_matches() -> Result<(), Box<dyn std::error::Error>> {
+    let out_dir = Path::new("tests").join("_duration_matches_out");
+    let out_dir = out_dir.to_str().unwrap();
+    println!("out_dir: {out_dir}");
+    let mut cmd = bin();
+    cmd.env("GOOGLE_KEY", common::load_key(&Provider::Google));
+    cmd.arg(format!("--out-dir={out_dir}"));
+    cmd.arg("--verbose");
+    cmd.arg("--cache=false");
+    cmd.arg("build");
+    cmd.arg("tests/test_duration_matches.typ");
+    cmd.assert().success();
+
+    let video_path = Path::new(out_dir).join("out.mp4");
+    let video_path = video_path.to_str().unwrap();
+    let audio_path = Path::new(out_dir).join("out.mp3");
+    let audio_path = audio_path.to_str().unwrap();
+    convert_to_mp3(video_path, audio_path);
+
+    let video_duration = probe_duration(video_path).unwrap();
+    println!("video_duration: {video_duration}");
+    let video_duration = duration_as_seconds(&video_duration);
+    println!("video_duration: {video_duration} seconds");
+    let audio_duration = probe_duration(audio_path).unwrap();
+    println!("audio_duration: {audio_duration}");
+    let audio_duration = duration_as_seconds(&audio_duration);
+    println!("audio_duration: {audio_duration} seconds");
+    // Take the absolute difference so that audio outlasting the video fails as well.
+    assert!((video_duration - audio_duration).abs() < 0.1);
+    Ok(())
+}
diff --git a/tests/test_duration_matches.typ b/tests/test_duration_matches.typ
@@ -0,0 +1,80 @@
+#import "@preview/polylux:0.4.0": *
+
+#set page(paper: "presentation-16-9")
+#set text(size: 25pt)
+
+// --- trv config:
+// provider = "google"
+// voice = "en-US-Chirp-HD-D"
+// language_code = "en-US"
+// ---
+
+#slide[
+    first
+
+    #toolbox.pdfpc.speaker-note("
+      OpenAI whisper is a tool that can be used to run speech recognition.
+
+      It's a great tool for generating SRT files.
+      
+      In this video, I'll quickly show you how to use it.
+    ")
+]
+
+#slide[
+    second
+
+    #toolbox.pdfpc.speaker-note("
+      To install OpenAI whisper, there are multiple options.
+
+      OpenAI advises to use pip install, but on macOS it's probably easier to use Homebrew.
+
+      Note that this installation might take a while.
+
+      In case of problems during installation, see the openai whisper repository on GitHub.
+    ")
+]
+
+#slide[
+    third
+
+    #toolbox.pdfpc.speaker-note("
+      Usage should be pretty straightforward.
+
+      Specify the audio file that you want to convert to SRT, and specify the model that you want to use.
+
+      On the first run, the model will be downloaded automatically.
+
+      Here I'm using the turbo model since that is usually the best option.
+
+      If everything goes well, this command will generate a file called audio.srt.
+    ")
+]
+
+#slide[
+    fourth
+
+    #toolbox.pdfpc.speaker-note("
+      The turbo model requires 6 GB of video memory.
+
+      If you want to use less video memory, then use the tiny, base, small, or medium model.
+
+      Whisper offers two model variants: English-specific models and multilingual models.
+
+      If you need only English, then use an English-only model such as small.en.
+    ")
+]
+
+#slide[
+    fifth
+
+    #toolbox.pdfpc.speaker-note("
+      Overall, whisper is a great tool for generating SRT files.
+
+      But it's not perfect.
+
+      It's usually a good idea to manually review the generated SRT file.
+      Think of whisper as a starting point.
+      It will have done 95% of the work for you, it's up to you to verify correctness.
+    ")
+]