Skip to content

Commit 8e47ea3

Browse files
authored
Fix video and audio going out of sync (#25)
This problem might have been there since the start, but I only noticed it with a longer video. Closes #24.
1 parent f774465 commit 8e47ea3

File tree

4 files changed

+176
-2
lines changed

4 files changed

+176
-2
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,5 @@ _out/
1313
tests/_cache_out/
1414
tests/_compatible_out/
1515
tests/_google_out/
16+
tests/_duration_matches_out/
1617

src/video.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,12 +175,19 @@ pub(crate) fn combine_video(dir: &str, slides: &Vec<Slide>, output: &str, audio_
175175
.arg("concat")
176176
.arg("-i")
177177
.arg(concat_list)
178-
.arg("-c")
179-
.arg("copy")
178+
.arg("-c:v")
179+
// Re-encode to ensure video can be trimmed.
180+
.arg("libx264")
180181
.arg("-c:a")
181182
.arg(audio_codec)
183+
// Experimental is required for opus.
182184
.arg("-strict")
183185
.arg("experimental")
186+
// Aresample in attempt to fix audio sync.
187+
.arg("-af")
188+
.arg("aresample=async=1")
189+
// To avoid pauses at the end of the video.
190+
.arg("-shortest")
184191
.arg(output_path)
185192
.output()
186193
.expect("Failed to run ffmpeg command");

tests/cli.rs

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,89 @@ fn google_provider() -> Result<(), Box<dyn std::error::Error>> {
159159

160160
Ok(())
161161
}
162+
163+
fn convert_to_mp3(input: &str, output: &str) {
164+
let output = std::process::Command::new("ffmpeg")
165+
.arg("-y")
166+
.arg("-i")
167+
.arg(input)
168+
.arg("-q:a")
169+
.arg("0")
170+
.arg("-map")
171+
.arg("a")
172+
.arg(output)
173+
.output()
174+
.expect("Failed to run ffmpeg command");
175+
if !output.status.success() {
176+
let stderr = String::from_utf8_lossy(&output.stderr);
177+
tracing::error!("Failed to convert to mp3: {stderr}");
178+
std::process::exit(1);
179+
} else {
180+
tracing::info!("Converted to mp3");
181+
}
182+
}
183+
184+
/// Runs `ffprobe` on `path` and returns the text of its `Duration: ` field,
/// for example `00:00:51.20`.
///
/// Returns `None` when `ffprobe` exits with a non-zero status or when its
/// output does not contain a `Duration: ` marker.
///
/// # Panics
///
/// Panics if the `ffprobe` executable cannot be started.
fn probe_duration(path: &str) -> Option<String> {
    let probe = std::process::Command::new("ffprobe")
        .arg("-i")
        .arg(path)
        .output()
        .expect("Failed to run ffprobe command");

    // The duration is read from stderr, both for diagnostics and for parsing.
    let stderr = String::from_utf8_lossy(&probe.stderr);
    if !probe.status.success() {
        println!("Failed to probe duration: {stderr}");
        return None;
    }

    // The field looks like `Duration: 00:00:51.20, start: ...`; take the text
    // between the marker and the next comma. A missing marker yields `None`
    // instead of a panic, matching the `Option` return type.
    let (_, after_marker) = stderr.split_once("Duration: ")?;
    let duration = after_marker.split(',').next()?;
    Some(duration.trim().to_string())
}
205+
206+
/// Converts an `HH:MM:SS[.fraction]` duration string into a number of seconds.
///
/// Whitespace around the string and around each field is ignored. Only the
/// first three `:`-separated fields are read.
///
/// # Panics
///
/// Panics with a descriptive message when fewer than three fields are present
/// or when one of the fields is not a valid number.
fn duration_as_seconds(duration: &str) -> f32 {
    let mut fields = duration.trim().split(':').map(|field| {
        field.trim().parse::<f32>().unwrap_or_else(|err| {
            panic!(
                "invalid duration field {:?} in {:?}: {}",
                field, duration, err
            )
        })
    });
    let mut next_field = |name: &str| {
        fields
            .next()
            .unwrap_or_else(|| panic!("duration {:?} is missing the {} field", duration, name))
    };

    let hours = next_field("hours");
    let minutes = next_field("minutes");
    let seconds = next_field("seconds");
    hours * 3600.0 + minutes * 60.0 + seconds
}
213+
214+
/// Builds `tests/test_duration_matches.typ` with the Google provider and
/// checks that the audio track of the resulting `out.mp4` has the same
/// duration as the video container itself (within 0.1 seconds).
#[test]
fn test_duration_matches() -> Result<(), Box<dyn std::error::Error>> {
    let out_dir = Path::new("tests").join("_duration_matches_out");
    let out_dir = out_dir.to_str().unwrap();
    println!("out_dir: {out_dir}");

    let key = common::load_key(&Provider::Google);
    let mut cmd = bin();
    cmd.env("GOOGLE_KEY", key);
    cmd.arg(format!("--out-dir={}", out_dir));
    cmd.arg("--verbose");
    cmd.arg("--cache=false");
    cmd.arg("build");
    cmd.arg("tests/test_duration_matches.typ");
    cmd.assert().success();

    let video_path = Path::new(out_dir).join("out.mp4");
    let video_path = video_path.to_str().unwrap();
    let audio_path = Path::new(out_dir).join("out.mp3");
    let audio_path = audio_path.to_str().unwrap();
    // Extract the audio on its own so its duration can be probed separately.
    convert_to_mp3(video_path, audio_path);

    let video_duration = probe_duration(video_path).unwrap();
    println!("video_duration: {video_duration}");
    let video_duration = duration_as_seconds(&video_duration);
    println!("video_duration: {video_duration} seconds");
    let audio_duration = probe_duration(audio_path).unwrap();
    println!("audio_duration: {audio_duration}");
    let audio_duration = duration_as_seconds(&audio_duration);
    println!("audio_duration: {audio_duration} seconds");

    // Compare the absolute difference: a one-sided check would pass whenever
    // the audio is longer than the video, no matter by how much.
    let drift = (video_duration - audio_duration).abs();
    assert!(
        drift < 0.1,
        "video and audio durations differ by {} seconds",
        drift
    );

    Ok(())
}

tests/test_duration_matches.typ

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#import "@preview/polylux:0.4.0": *
2+
3+
#set page(paper: "presentation-16-9")
4+
#set text(size: 25pt)
5+
6+
// --- trv config:
7+
// provider = "google"
8+
// voice = "en-US-Chirp-HD-D"
9+
// language_code = "en-US"
10+
// ---
11+
12+
#slide[
13+
first
14+
15+
#toolbox.pdfpc.speaker-note("
16+
OpenAI whisper is a tool that can be used to run speech recognition.
17+
18+
It's a great tool for generating SRT files.
19+
20+
In this video, I'll quickly show you how to use it.
21+
")
22+
]
23+
24+
#slide[
25+
second
26+
27+
#toolbox.pdfpc.speaker-note("
28+
To install OpenAI whisper, there are multiple options.
29+
30+
OpenAI advises to use pip install, but on MacOS it's probably easier to use Homebrew.
31+
32+
Note that this installation might take a while.
33+
34+
In case of problems during installation, see the openai whisper repository on GitHub.
35+
")
36+
]
37+
38+
#slide[
39+
third
40+
41+
#toolbox.pdfpc.speaker-note("
42+
Usage should be pretty straightforward.
43+
44+
Specify the audio file that you want to convert to SRT, and specify the model that you want to use.
45+
46+
On the first run, the model will be downloaded automatically.
47+
48+
Here I'm using the turbo model since that is usually the best option.
49+
50+
If everything goes well, this command will generate a file called audio.srt.
51+
")
52+
]
53+
54+
#slide[
55+
fourth
56+
57+
#toolbox.pdfpc.speaker-note("
58+
The turbo model requires 6 GB of video memory.
59+
60+
If you want to use less video memory, then use the tiny, base, small, or medium model.
61+
62+
Whisper offers two model variants: English-specific models and multilingual models.
63+
64+
If you need only English, then use an English-only model such as small.en.
65+
")
66+
]
67+
68+
#slide[
69+
fifth
70+
71+
#toolbox.pdfpc.speaker-note("
72+
Overall, whisper is a great tool for generating SRT files.
73+
74+
But it's not perfect.
75+
76+
It's usually a good idea to manually review the generated SRT file.
77+
Think of whisper as a starting point.
78+
It will have done 95% of the work for you, it's up to you to verify correctness.
79+
")
80+
]

0 commit comments

Comments
 (0)