|
| 1 | +use crate::audio_format; |
1 | 2 | use crate::path::audio_path; |
2 | 3 | use crate::path::image_path; |
3 | 4 | use crate::slide::Slide; |
| 5 | +use crate::Config; |
| 6 | +use crate::Provider; |
4 | 7 | use chrono::NaiveTime; |
5 | 8 | use chrono::SubsecRound; |
6 | 9 | use chrono::Timelike; |
@@ -143,29 +146,54 @@ fn stream_index(slide: &Slide, stream: Stream) -> usize { |
143 | 146 | } |
144 | 147 | } |
145 | 148 |
|
| 149 | +/// Pause duration for transitions. |
| 150 | +/// |
| 151 | +/// Sentences normally have a pause between them. Without this pause, |
| 152 | +/// sentences around slide transitions will be too close to each other. |
| 153 | +/// According to Goldman-Eisler (1968), articulatory pauses are typically |
| 154 | +/// below 250 ms while hesitation pauses are typically above that. |
| 155 | +fn transition_pause(config: &Config, provider: &Provider) -> chrono::Duration { |
| 156 | + // Google does not automatically add a pause between audio clips. |
| 157 | + if provider == &Provider::Google { |
| 158 | + return chrono::Duration::milliseconds(200); |
| 159 | + } |
| 160 | + if let Some(model) = &config.model { |
| 161 | + // Nor does the Zyphra Zonos model. |
| 162 | + if model.to_lowercase().contains("zonos") { |
| 163 | + return chrono::Duration::milliseconds(200); |
| 164 | + } |
| 165 | + } |
| 166 | + chrono::Duration::milliseconds(0) |
| 167 | +} |
| 168 | + |
146 | 169 | pub(crate) fn combine_video( |
147 | 170 | dir: &str, |
148 | 171 | slides: &Vec<Slide>, |
| 172 | + config: &Config, |
| 173 | + provider: &Provider, |
149 | 174 | output: &str, |
150 | 175 | audio_codec: &str, |
151 | | - audio_ext: &str, |
152 | 176 | ) { |
| 177 | + let audio_ext = audio_format(config); |
153 | 178 | tracing::info!("Combining images and audio into one video..."); |
154 | 179 | let output = Path::new(dir).join(output); |
155 | 180 | let output_path = output.to_str().unwrap(); |
156 | 181 |
|
157 | 182 | let mut cmd = std::process::Command::new("ffmpeg"); |
158 | 183 | cmd.arg("-y"); |
159 | | - for slide in slides { |
160 | | - let audio_path = audio_path(dir, slide, audio_ext); |
| 184 | + let n = slides.len(); |
| 185 | + for (i, slide) in slides.iter().enumerate() { |
| 186 | + let audio_path = audio_path(dir, slide, &audio_ext); |
161 | 187 | cmd.arg("-i").arg(&audio_path); |
162 | 188 | let image_path = image_path(dir, slide); |
163 | | - // Sentences normally have a pause between them. Without this pause, |
164 | | - // sentences around slide transitions will be too close to each other. |
165 | | - // According to Goldman-Eisler (1968), articulatory pauses are typically |
166 | | - // below 250 ms while hesitation pauses are typically above that. |
167 | | - let transition_pause = chrono::Duration::milliseconds(200); |
168 | | - let duration = probe_duration(&audio_path).unwrap() + transition_pause; |
| 189 | + let pause = if i < n - 1 { |
| 190 | + transition_pause(config, provider) |
| 191 | + } else { |
| 192 | + // Sometimes the audio is trimmed at the end. Add a small pause |
| 193 | + // to avoid this. |
| 194 | + chrono::Duration::milliseconds(500) |
| 195 | + }; |
| 196 | + let duration = probe_duration(&audio_path).unwrap() + pause; |
169 | 197 | cmd.arg("-loop") |
170 | 198 | .arg("1") |
171 | 199 | .arg("-framerate") |
|
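For context, here is a minimal sketch of how the updated `combine_video` signature might be called from elsewhere in the crate. The module path, output filename, codec, and `render` helper below are illustrative assumptions, not part of this commit:

```rust
use crate::slide::Slide;
use crate::video::combine_video; // hypothetical module path
use crate::{Config, Provider};

fn render(dir: &str, slides: &Vec<Slide>, config: &Config, provider: &Provider) {
    // The audio extension is now derived from `config` inside `combine_video`
    // via `audio_format(config)`, so callers no longer pass it explicitly.
    combine_video(dir, slides, config, provider, "out.mp4", "aac");
}
```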