|
| 1 | +use crate::audio_format; |
1 | 2 | use crate::path::audio_path; |
2 | 3 | use crate::path::image_path; |
3 | 4 | use crate::slide::Slide; |
| 5 | +use crate::Config; |
| 6 | +use crate::Provider; |
4 | 7 | use chrono::NaiveTime; |
5 | 8 | use chrono::SubsecRound; |
6 | 9 | use chrono::Timelike; |
@@ -143,29 +146,54 @@ fn stream_index(slide: &Slide, stream: Stream) -> usize { |
143 | 146 | } |
144 | 147 | } |
145 | 148 |
|
| 149 | +/// Pause duration for transitions. |
| 150 | +/// |
| 151 | +/// Sentences normally have a pause between them. Without this pause, |
| 152 | +/// sentences around slide transitions will be too close to each other. |
| 153 | +/// According to Goldman-Eisler (1968), articulatory pauses are typically |
| 154 | +/// below 250 ms while hesitation pauses are typically above that. |
| 155 | +fn transition_pause(config: &Config, provider: &Provider) -> chrono::Duration { |
| 156 | + // Google does not automatically add a pause between audio clips. |
| 157 | + if provider == &Provider::Google { |
| 158 | + return chrono::Duration::milliseconds(200); |
| 159 | + } |
| 160 | + if let Some(model) = &config.model { |
| 161 | + // Nor does the Zyphra Zonos model. |
| 162 | + if model.to_lowercase().contains("zonos") { |
| 163 | + return chrono::Duration::milliseconds(200); |
| 164 | + } |
| 165 | + } |
| 166 | + chrono::Duration::milliseconds(0) |
| 167 | +} |
| 168 | + |
146 | 169 | pub(crate) fn combine_video( |
147 | 170 | dir: &str, |
148 | 171 | slides: &Vec<Slide>, |
| 172 | + config: &Config, |
| 173 | + provider: &Provider, |
149 | 174 | output: &str, |
150 | 175 | audio_codec: &str, |
151 | | - audio_ext: &str, |
152 | 176 | ) { |
| 177 | + let audio_ext = audio_format(config); |
153 | 178 | tracing::info!("Combining images and audio into one video..."); |
154 | 179 | let output = Path::new(dir).join(output); |
155 | 180 | let output_path = output.to_str().unwrap(); |
156 | 181 |
|
157 | 182 | let mut cmd = std::process::Command::new("ffmpeg"); |
158 | 183 | cmd.arg("-y"); |
159 | | - for slide in slides { |
160 | | - let audio_path = audio_path(dir, slide, audio_ext); |
| 184 | + let n = slides.len(); |
| 185 | + for (i, slide) in slides.iter().enumerate() { |
| 186 | + let audio_path = audio_path(dir, slide, &audio_ext); |
161 | 187 | cmd.arg("-i").arg(&audio_path); |
162 | 188 | let image_path = image_path(dir, slide); |
163 | | - // Sentences normally have a pause between them. Without this pause, |
164 | | - // sentences around slide transitions will be too close to each other. |
165 | | - // According to Goldman-Eisler (1968), articulatory pauses are typically |
166 | | - // below 250 ms while hesitation pauses are typically above that. |
167 | | - let transition_pause = chrono::Duration::milliseconds(200); |
168 | | - let duration = probe_duration(&audio_path).unwrap() + transition_pause; |
| 189 | + let pause = if i < n - 1 { |
| 190 | + transition_pause(config, provider) |
| 191 | + } else { |
| 192 | + // Sometimes the audio is trimmed at the end. Add a small pause |
| 193 | + // to avoid this. |
| 194 | + chrono::Duration::milliseconds(500) |
| 195 | + }; |
| 196 | + let duration = probe_duration(&audio_path).unwrap() + pause; |
169 | 197 | cmd.arg("-loop") |
170 | 198 | .arg("1") |
171 | 199 | .arg("-framerate") |
|
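For context, here is a minimal sketch of how the updated `combine_video` signature might be called from elsewhere in the crate. The module path, output filename, codec, and `render` helper below are illustrative assumptions, not part of this commit:

```rust
use crate::slide::Slide;
use crate::video::combine_video; // hypothetical module path
use crate::{Config, Provider};

fn render(dir: &str, slides: &Vec<Slide>, config: &Config, provider: &Provider) {
    // The audio extension is now derived from `config` inside `combine_video`
    // via `audio_format(config)`, so callers no longer pass it explicitly.
    combine_video(dir, slides, config, provider, "out.mp4", "aac");
}
```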