Add random-length silences. (#79)

daavoo · Kostis-S-Z · web-flow · commit 8809feb3c1fd · 2025-01-09T19:29:14.000+01:00
* Add random-length silences.

Insert silences of length between 0.1 and 1.1 seconds between each speaker segment.

* Add silence_pad argument, type hints and comment

---------

Co-authored-by: Kostis-S-Z &lt;kostissz@pm.me&gt;
Co-authored-by: Kostis &lt;Kostis-S-Z@users.noreply.github.com&gt;
diff --git a/demo/app.py b/demo/app.py
@@ -3,7 +3,6 @@
 import re
 from pathlib import Path
 
-import numpy as np
 import soundfile as sf
 import streamlit as st
 
@@ -15,6 +14,7 @@
 from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
 from document_to_podcast.inference.text_to_speech import text_to_speech
 from document_to_podcast.inference.text_to_text import text_to_text_stream
+from document_to_podcast.utils import stack_audio_segments
 
 
 @st.cache_resource
@@ -173,7 +173,9 @@ def gen_button_clicked():
 
     if st.session_state[gen_button]:
         if st.button("Save Podcast to audio file"):
-            st.session_state.audio = np.concatenate(st.session_state.audio)
+            st.session_state.audio = stack_audio_segments(
+                st.session_state.audio, speech_model.audio_codec.sr
+            )
             sf.write(
                 "podcast.wav",
                 st.session_state.audio,
diff --git a/src/document_to_podcast/cli.py b/src/document_to_podcast/cli.py
@@ -1,7 +1,6 @@
 import re
 from pathlib import Path
 
-import numpy as np
 import soundfile as sf
 import yaml
 from fire import Fire
@@ -22,6 +21,7 @@
 from document_to_podcast.inference.text_to_text import text_to_text_stream
 from document_to_podcast.inference.text_to_speech import text_to_speech
 from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS
+from document_to_podcast.utils import stack_audio_segments
 
 
 @logger.catch(reraise=True)
@@ -159,9 +159,13 @@ def document_to_podcast(
         logger.warning("Podcast generation stopped by user.")
 
     logger.info("Saving Podcast...")
+    complete_audio = stack_audio_segments(
+        podcast_audio, sample_rate=sample_rate, silence_pad=1.0
+    )
+
     sf.write(
         str(output_folder / "podcast.wav"),
-        np.concatenate(podcast_audio),
+        complete_audio,
         samplerate=sample_rate,
     )
     (output_folder / "podcast.txt").write_text(podcast_script)
diff --git a/src/document_to_podcast/utils.py b/src/document_to_podcast/utils.py
@@ -0,0 +1,28 @@
+from typing import List
+
+import numpy as np
+
+
+def stack_audio_segments(
+    audio_segments: List[np.ndarray], sample_rate: int, silence_pad: float = 1.0
+) -> np.ndarray:
+    """
+    Stack / concatenate all the individual audio segments (speaker audios) sequentially to form the complete podcast.
+    Additionally, after every segment (including the last one), append a run of zero-valued samples as a pause between
+    speakers for a more natural sounding podcast. You can turn off this feature by setting silence_pad = 0.0
+    Args:
+        audio_segments: A list of each speaker's audio in order.
+        sample_rate: The sample rate of the waveform generated by the model.
+        silence_pad: The maximum silence length in seconds; each pause is drawn uniformly between 0.0 and this number.
+
+    Returns: The complete podcast as a single, concatenated waveform.
+
+    """
+    stacked = []
+    rng = np.random.default_rng(42)  # fixed seed, so the sequence of silence lengths is reproducible across calls
+    for segment in audio_segments:
+        stacked.append(segment)
+        stacked.append(
+            np.zeros(int(rng.uniform(low=0.0, high=silence_pad) * sample_rate))  # seconds * rate -> sample count
+        )
+    return np.concatenate(stacked)