Skip to content

Commit 8809feb

Browse files
daavoo and Kostis-S-Z
authored
Add random-length silences. (#79)
* Add random-length silences. Insert silences of length between 0.1 and 1.1 seconds between each speaker segment. * Add silence_pad argument, type hints and comment --------- Co-authored-by: Kostis-S-Z <kostissz@pm.me> Co-authored-by: Kostis <Kostis-S-Z@users.noreply.github.com>
1 parent faf5f9a commit 8809feb

File tree

3 files changed

+38
-4
lines changed

3 files changed

+38
-4
lines changed

demo/app.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import re
44
from pathlib import Path
55

6-
import numpy as np
76
import soundfile as sf
87
import streamlit as st
98

@@ -15,6 +14,7 @@
1514
from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
1615
from document_to_podcast.inference.text_to_speech import text_to_speech
1716
from document_to_podcast.inference.text_to_text import text_to_text_stream
17+
from document_to_podcast.utils import stack_audio_segments
1818

1919

2020
@st.cache_resource
@@ -173,7 +173,9 @@ def gen_button_clicked():
173173

174174
if st.session_state[gen_button]:
175175
if st.button("Save Podcast to audio file"):
176-
st.session_state.audio = np.concatenate(st.session_state.audio)
176+
st.session_state.audio = stack_audio_segments(
177+
st.session_state.audio, speech_model.audio_codec.sr
178+
)
177179
sf.write(
178180
"podcast.wav",
179181
st.session_state.audio,

src/document_to_podcast/cli.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import re
22
from pathlib import Path
33

4-
import numpy as np
54
import soundfile as sf
65
import yaml
76
from fire import Fire
@@ -22,6 +21,7 @@
2221
from document_to_podcast.inference.text_to_text import text_to_text_stream
2322
from document_to_podcast.inference.text_to_speech import text_to_speech
2423
from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS
24+
from document_to_podcast.utils import stack_audio_segments
2525

2626

2727
@logger.catch(reraise=True)
@@ -159,9 +159,13 @@ def document_to_podcast(
159159
logger.warning("Podcast generation stopped by user.")
160160

161161
logger.info("Saving Podcast...")
162+
complete_audio = stack_audio_segments(
163+
podcast_audio, sample_rate=sample_rate, silence_pad=1.0
164+
)
165+
162166
sf.write(
163167
str(output_folder / "podcast.wav"),
164-
np.concatenate(podcast_audio),
168+
complete_audio,
165169
samplerate=sample_rate,
166170
)
167171
(output_folder / "podcast.txt").write_text(podcast_script)

src/document_to_podcast/utils.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
from typing import List
2+
3+
import numpy as np
4+
5+
6+
def stack_audio_segments(
    audio_segments: List[np.ndarray], sample_rate: int, silence_pad: float = 1.0
) -> np.ndarray:
    """
    Stack / concatenate all the individual audio segments (speaker audios) sequentially to form the complete podcast.

    After each speaker's audio (including the last one), a block of silence is appended as a buffer between
    speakers for a more natural sounding podcast. The length of each silence is drawn uniformly between 0.0 and
    `silence_pad` seconds. The random generator is seeded with a fixed value (42), so the same inputs always
    produce the same output. You can turn off this feature by setting silence_pad = 0.0

    The silence is created with the same dtype and trailing (channel) dimensions as the segment it follows, so
    the result keeps the dtype of the input audio and multi-channel segments shaped (frames, channels) are
    supported.

    Args:
        audio_segments: A list of each speaker's audio in order.
        sample_rate: The sample rate of the waveform generated by the model.
        silence_pad: The maximum length (in seconds) of silence to pad at the end of each audio, sampling
            between 0.0 and this number.

    Returns: The complete podcast as a single, concatenated waveform. If `audio_segments` is empty, an empty
        1-D array is returned.

    Raises:
        ValueError: If `silence_pad` is negative.
    """
    if silence_pad < 0:
        raise ValueError(f"silence_pad must be non-negative, got {silence_pad}")

    stacked = []
    rng = np.random.default_rng(42)
    for segment in audio_segments:
        segment = np.asarray(segment)
        # Exactly one draw per segment, in order, keeps the silence lengths reproducible.
        silence_frames = int(rng.uniform(low=0.0, high=silence_pad) * sample_rate)
        stacked.append(segment)
        # Match dtype and channel layout so concatenation neither upcasts the audio nor fails on stereo input.
        stacked.append(
            np.zeros((silence_frames, *segment.shape[1:]), dtype=segment.dtype)
        )

    if not stacked:
        # np.concatenate raises on an empty list; return an empty waveform instead.
        return np.zeros(0)
    return np.concatenate(stacked)

0 commit comments

Comments
 (0)