File tree Expand file tree Collapse file tree 3 files changed +38
-4
lines changed
Expand file tree Collapse file tree 3 files changed +38
-4
lines changed Original file line number Diff line number Diff line change 33import re
44from pathlib import Path
55
6- import numpy as np
76import soundfile as sf
87import streamlit as st
98
1514from document_to_podcast .config import DEFAULT_PROMPT , DEFAULT_SPEAKERS , Speaker
1615from document_to_podcast .inference .text_to_speech import text_to_speech
1716from document_to_podcast .inference .text_to_text import text_to_text_stream
17+ from document_to_podcast .utils import stack_audio_segments
1818
1919
2020@st .cache_resource
@@ -173,7 +173,9 @@ def gen_button_clicked():
173173
174174 if st .session_state [gen_button ]:
175175 if st .button ("Save Podcast to audio file" ):
176- st .session_state .audio = np .concatenate (st .session_state .audio )
176+ st .session_state .audio = stack_audio_segments (
177+ st .session_state .audio , speech_model .audio_codec .sr
178+ )
177179 sf .write (
178180 "podcast.wav" ,
179181 st .session_state .audio ,
Original file line number Diff line number Diff line change 11import re
22from pathlib import Path
33
4- import numpy as np
54import soundfile as sf
65import yaml
76from fire import Fire
2221from document_to_podcast .inference .text_to_text import text_to_text_stream
2322from document_to_podcast .inference .text_to_speech import text_to_speech
2423from document_to_podcast .preprocessing import DATA_CLEANERS , DATA_LOADERS
24+ from document_to_podcast .utils import stack_audio_segments
2525
2626
2727@logger .catch (reraise = True )
@@ -159,9 +159,13 @@ def document_to_podcast(
159159 logger .warning ("Podcast generation stopped by user." )
160160
161161 logger .info ("Saving Podcast..." )
162+ complete_audio = stack_audio_segments (
163+ podcast_audio , sample_rate = sample_rate , silence_pad = 1.0
164+ )
165+
162166 sf .write (
163167 str (output_folder / "podcast.wav" ),
164- np . concatenate ( podcast_audio ) ,
168+ complete_audio ,
165169 samplerate = sample_rate ,
166170 )
167171 (output_folder / "podcast.txt" ).write_text (podcast_script )
Original file line number Diff line number Diff line change 1+ from typing import List
2+
3+ import numpy as np
4+
5+
def stack_audio_segments(
    audio_segments: List[np.ndarray], sample_rate: int, silence_pad: float = 1.0
) -> np.ndarray:
    """
    Concatenate the individual speaker audio segments into one waveform.

    After every segment (including the last one) a stretch of silence is
    appended so that speaker turns do not run into each other. The length of
    each pause is drawn uniformly between 0.0 and ``silence_pad`` seconds
    from a generator with a fixed seed, so the same input always yields the
    same output. Set ``silence_pad=0.0`` to disable the pauses.

    Args:
        audio_segments: Each speaker's audio, in playback order.
        sample_rate: Sample rate of the waveforms, in samples per second.
        silence_pad: Upper bound, in seconds, of the silence appended after
            each segment. Must be non-negative.

    Returns:
        The complete podcast as a single concatenated waveform. An empty
        array is returned when ``audio_segments`` is empty.

    Raises:
        ValueError: If ``silence_pad`` is negative.
    """
    if silence_pad < 0:
        raise ValueError(f"silence_pad must be non-negative, got {silence_pad}")

    # np.concatenate refuses an empty sequence; returning an empty waveform
    # lets callers save "nothing" instead of crashing (e.g. when generation
    # was interrupted before the first segment was produced).
    if len(audio_segments) == 0:
        return np.zeros(0)

    # Fixed seed keeps the pause lengths reproducible between runs.
    rng = np.random.default_rng(42)
    stacked = []
    for segment in audio_segments:
        segment = np.asarray(segment)
        pause_samples = int(rng.uniform(low=0.0, high=silence_pad) * sample_rate)
        stacked.append(segment)
        # Match the segment dtype so the silence does not upcast the whole
        # podcast (np.zeros defaults to float64).
        stacked.append(np.zeros(pause_samples, dtype=segment.dtype))
    return np.concatenate(stacked)
You can’t perform that action at this time.
0 commit comments