5858import numpy as np
5959from tqdm import tqdm
6060import soundfile as sf
61- from typing import Union
61+ from typing import Union , Optional , Callable
6262
6363from sdialog .audio .tts import BaseTTS
64- from sdialog .audio .dialog import AudioDialog
6564from sdialog .audio .room import Room , RoomPosition
66- from sdialog .audio .utils import AudioUtils , SourceVolume , Role , logger
6765from sdialog .audio .acoustics_simulator import AcousticsSimulator
6866from sdialog .audio .voice_database import BaseVoiceDatabase , Voice
67+ from sdialog .audio .dialog import AudioDialog , RoomAcousticsConfig
68+ from sdialog .audio .utils import SourceVolume , Role , logger
6969
7070device = "cuda" if torch .cuda .is_available () else "cpu"
7171
@@ -79,7 +79,8 @@ def generate_utterances_audios(
7979 keep_duplicate : bool = False ,
8080 seed : int = None ,
8181 sampling_rate : int = 24_000 ,
82- tts_pipeline_kwargs : dict = {}
82+ tts_pipeline_kwargs : dict = {},
83+ remove_silences : bool = True
8384) -> AudioDialog :
8485 """
8586 Generates audio for each utterance in an AudioDialog object using the specified TTS engine.
@@ -113,6 +114,8 @@ def generate_utterances_audios(
113114 :type seed: int
114115 :param sampling_rate: Sampling rate for the audio generation.
115116 :type sampling_rate: int
117+ :param remove_silences: If True, remove the silences at the beginning and the end of the audio.
118+ :type remove_silences: bool
116119 :return: The AudioDialog object with generated audio for each turn.
117120 :rtype: AudioDialog
118121 """
@@ -137,7 +140,7 @@ def generate_utterances_audios(
137140
138141 # Generate the utterance audio
139142 utterance_audio , utterance_sampling_rate = generate_utterance (
140- text = AudioUtils . remove_audio_tags ( turn .text ) ,
143+ text = turn .text ,
141144 voice = turn .voice ,
142145 tts_pipeline = tts_pipeline ,
143146 tts_pipeline_kwargs = tts_pipeline_kwargs
@@ -156,9 +159,16 @@ def generate_utterances_audios(
156159 target_sr = sampling_rate ,
157160 )
158161
162+ # Remove the silences at the beginning and the end of the audio
163+ if remove_silences :
164+ utterance_audio , _ = librosa .effects .trim (utterance_audio , top_db = 60 )
165+
159166 # Set the utterance audio to the turn
160167 turn .set_audio (utterance_audio , sampling_rate )
161168
169+ # Set the audio duration of the turn
170+ turn .audio_duration = utterance_audio .shape [0 ] / sampling_rate
171+
162172 return dialog
163173
164174
@@ -188,7 +198,12 @@ def generate_utterance(
188198 :return: A tuple containing the audio data as a numpy array and the sampling rate.
189199 :rtype: tuple[np.ndarray, int]
190200 """
191- return tts_pipeline .generate (text , speaker_voice = voice , tts_pipeline_kwargs = tts_pipeline_kwargs )
201+ audio , sr = tts_pipeline .generate (text , speaker_voice = voice , tts_pipeline_kwargs = tts_pipeline_kwargs )
202+
203+ if isinstance (audio , torch .Tensor ):
204+ audio = audio .cpu ().numpy ()
205+
206+ return audio , sr
192207
193208
194209def generate_audio_room_accoustic (
@@ -201,7 +216,9 @@ def generate_audio_room_accoustic(
201216 audio_file_format : str = "wav" ,
202217 background_effect : str = "white_noise" ,
203218 foreground_effect : str = "ac_noise_minimal" ,
204- foreground_effect_position : RoomPosition = RoomPosition .TOP_RIGHT
219+ foreground_effect_position : RoomPosition = RoomPosition .TOP_RIGHT ,
220+ callback_mix_fn : Optional [Callable ] = None ,
221+ callback_mix_kwargs : dict = {}
205222) -> AudioDialog :
206223 """
207224 Generates room acoustics simulation for the dialogue audio.
@@ -237,16 +254,29 @@ def generate_audio_room_accoustic(
237254 :type foreground_effect: str
238255 :param foreground_effect_position: Position for foreground effects.
239256 :type foreground_effect_position: RoomPosition
257+ :param callback_mix_fn: Callback function to apply to the mixed audio.
258+ :type callback_mix_fn: Optional[Callable]
259+ :param callback_mix_kwargs: Keyword arguments for the callback function.
260+ :type callback_mix_kwargs: dict
240261 :return: The AudioDialog with room acoustics simulation results and file paths.
241262 :rtype: AudioDialog
242263 """
243264
244265 # Create the room acoustics simulator
245266 room_acoustics = AcousticsSimulator (room = room , kwargs_pyroom = kwargs_pyroom )
246267
268+ # Prepare callback kwargs
269+ _callback_mix_kwargs = callback_mix_kwargs .copy () if callback_mix_kwargs is not None else {}
270+
271+ # Add dialog to kwargs if not present
272+ if "dialog" not in _callback_mix_kwargs :
273+ _callback_mix_kwargs ["dialog" ] = dialog
274+
247275 _audio_accoustic = room_acoustics .simulate (
248276 sources = dialog .get_audio_sources (),
249- source_volumes = source_volumes
277+ source_volumes = source_volumes ,
278+ callback_mix_fn = callback_mix_fn ,
279+ callback_mix_kwargs = _callback_mix_kwargs ,
250280 )
251281
252282 # Save the audio file
@@ -270,28 +300,28 @@ def generate_audio_room_accoustic(
270300 # If the audio paths post processing are already in the dialog, use them, otherwise create a new dictionary
271301 if (
272302 room_name in dialog .audio_step_3_filepaths
273- and "audio_paths_post_processing" in dialog .audio_step_3_filepaths [room_name ]
274- and dialog .audio_step_3_filepaths [room_name ][ " audio_paths_post_processing" ] != {}
303+ and dialog .audio_step_3_filepaths [room_name ]. audio_paths_post_processing is not None
304+ and dialog .audio_step_3_filepaths [room_name ]. audio_paths_post_processing != {}
275305 ):
276- audio_paths_post_processing = dialog .audio_step_3_filepaths [room_name ][ " audio_paths_post_processing" ]
306+ audio_paths_post_processing = dialog .audio_step_3_filepaths [room_name ]. audio_paths_post_processing
277307 logger .info (
278308 f"Existing audio paths for the post processing stage "
279309 f"already exist for room name: '{ room_name } ' and are kept unchanged"
280310 )
281311 else :
282312 audio_paths_post_processing = {}
283313
284- dialog .audio_step_3_filepaths [room_name ] = {
285- " audio_path" : current_room_audio_path ,
286- " microphone_position" : room .mic_position ,
287- " room_name" : room_name ,
288- " room" : room ,
289- " source_volumes" : source_volumes ,
290- " kwargs_pyroom" : kwargs_pyroom ,
291- " background_effect" : background_effect ,
292- " foreground_effect" : foreground_effect ,
293- " foreground_effect_position" : foreground_effect_position ,
294- " audio_paths_post_processing" : audio_paths_post_processing
295- }
314+ dialog .audio_step_3_filepaths [room_name ] = RoomAcousticsConfig (
315+ audio_path = current_room_audio_path ,
316+ microphone_position = room .mic_position ,
317+ room_name = room_name ,
318+ room = room ,
319+ source_volumes = source_volumes ,
320+ kwargs_pyroom = kwargs_pyroom ,
321+ background_effect = background_effect ,
322+ foreground_effect = foreground_effect ,
323+ foreground_effect_position = foreground_effect_position ,
324+ audio_paths_post_processing = audio_paths_post_processing ,
325+ )
296326
297327 return dialog
0 commit comments