Merged
109 commits
0efab64
Add the base code for the annotator module
Nov 6, 2025
4338308
- Add model definition for the LLM structured output
Nov 7, 2025
ad19191
Make it work with LLM
Nov 7, 2025
9af6370
Add summary generation
Nov 7, 2025
cb4739b
- Move the Annotator's subclasses into two different files nlp and au…
Nov 7, 2025
2dcc327
- Add ASR task
Nov 7, 2025
75a7755
Merge branch 'idiap:main' into annotator
qanastek Nov 8, 2025
a92c551
Merge branch 'main' into annotator
Nov 18, 2025
0dbb406
- Make subfolders for the annotators in order to improve collaborativ…
Nov 19, 2025
2061756
Generate the annotations file
Nov 19, 2025
e9a581e
Merge branch 'idiap:main' into annotator
qanastek Nov 19, 2025
1dd2b7d
Merge branch 'idiap:main' into annotator
qanastek Nov 19, 2025
d418254
Merge branch 'idiap:main' into annotator
qanastek Nov 20, 2025
13428d5
Merge branch 'idiap:main' into annotator
qanastek Nov 22, 2025
7378d8c
Rename annotator into task
Nov 22, 2025
5da6f33
Merge branch 'annotator' of https://github.com/qanastek/sdialog into …
Nov 22, 2025
5fa9af7
Add NER and SLU tasks
Nov 23, 2025
90ed67f
Add the diarization task
Nov 23, 2025
9e7ac64
Update the documentation with the new changes on tasks
Nov 23, 2025
e868d1f
Rename NER and SLU in full name: NamedEntityRecognition and SpokenLan…
Nov 23, 2025
59878d2
Rename SummaryTask into SummarizationTask
Nov 23, 2025
0a9f496
Update diarization task and add support for speaker identification
Nov 23, 2025
84ab655
Create the audio.evaluation submodule and add to it a function for co…
Nov 24, 2025
e61e6b1
Add documentation section for the evaluation of the audio module.
Nov 24, 2025
313263f
- Add Result class
Nov 24, 2025
37e66ca
Add or improve the different audio evaluators
Nov 25, 2025
9dcc11d
- Fix bug when loading step 3 data from a file, the room is now corre…
Nov 27, 2025
dee5cb8
Merge branch 'idiap:main' into tasks+evaluation
qanastek Nov 27, 2025
8be55eb
Add the file saving feature in the audio evaluation module
Nov 28, 2025
b37b2ae
Merge branch 'idiap:main' into tasks+evaluation
qanastek Nov 28, 2025
6395be3
Add audio analytics
Nov 29, 2025
3bc1faa
- Update IndexTTS class to support also version 2.0
Nov 29, 2025
73c320e
- Add texts normalizer in the ASR task (whisper + lowercase)
Nov 30, 2025
4cf7ed7
- Make the TTS models take text normalizers
Nov 30, 2025
991cc0f
Merge branch 'idiap:main' into tasks+evaluation
qanastek Nov 30, 2025
edca7eb
- Add voices in the to_audio
Dec 5, 2025
803a698
Merge branch 'idiap:main' into tasks+evaluation
qanastek Dec 5, 2025
a4d7108
Fix dscaper problem with indextts 1 bit missing due to rounding
Dec 5, 2025
71f3b4a
Bug correction dscaper
Dec 6, 2025
40fe631
Revert bug fix
Dec 6, 2025
205bfd9
Update IndexTTS example
Dec 22, 2025
274e598
- Add compute_overlapping_and_pausing_llm
Jan 27, 2026
37a6a08
- Add overlap / pauses in to_audio
Jan 27, 2026
2f19b34
Merge branch 'idiap:main' into tasks+evaluation
qanastek Jan 27, 2026
6c1cea8
Update tutorial 10
Jan 27, 2026
7b6ef58
Merge pull request #1 from qanastek/tasks+evaluation
qanastek Jan 27, 2026
51ceaa4
Prepare sound effect feature addition
Feb 1, 2026
4e860e2
Make the audio event feature addition working.
Feb 9, 2026
da1eee5
lower the SNR
Feb 9, 2026
e584e31
Merge branch 'idiap:main' into main
qanastek Feb 9, 2026
4f35fcd
- Add event dropout
Feb 10, 2026
b813045
- Change SNR -10dB for audio events
Feb 11, 2026
eb1e77d
- Second variant of RTTM generator
Feb 11, 2026
4deabc5
Add threshold
Feb 11, 2026
48bca40
Remove unused imports
Feb 12, 2026
de8bf97
Merge remote-tracking branch 'upstream/main'
Feb 12, 2026
fe06aa6
Merge remote-tracking branch 'upstream/main'
Feb 12, 2026
6c1ceb8
Merge branch 'idiap:main' into main
qanastek Feb 12, 2026
1220ae7
Update tutorials
Feb 12, 2026
056e72e
Fix the bug introduced by merging to main branch
Feb 12, 2026
b2c4a62
Update qwen 3 models and pipeline to fix errors
Feb 13, 2026
7af70ef
- Merge sergio's fix for voice databases using Qwen 3 TTS
Feb 13, 2026
e2ac51f
Add annotation skipping
Feb 13, 2026
e6acc33
Add a tag normalizer
Feb 13, 2026
6f9efac
Improve recipe
Feb 13, 2026
58d10eb
Fix bug with foreground and background datasets
Feb 14, 2026
0fec97d
Fix case where llm cannot annotate
Feb 14, 2026
26c8ff9
Add normalization of the text Qwen 3 TTS
Feb 14, 2026
0a7fabf
Fix ndarray
Feb 14, 2026
c93b398
Add deterministic support for Qwen3TTS
SevKod Feb 15, 2026
5ac796f
Add trimming after audio generation, and add random gaps
SevKod Feb 15, 2026
0eeddae
Fix numpy conversion of audios
SevKod Feb 15, 2026
aed0ce5
Merge pull request #2 from SevKod/alternative
qanastek Feb 15, 2026
88d7aef
Update
Feb 15, 2026
316fe2f
Merge branch 'main' of https://github.com/qanastek/sdialog
Feb 16, 2026
19c6488
Update
Feb 16, 2026
0860161
Fix sound event error
Feb 16, 2026
64666d5
- Fix looping ac noise
Feb 16, 2026
a3a741e
Add normalization after dry and wet
SevKod Feb 16, 2026
56cbb37
Merge pull request #3 from SevKod/patch-3
qanastek Feb 16, 2026
cc54001
Move dScaper data sending before to be done only once
Feb 16, 2026
bd0e254
Remove implicit normalizer
Feb 16, 2026
eb698e9
Fix normalization for consistency with Qwen3-TTS Voice cloning
SevKod Feb 16, 2026
211cce7
Merge pull request #5 from SevKod/patch-7
qanastek Feb 17, 2026
636dbff
Add seed to the speaker placement
Feb 17, 2026
9ecab99
First try of the post processing
Feb 17, 2026
53da14e
Add ASCII to text normalization for most of the cases
SevKod Feb 18, 2026
9aaaf5d
Callback for final pyroom mix
Feb 18, 2026
2811fd6
Merge pull request #6 from SevKod/patch-8
qanastek Feb 18, 2026
df3cf4a
Add control of the snr callback
Feb 22, 2026
e91b7ec
Update the sound effect tutorial
Mar 12, 2026
4b22bff
Update title of tutorial 11
qanastek Mar 12, 2026
69b10eb
Merge remote-tracking branch 'upstream/main'
qanastek Mar 12, 2026
cd3e295
Remove 02_tasks tutorial from repository
qanastek Mar 12, 2026
ad78e78
Remove audio evaluation from GitHub tracking
qanastek Mar 12, 2026
3bd4bbc
Remove tasks from GitHub tracking
qanastek Mar 12, 2026
bb598f4
Remove audio evaluation and to tasks
qanastek Mar 12, 2026
670c7a2
Remove tasks from GitHub tracking
qanastek Mar 12, 2026
58d9f58
Remove audio evaluation from GitHub tracking
qanastek Mar 12, 2026
aeba194
Update code for passing Flake8
qanastek Mar 12, 2026
fa3dd55
Update test for audio pipeline
qanastek Mar 12, 2026
381c479
Fix Flake8
qanastek Mar 12, 2026
e01f733
Merge branch 'idiap:main' into main
qanastek Mar 12, 2026
367b3d3
Revert changes on the gitignore
qanastek Mar 12, 2026
b66c926
Remove tasks and audio evaluation from the documentation.
qanastek Mar 12, 2026
1d1a4e9
Remove internal function to get / set annotations in Dialog
qanastek Mar 12, 2026
fbb14fb
Remove file
qanastek Mar 12, 2026
8f19e4b
Remove more
qanastek Mar 12, 2026
cc18caf
Merge remote-tracking branch 'upstream/main'
qanastek Mar 13, 2026
1 change: 0 additions & 1 deletion docs/api/sdialog.rst
@@ -112,7 +112,6 @@ sdialog.evaluation.base
:members:
:show-inheritance:


----

sdialog.datasets
2 changes: 1 addition & 1 deletion docs/sdialog/index.rst
@@ -656,7 +656,7 @@ Audio Generation
The audio module of SDialog extends the core functionality by adding comprehensive audio generation and processing capabilities for dialogues. It enables transforming text dialogues into immersive audio experiences with realistic voices and simulated acoustic environments.

Setup and Installation
---------------------
----------------------

To work with audio features in SDialog, you'll need to install additional dependencies and system packages:

2 changes: 1 addition & 1 deletion requirements-audio-test.txt
@@ -5,4 +5,4 @@ jams
pyloudnorm
pyroomacoustics
huggingface_hub[cli]
dscaper>=1.7.0
dscaper>=1.7.7
6 changes: 3 additions & 3 deletions requirements-audio.txt
@@ -4,7 +4,7 @@ sox
jams
pyloudnorm
pyroomacoustics
datasets<=3.6.0
datasets<=2.21.0
huggingface_hub[cli]
dscaper>=1.7.0
qwen-tts
dscaper>=1.7.7
whisper-normalization
19 changes: 16 additions & 3 deletions src/sdialog/__init__.py
@@ -474,7 +474,7 @@ def to_audio(
This is a convenience wrapper around the full `sdialog.audio.pipeline.to_audio` function.
All keyword arguments are passed to it.

:param path: Directory path for storing audio outputs.
:param path: Path to the audio file or directory for storing audio outputs.
:type path: str
:param dialog_dir_name: Custom name for the dialogue directory.
:type dialog_dir_name: str
@@ -510,8 +510,6 @@ def to_audio(
:type audio_file_format: str
:param seed: Seed for random number generator.
:type seed: int
:param re_sampling_rate: Re-sampling rate for the output audio.
:type re_sampling_rate: Optional[int]
:param recording_devices: The identifiers of the recording devices to simulate.
:type recording_devices: Optional[List[Union[RecordingDevice, str]]]
:param impulse_response_database: The database for impulse responses.
@@ -520,6 +518,21 @@
:type override_tts_audio: Optional[bool]
:param verbose: Verbose mode for logging.
:type verbose: Optional[bool]
:param overlap_pauses: Generate the audio with overlapping and pausing between turns using LLM.
:type overlap_pauses: Optional[bool]
:param add_sound_effects: Add sound effects (such as door opening, footsteps, etc.) to the audio.
:type add_sound_effects: Optional[bool]
:param sound_effects_dropout: Dropout rate for sound effects.
:type sound_effects_dropout: Optional[float]
:param skip_annotation: Whether to skip the annotation of the sound effects
(if your dialogs are already annotated with sound effects tags, you can skip this step).
:type skip_annotation: Optional[bool]
:param remove_silences: Remove the silences at the beginning and the end of the audio.
:type remove_silences: Optional[bool]
:param callback_mix_fn: Callback function to apply to the mixed audio.
:type callback_mix_fn: Optional[Callable]
:param callback_mix_kwargs: Keyword arguments for the callback function.
:type callback_mix_kwargs: dict
:return: Audio dialogue with processed audio data.
:rtype: "sdialog.audio.dialog.AudioDialog"
:raises Exception: If the audio module is not installed.
76 changes: 53 additions & 23 deletions src/sdialog/audio/__init__.py
@@ -58,14 +58,14 @@
import numpy as np
from tqdm import tqdm
import soundfile as sf
from typing import Union
from typing import Union, Optional, Callable

from sdialog.audio.tts import BaseTTS
from sdialog.audio.dialog import AudioDialog
from sdialog.audio.room import Room, RoomPosition
from sdialog.audio.utils import AudioUtils, SourceVolume, Role, logger
from sdialog.audio.acoustics_simulator import AcousticsSimulator
from sdialog.audio.voice_database import BaseVoiceDatabase, Voice
from sdialog.audio.dialog import AudioDialog, RoomAcousticsConfig
from sdialog.audio.utils import SourceVolume, Role, logger

device = "cuda" if torch.cuda.is_available() else "cpu"

@@ -79,7 +79,8 @@ def generate_utterances_audios(
keep_duplicate: bool = False,
seed: int = None,
sampling_rate: int = 24_000,
tts_pipeline_kwargs: dict = {}
tts_pipeline_kwargs: dict = {},
remove_silences: bool = True
) -> AudioDialog:
"""
Generates audio for each utterance in an AudioDialog object using the specified TTS engine.
@@ -113,6 +114,8 @@
:type seed: int
:param sampling_rate: Sampling rate for the audio generation.
:type sampling_rate: int
:param remove_silences: If True, remove the silences at the beginning and the end of the audio.
:type remove_silences: bool
:return: The AudioDialog object with generated audio for each turn.
:rtype: AudioDialog
"""
@@ -137,7 +140,7 @@

# Generate the utterance audio
utterance_audio, utterance_sampling_rate = generate_utterance(
text=AudioUtils.remove_audio_tags(turn.text),
text=turn.text,
voice=turn.voice,
tts_pipeline=tts_pipeline,
tts_pipeline_kwargs=tts_pipeline_kwargs
@@ -156,9 +159,16 @@
target_sr=sampling_rate,
)

# Remove the silences at the beginning and the end of the audio
if remove_silences:
utterance_audio, _ = librosa.effects.trim(utterance_audio, top_db=60)

# Set the utterance audio to the turn
turn.set_audio(utterance_audio, sampling_rate)

# Set the audio duration of the turn
turn.audio_duration = utterance_audio.shape[0] / sampling_rate

return dialog
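The new `remove_silences` step in this hunk delegates to `librosa.effects.trim(utterance_audio, top_db=60)`, then recomputes the turn duration from the trimmed sample count. As a rough, self-contained sketch of what that trim does (librosa works frame-wise on dB-scaled RMS energy; this hypothetical `trim_silence` is a simpler sample-wise approximation):

```python
import numpy as np

def trim_silence(audio: np.ndarray, top_db: float = 60.0) -> np.ndarray:
    """Drop leading/trailing samples quieter than `top_db` dB below the peak.

    Hypothetical sample-wise stand-in for librosa.effects.trim(audio, top_db=60).
    """
    peak = np.max(np.abs(audio))
    if peak == 0:
        return audio
    threshold = peak * 10.0 ** (-top_db / 20.0)
    loud = np.nonzero(np.abs(audio) > threshold)[0]
    return audio[loud[0]:loud[-1] + 1]

# After trimming, the turn duration is samples / sampling_rate,
# as in the `turn.audio_duration` assignment above.
sr = 24_000
signal = np.concatenate([np.zeros(sr), np.ones(sr // 2), np.zeros(sr)])
trimmed = trim_silence(signal)
duration = trimmed.shape[0] / sr
```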


@@ -188,7 +198,12 @@ def generate_utterance(
:return: A tuple containing the audio data as a numpy array and the sampling rate.
:rtype: tuple[np.ndarray, int]
"""
return tts_pipeline.generate(text, speaker_voice=voice, tts_pipeline_kwargs=tts_pipeline_kwargs)
audio, sr = tts_pipeline.generate(text, speaker_voice=voice, tts_pipeline_kwargs=tts_pipeline_kwargs)

if isinstance(audio, torch.Tensor):
audio = audio.cpu().numpy()

return audio, sr
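The tensor guard added to `generate_utterance` above can be sketched duck-typed, so the sketch runs without torch installed (with torch, a `torch.Tensor` exposes `.cpu()` and takes the first branch):

```python
import numpy as np

def to_numpy_audio(audio):
    """Normalize TTS output to a NumPy array (sketch of the guard above)."""
    if hasattr(audio, "cpu"):       # torch.Tensor path: move off the GPU, then convert
        audio = audio.cpu().numpy()
    return np.asarray(audio)

# Stand-in for a torch.Tensor, to exercise the tensor branch without torch.
class FakeTensor:
    def __init__(self, data):
        self._data = np.asarray(data)
    def cpu(self):
        return self
    def numpy(self):
        return self._data

converted = to_numpy_audio(FakeTensor([0.1, 0.2]))
```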


def generate_audio_room_accoustic(
@@ -201,7 +216,9 @@ def generate_audio_room_accoustic(
audio_file_format: str = "wav",
background_effect: str = "white_noise",
foreground_effect: str = "ac_noise_minimal",
foreground_effect_position: RoomPosition = RoomPosition.TOP_RIGHT
foreground_effect_position: RoomPosition = RoomPosition.TOP_RIGHT,
callback_mix_fn: Optional[Callable] = None,
callback_mix_kwargs: dict = {}
) -> AudioDialog:
"""
Generates room acoustics simulation for the dialogue audio.
@@ -237,16 +254,29 @@
:type foreground_effect: str
:param foreground_effect_position: Position for foreground effects.
:type foreground_effect_position: RoomPosition
:param callback_mix_fn: Callback function to apply to the mixed audio.
:type callback_mix_fn: Optional[Callable]
:param callback_mix_kwargs: Keyword arguments for the callback function.
:type callback_mix_kwargs: dict
:return: The AudioDialog with room acoustics simulation results and file paths.
:rtype: AudioDialog
"""

# Create the room acoustics simulator
room_acoustics = AcousticsSimulator(room=room, kwargs_pyroom=kwargs_pyroom)

# Prepare callback kwargs
_callback_mix_kwargs = callback_mix_kwargs.copy() if callback_mix_kwargs is not None else {}

# Add dialog to kwargs if not present
if "dialog" not in _callback_mix_kwargs:
_callback_mix_kwargs["dialog"] = dialog

_audio_accoustic = room_acoustics.simulate(
sources=dialog.get_audio_sources(),
source_volumes=source_volumes
source_volumes=source_volumes,
callback_mix_fn=callback_mix_fn,
callback_mix_kwargs=_callback_mix_kwargs,
)

# Save the audio file
@@ -270,28 +300,28 @@
# If the audio paths post processing are already in the dialog, use them, otherwise create a new dictionary
if (
room_name in dialog.audio_step_3_filepaths
and "audio_paths_post_processing" in dialog.audio_step_3_filepaths[room_name]
and dialog.audio_step_3_filepaths[room_name]["audio_paths_post_processing"] != {}
and dialog.audio_step_3_filepaths[room_name].audio_paths_post_processing is not None
and dialog.audio_step_3_filepaths[room_name].audio_paths_post_processing != {}
):
audio_paths_post_processing = dialog.audio_step_3_filepaths[room_name]["audio_paths_post_processing"]
audio_paths_post_processing = dialog.audio_step_3_filepaths[room_name].audio_paths_post_processing
logger.info(
f"Existing audio paths for the post processing stage "
f"already exist for room name: '{room_name}' and are kept unchanged"
)
else:
audio_paths_post_processing = {}

dialog.audio_step_3_filepaths[room_name] = {
"audio_path": current_room_audio_path,
"microphone_position": room.mic_position,
"room_name": room_name,
"room": room,
"source_volumes": source_volumes,
"kwargs_pyroom": kwargs_pyroom,
"background_effect": background_effect,
"foreground_effect": foreground_effect,
"foreground_effect_position": foreground_effect_position,
"audio_paths_post_processing": audio_paths_post_processing
}
dialog.audio_step_3_filepaths[room_name] = RoomAcousticsConfig(
audio_path=current_room_audio_path,
microphone_position=room.mic_position,
room_name=room_name,
room=room,
source_volumes=source_volumes,
kwargs_pyroom=kwargs_pyroom,
background_effect=background_effect,
foreground_effect=foreground_effect,
foreground_effect_position=foreground_effect_position,
audio_paths_post_processing=audio_paths_post_processing,
)

return dialog
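pyroomacoustics hands `callback_mix` the per-source simulation results before summing them (a `premix` array of shape `(n_sources, n_mics, n_samples)`), along with `callback_mix_kwargs`. A hypothetical callback compatible with the wiring in this diff — note that `generate_audio_room_accoustic` injects `dialog` into the kwargs, so a callback should accept it:

```python
import numpy as np

def snr_mix(premix, snr=20.0, ref_src=0, dialog=None):
    """Scale everything but source `ref_src` to sit `snr` dB below it, then sum.

    Hypothetical example callback; `dialog` is accepted (and ignored here)
    because generate_audio_room_accoustic adds it to callback_mix_kwargs.
    """
    ref = premix[ref_src]
    rest = premix.sum(axis=0) - ref
    p_ref, p_rest = np.mean(ref ** 2), np.mean(rest ** 2)
    if p_rest > 0:
        rest = rest * np.sqrt(p_ref / (p_rest * 10.0 ** (snr / 10.0)))
    return ref + rest

# Dummy premix: one speaker at unit level, one louder "noise" source.
premix = np.stack([np.ones((1, 100)), 2.0 * np.ones((1, 100))])
mixed = snr_mix(premix, snr=20.0)
```

Such a function would be passed as `callback_mix_fn=snr_mix` to `to_audio` or `generate_audio_room_accoustic`.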
36 changes: 32 additions & 4 deletions src/sdialog/audio/acoustics_simulator.py
@@ -49,10 +49,10 @@
import os
import numpy as np
import soundfile as sf
from typing import List
from typing import List, Callable, Optional

from sdialog.audio.utils import logger, SourceVolume
from sdialog.audio.room import Room, AudioSource, RoomPosition, DirectivityType
from sdialog.audio.room import Room, AudioSource, RoomPosition, DirectivityType, Position3D


class AcousticsSimulator:
@@ -230,6 +230,8 @@ def _add_sources(

for i, audio_source in enumerate(audiosources):

# audio_source.position = audio_source.position.replace("sfx|", "")

self.audiosources.append(audio_source)

# Get the position of the audio source
@@ -242,6 +244,22 @@
elif audio_source.position.startswith("speaker_"): # speaker_ is the speaker sound
_position3d = self.room.speakers_positions[audio_source.position]

# Check if the position corresponds to a furniture
elif audio_source.position in self.room.furnitures:
furniture = self.room.furnitures[audio_source.position]
_position3d = Position3D(
furniture.x + furniture.width / 2,
furniture.y + furniture.depth / 2,
furniture.get_top_z()
)

else:
logger.warning(
f"Unknown position '{audio_source.position}' for audio source '{audio_source.name}'. "
"Placing it at the center of the room."
)
_position3d = self.room.room_position_to_position3d(RoomPosition.CENTER)

# Load the audio file from the file system for the audio source
if audio_source.source_file and os.path.exists(audio_source.source_file):

@@ -279,7 +297,9 @@ def simulate(
self,
sources: List[AudioSource] = [],
source_volumes: dict[str, SourceVolume] = {},
reset: bool = False
reset: bool = False,
callback_mix_fn: Optional[Callable] = None,
callback_mix_kwargs: Optional[dict] = None
):
"""
Simulates room acoustics for the given audio sources.
@@ -301,6 +321,10 @@
:type source_volumes: dict[str, SourceVolume]
:param reset: If True, resets the room acoustics simulator before simulation.
:type reset: bool
:param callback_mix_fn: Callback function to apply to the mixed audio.
:type callback_mix_fn: Optional[Callable]
:param callback_mix_kwargs: Keyword arguments for the callback function.
:type callback_mix_kwargs: dict
:return: Processed audio with room acoustics effects applied.
:rtype: np.ndarray
:raises ValueError: If audio sources are invalid or empty.
@@ -316,7 +340,10 @@
self._add_sources(sources, source_volumes)

logger.info("[Step 3] Simulating room acoustics...")
self._pyroom.simulate()
self._pyroom.simulate(
callback_mix=callback_mix_fn if callback_mix_fn is not None else None,
callback_mix_kwargs=callback_mix_kwargs if callback_mix_fn is not None else {}
)

except ValueError as e:

@@ -361,6 +388,7 @@ def reset(self):

del self._pyroom
self._pyroom = None
self.audiosources = []

@staticmethod
def apply_snr(x, snr):
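The furniture branch added to `_add_sources` earlier in this file's diff places a sound source at the centre of the furniture's top surface. A minimal stand-in (the `Furniture` class here is hypothetical; the real objects live in `sdialog.audio.room`):

```python
from dataclasses import dataclass

@dataclass
class Furniture:
    # Hypothetical stand-in: footprint origin (x, y), size, and height.
    x: float
    y: float
    width: float
    depth: float
    height: float

    def get_top_z(self) -> float:
        return self.height

def furniture_source_position(f):
    """Centre of the top surface, mirroring the Position3D built in _add_sources."""
    return (f.x + f.width / 2, f.y + f.depth / 2, f.get_top_z())

desk = Furniture(x=1.0, y=2.0, width=1.2, depth=0.6, height=0.75)
pos = furniture_source_position(desk)  # ≈ (1.6, 2.3, 0.75)
```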