Skip to content

Commit dd3e674

Browse files
Merge pull request #123 from qanastek/main
New version of the audio module 🍆
2 parents ce4c911 + cc18caf commit dd3e674

35 files changed

+3697
-523
lines changed

docs/api/sdialog.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,6 @@ sdialog.evaluation.base
112112
:members:
113113
:show-inheritance:
114114

115-
116115
----
117116

118117
sdialog.datasets

docs/sdialog/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -656,7 +656,7 @@ Audio Generation
656656
The audio module of SDialog extends the core functionality by adding comprehensive audio generation and processing capabilities for dialogues. It enables transforming text dialogues into immersive audio experiences with realistic voices and simulated acoustic environments.
657657

658658
Setup and Installation
659-
---------------------
659+
----------------------
660660

661661
To work with audio features in SDialog, you'll need to install additional dependencies and system packages:
662662

requirements-audio-test.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ jams
55
pyloudnorm
66
pyroomacoustics
77
huggingface_hub[cli]
8-
dscaper>=1.7.0
8+
dscaper>=1.7.7

requirements-audio.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ sox
44
jams
55
pyloudnorm
66
pyroomacoustics
7-
datasets<=3.6.0
7+
datasets<=2.21.0
88
huggingface_hub[cli]
9-
dscaper>=1.7.0
10-
qwen-tts
9+
dscaper>=1.7.7
10+
whisper-normalization

src/sdialog/__init__.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ def to_audio(
474474
This is a convenience wrapper around the full `sdialog.audio.pipeline.to_audio` function.
475475
All keyword arguments are passed to it.
476476
477-
:param path: Directory path for storing audio outputs.
477+
:param path: Path to the audio file or directory for storing audio outputs.
478478
:type path: str
479479
:param dialog_dir_name: Custom name for the dialogue directory.
480480
:type dialog_dir_name: str
@@ -510,8 +510,6 @@ def to_audio(
510510
:type audio_file_format: str
511511
:param seed: Seed for random number generator.
512512
:type seed: int
513-
:param re_sampling_rate: Re-sampling rate for the output audio.
514-
:type re_sampling_rate: Optional[int]
515513
:param recording_devices: The identifiers of the recording devices to simulate.
516514
:type recording_devices: Optional[List[Union[RecordingDevice, str]]]
517515
:param impulse_response_database: The database for impulse responses.
@@ -520,6 +518,21 @@ def to_audio(
520518
:type override_tts_audio: Optional[bool]
521519
:param verbose: Verbose mode for logging.
522520
:type verbose: Optional[bool]
521+
:param overlap_pauses: Generate the audio with overlaps and pauses between turns using an LLM.
522+
:type overlap_pauses: Optional[bool]
523+
:param add_sound_effects: Add sound effects (such as door opening, footsteps, etc.) to the audio.
524+
:type add_sound_effects: Optional[bool]
525+
:param sound_effects_dropout: Dropout rate for sound effects.
526+
:type sound_effects_dropout: Optional[float]
527+
:param skip_annotation: Whether to skip the annotation of the sound effects
528+
(if your dialogs are already annotated with sound effects tags, you can skip this step).
529+
:type skip_annotation: Optional[bool]
530+
:param remove_silences: Remove the silences at the beginning and the end of the audio.
531+
:type remove_silences: Optional[bool]
532+
:param callback_mix_fn: Callback function to apply to the mixed audio.
533+
:type callback_mix_fn: Optional[Callable]
534+
:param callback_mix_kwargs: Keyword arguments for the callback function.
535+
:type callback_mix_kwargs: dict
523536
:return: Audio dialogue with processed audio data.
524537
:rtype: "sdialog.audio.dialog.AudioDialog"
525538
:raises Exception: If the audio module is not installed.

src/sdialog/audio/__init__.py

Lines changed: 53 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,14 @@
5858
import numpy as np
5959
from tqdm import tqdm
6060
import soundfile as sf
61-
from typing import Union
61+
from typing import Union, Optional, Callable
6262

6363
from sdialog.audio.tts import BaseTTS
64-
from sdialog.audio.dialog import AudioDialog
6564
from sdialog.audio.room import Room, RoomPosition
66-
from sdialog.audio.utils import AudioUtils, SourceVolume, Role, logger
6765
from sdialog.audio.acoustics_simulator import AcousticsSimulator
6866
from sdialog.audio.voice_database import BaseVoiceDatabase, Voice
67+
from sdialog.audio.dialog import AudioDialog, RoomAcousticsConfig
68+
from sdialog.audio.utils import SourceVolume, Role, logger
6969

7070
device = "cuda" if torch.cuda.is_available() else "cpu"
7171

@@ -79,7 +79,8 @@ def generate_utterances_audios(
7979
keep_duplicate: bool = False,
8080
seed: int = None,
8181
sampling_rate: int = 24_000,
82-
tts_pipeline_kwargs: dict = {}
82+
tts_pipeline_kwargs: dict = {},
83+
remove_silences: bool = True
8384
) -> AudioDialog:
8485
"""
8586
Generates audio for each utterance in an AudioDialog object using the specified TTS engine.
@@ -113,6 +114,8 @@ def generate_utterances_audios(
113114
:type seed: int
114115
:param sampling_rate: Sampling rate for the audio generation.
115116
:type sampling_rate: int
117+
:param remove_silences: If True, remove the silences at the beginning and the end of the audio.
118+
:type remove_silences: bool
116119
:return: The AudioDialog object with generated audio for each turn.
117120
:rtype: AudioDialog
118121
"""
@@ -137,7 +140,7 @@ def generate_utterances_audios(
137140

138141
# Generate the utterance audio
139142
utterance_audio, utterance_sampling_rate = generate_utterance(
140-
text=AudioUtils.remove_audio_tags(turn.text),
143+
text=turn.text,
141144
voice=turn.voice,
142145
tts_pipeline=tts_pipeline,
143146
tts_pipeline_kwargs=tts_pipeline_kwargs
@@ -156,9 +159,16 @@ def generate_utterances_audios(
156159
target_sr=sampling_rate,
157160
)
158161

162+
# Remove the silences at the beginning and the end of the audio
163+
if remove_silences:
164+
utterance_audio, _ = librosa.effects.trim(utterance_audio, top_db=60)
165+
159166
# Set the utterance audio to the turn
160167
turn.set_audio(utterance_audio, sampling_rate)
161168

169+
# Set the audio duration of the turn
170+
turn.audio_duration = utterance_audio.shape[0] / sampling_rate
171+
162172
return dialog
163173

164174

@@ -188,7 +198,12 @@ def generate_utterance(
188198
:return: A tuple containing the audio data as a numpy array and the sampling rate.
189199
:rtype: tuple[np.ndarray, int]
190200
"""
191-
return tts_pipeline.generate(text, speaker_voice=voice, tts_pipeline_kwargs=tts_pipeline_kwargs)
201+
audio, sr = tts_pipeline.generate(text, speaker_voice=voice, tts_pipeline_kwargs=tts_pipeline_kwargs)
202+
203+
if isinstance(audio, torch.Tensor):
204+
audio = audio.cpu().numpy()
205+
206+
return audio, sr
192207

193208

194209
def generate_audio_room_accoustic(
@@ -201,7 +216,9 @@ def generate_audio_room_accoustic(
201216
audio_file_format: str = "wav",
202217
background_effect: str = "white_noise",
203218
foreground_effect: str = "ac_noise_minimal",
204-
foreground_effect_position: RoomPosition = RoomPosition.TOP_RIGHT
219+
foreground_effect_position: RoomPosition = RoomPosition.TOP_RIGHT,
220+
callback_mix_fn: Optional[Callable] = None,
221+
callback_mix_kwargs: dict = {}
205222
) -> AudioDialog:
206223
"""
207224
Generates room acoustics simulation for the dialogue audio.
@@ -237,16 +254,29 @@ def generate_audio_room_accoustic(
237254
:type foreground_effect: str
238255
:param foreground_effect_position: Position for foreground effects.
239256
:type foreground_effect_position: RoomPosition
257+
:param callback_mix_fn: Callback function to apply to the mixed audio.
258+
:type callback_mix_fn: Optional[Callable]
259+
:param callback_mix_kwargs: Keyword arguments for the callback function.
260+
:type callback_mix_kwargs: dict
240261
:return: The AudioDialog with room acoustics simulation results and file paths.
241262
:rtype: AudioDialog
242263
"""
243264

244265
# Create the room acoustics simulator
245266
room_acoustics = AcousticsSimulator(room=room, kwargs_pyroom=kwargs_pyroom)
246267

268+
# Prepare callback kwargs
269+
_callback_mix_kwargs = callback_mix_kwargs.copy() if callback_mix_kwargs is not None else {}
270+
271+
# Add dialog to kwargs if not present
272+
if "dialog" not in _callback_mix_kwargs:
273+
_callback_mix_kwargs["dialog"] = dialog
274+
247275
_audio_accoustic = room_acoustics.simulate(
248276
sources=dialog.get_audio_sources(),
249-
source_volumes=source_volumes
277+
source_volumes=source_volumes,
278+
callback_mix_fn=callback_mix_fn,
279+
callback_mix_kwargs=_callback_mix_kwargs,
250280
)
251281

252282
# Save the audio file
@@ -270,28 +300,28 @@ def generate_audio_room_accoustic(
270300
# If post-processing audio paths already exist in the dialog for this room, reuse them; otherwise create a new dictionary
271301
if (
272302
room_name in dialog.audio_step_3_filepaths
273-
and "audio_paths_post_processing" in dialog.audio_step_3_filepaths[room_name]
274-
and dialog.audio_step_3_filepaths[room_name]["audio_paths_post_processing"] != {}
303+
and dialog.audio_step_3_filepaths[room_name].audio_paths_post_processing is not None
304+
and dialog.audio_step_3_filepaths[room_name].audio_paths_post_processing != {}
275305
):
276-
audio_paths_post_processing = dialog.audio_step_3_filepaths[room_name]["audio_paths_post_processing"]
306+
audio_paths_post_processing = dialog.audio_step_3_filepaths[room_name].audio_paths_post_processing
277307
logger.info(
278308
f"Existing audio paths for the post processing stage "
279309
f"already exist for room name: '{room_name}' and are kept unchanged"
280310
)
281311
else:
282312
audio_paths_post_processing = {}
283313

284-
dialog.audio_step_3_filepaths[room_name] = {
285-
"audio_path": current_room_audio_path,
286-
"microphone_position": room.mic_position,
287-
"room_name": room_name,
288-
"room": room,
289-
"source_volumes": source_volumes,
290-
"kwargs_pyroom": kwargs_pyroom,
291-
"background_effect": background_effect,
292-
"foreground_effect": foreground_effect,
293-
"foreground_effect_position": foreground_effect_position,
294-
"audio_paths_post_processing": audio_paths_post_processing
295-
}
314+
dialog.audio_step_3_filepaths[room_name] = RoomAcousticsConfig(
315+
audio_path=current_room_audio_path,
316+
microphone_position=room.mic_position,
317+
room_name=room_name,
318+
room=room,
319+
source_volumes=source_volumes,
320+
kwargs_pyroom=kwargs_pyroom,
321+
background_effect=background_effect,
322+
foreground_effect=foreground_effect,
323+
foreground_effect_position=foreground_effect_position,
324+
audio_paths_post_processing=audio_paths_post_processing,
325+
)
296326

297327
return dialog

src/sdialog/audio/acoustics_simulator.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,10 @@
4949
import os
5050
import numpy as np
5151
import soundfile as sf
52-
from typing import List
52+
from typing import List, Callable, Optional
5353

5454
from sdialog.audio.utils import logger, SourceVolume
55-
from sdialog.audio.room import Room, AudioSource, RoomPosition, DirectivityType
55+
from sdialog.audio.room import Room, AudioSource, RoomPosition, DirectivityType, Position3D
5656

5757

5858
class AcousticsSimulator:
@@ -230,6 +230,8 @@ def _add_sources(
230230

231231
for i, audio_source in enumerate(audiosources):
232232

233+
# audio_source.position = audio_source.position.replace("sfx|", "")
234+
233235
self.audiosources.append(audio_source)
234236

235237
# Get the position of the audio source
@@ -242,6 +244,22 @@ def _add_sources(
242244
elif audio_source.position.startswith("speaker_"): # speaker_ is the speaker sound
243245
_position3d = self.room.speakers_positions[audio_source.position]
244246

247+
# Check if the position corresponds to a furniture
248+
elif audio_source.position in self.room.furnitures:
249+
furniture = self.room.furnitures[audio_source.position]
250+
_position3d = Position3D(
251+
furniture.x + furniture.width / 2,
252+
furniture.y + furniture.depth / 2,
253+
furniture.get_top_z()
254+
)
255+
256+
else:
257+
logger.warning(
258+
f"Unknown position '{audio_source.position}' for audio source '{audio_source.name}'. "
259+
"Placing it at the center of the room."
260+
)
261+
_position3d = self.room.room_position_to_position3d(RoomPosition.CENTER)
262+
245263
# Load the audio file from the file system for the audio source
246264
if audio_source.source_file and os.path.exists(audio_source.source_file):
247265

@@ -279,7 +297,9 @@ def simulate(
279297
self,
280298
sources: List[AudioSource] = [],
281299
source_volumes: dict[str, SourceVolume] = {},
282-
reset: bool = False
300+
reset: bool = False,
301+
callback_mix_fn: Optional[Callable] = None,
302+
callback_mix_kwargs: Optional[dict] = None
283303
):
284304
"""
285305
Simulates room acoustics for the given audio sources.
@@ -301,6 +321,10 @@ def simulate(
301321
:type source_volumes: dict[str, SourceVolume]
302322
:param reset: If True, resets the room acoustics simulator before simulation.
303323
:type reset: bool
324+
:param callback_mix_fn: Callback function to apply to the mixed audio.
325+
:type callback_mix_fn: Optional[Callable]
326+
:param callback_mix_kwargs: Keyword arguments for the callback function.
327+
:type callback_mix_kwargs: dict
304328
:return: Processed audio with room acoustics effects applied.
305329
:rtype: np.ndarray
306330
:raises ValueError: If audio sources are invalid or empty.
@@ -316,7 +340,10 @@ def simulate(
316340
self._add_sources(sources, source_volumes)
317341

318342
logger.info("[Step 3] Simulating room acoustics...")
319-
self._pyroom.simulate()
343+
self._pyroom.simulate(
344+
callback_mix=callback_mix_fn if callback_mix_fn is not None else None,
345+
callback_mix_kwargs=callback_mix_kwargs if callback_mix_fn is not None else {}
346+
)
320347

321348
except ValueError as e:
322349

@@ -361,6 +388,7 @@ def reset(self):
361388

362389
del self._pyroom
363390
self._pyroom = None
391+
self.audiosources = []
364392

365393
@staticmethod
366394
def apply_snr(x, snr):

0 commit comments

Comments
 (0)