Skip to content

Commit d61ddb9

Browse files
Add support to voice database for voice cloning
Additionally, you can now pass a raw (array, sample rate) tuple as a voice:

```python
dialog.to_audio(
    "path/to/output/audio",
    voices={
        "Doctor": (wav1, sr),
        "Patient": (wav2, sr),
    }
)
```
1 parent f38fdaf commit d61ddb9

File tree

3 files changed

+18
-9
lines changed

3 files changed

+18
-9
lines changed

src/sdialog/audio/dialog.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,9 @@ def persona_to_voice(
506506
persona["voice"] = voices[role]
507507

508508
# If the voice of the speaker is provided as an identifier (like "am_echo")
509-
elif isinstance(voices[role], tuple):
509+
# check if first item is str to avoid (array, sampling_rate) tuple cases
510+
elif (isinstance(voices[role], tuple) and isinstance(voices[role][0], str)
511+
and voice_database is not None):
510512
_identifier, _language = voices[role]
511513
persona["voice"] = voice_database.get_voice_by_identifier(
512514
_identifier,

src/sdialog/audio/tts/qwen3/tts.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,12 @@ def generate(self,
9999
if "language" not in tts_pipeline_kwargs:
100100
tts_pipeline_kwargs["language"] = "English"
101101

102-
if type(speaker_voice) is str:
103-
tts_pipeline_kwargs["ref_audio"] = speaker_voice # Path to reference audio
104-
tts_pipeline_kwargs["ref_text"] = text # TODO: should be the transcription of ref_audio
105-
elif speaker_voice is not None:
106-
tts_pipeline_kwargs["voice_clone_prompt"] = speaker_voice
102+
if speaker_voice is not None:
103+
tts_pipeline_kwargs["ref_audio"] = speaker_voice # Path to reference audio or (array, sampling_rate) tuple
104+
# tts_pipeline_kwargs["ref_text"] = ref_text # TODO: should be the transcription of ref_audio
105+
tts_pipeline_kwargs["x_vector_only_mode"] = True
106+
else:
107+
raise ValueError("speaker_voice must be provided for voice cloning in Qwen3TTSVoiceClone")
107108

108109
wavs, sr = self.model.generate_voice_clone(
109110
text=text,

src/sdialog/audio/voice_database.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@
5151
# SPDX-License-Identifier: MIT
5252
import os
5353
import random
54-
from typing import List, Union
5554
from pydantic import BaseModel
55+
from typing import List, Union, Any
5656
from collections import defaultdict, Counter
5757

5858
from sdialog.audio.utils import logger
@@ -130,7 +130,7 @@ class Voice(BaseModel):
130130
gender: str = ""
131131
age: int = 0
132132
identifier: str = ""
133-
voice: str # Can be a path or the voice string
133+
voice: Any # Can be a path, the voice string, or (array, sampling_rate) tuple
134134
language: str = "english"
135135
language_code: str = "a"
136136

@@ -750,7 +750,13 @@ def populate(self) -> dict:
750750
)
751751

752752
if "audio" in d and d["audio"] is not None:
753-
_voice = d["audio"]["path"]
753+
if "array" in d["audio"] and "sampling_rate" in d["audio"]:
754+
_voice = (d["audio"]["array"], d["audio"]["sampling_rate"])
755+
elif "path" in d["audio"] and d["audio"]["path"] is not None:
756+
_voice = d["audio"]["path"]
757+
else:
758+
raise ValueError("Audio field found but does not contain valid audio data "
759+
"(missing 'array'/'sampling_rate' or 'path')")
754760
elif "voice" in d and d["voice"] is not None:
755761
_voice = d["voice"]
756762
else:

0 commit comments

Comments
 (0)