Add voice database support for voice cloning

sergioburdisso · sergioburdisso · commit d61ddb9148bf · 2026-02-13T10:47:15.000+01:00
Additionally, you can now pass raw audio as an `(array, sampling_rate)` tuple for each voice:

```python
dialog.to_audio(
    "path/to/output/audio",
    voices={
        "Doctor": (wav1, sr),
        "Patient":  (wav2, sr),
    }
)
```
diff --git a/src/sdialog/audio/dialog.py b/src/sdialog/audio/dialog.py
@@ -506,7 +506,9 @@ def persona_to_voice(
                 persona["voice"] = voices[role]
 
             # If the voice of the speaker is provided as an identifier (like "am_echo")
-            elif isinstance(voices[role], tuple):
+            # check if first item is str to avoid (array, sampling_rate) tuple cases
+            elif (isinstance(voices[role], tuple) and isinstance(voices[role][0], str)
+                  and voice_database is not None):
                 _identifier, _language = voices[role]
                 persona["voice"] = voice_database.get_voice_by_identifier(
                     _identifier,
diff --git a/src/sdialog/audio/tts/qwen3/tts.py b/src/sdialog/audio/tts/qwen3/tts.py
@@ -99,11 +99,12 @@ def generate(self,
         if "language" not in tts_pipeline_kwargs:
             tts_pipeline_kwargs["language"] = "English"
 
-        if type(speaker_voice) is str:
-            tts_pipeline_kwargs["ref_audio"] = speaker_voice  # Path to reference audio
-            tts_pipeline_kwargs["ref_text"] = text  # TODO: should be the transcription of ref_audio
-        elif speaker_voice is not None:
-            tts_pipeline_kwargs["voice_clone_prompt"] = speaker_voice
+        if speaker_voice is not None:
+            tts_pipeline_kwargs["ref_audio"] = speaker_voice  # Path to reference audio or (array, sampling_rate) tuple
+            # tts_pipeline_kwargs["ref_text"] = ref_text  # TODO: should be the transcription of ref_audio
+            tts_pipeline_kwargs["x_vector_only_mode"] = True
+        else:
+            raise ValueError("speaker_voice must be provided for voice cloning in Qwen3TTSVoiceClone")
 
         wavs, sr = self.model.generate_voice_clone(
             text=text,
diff --git a/src/sdialog/audio/voice_database.py b/src/sdialog/audio/voice_database.py
@@ -51,8 +51,8 @@
 # SPDX-License-Identifier: MIT
 import os
 import random
-from typing import List, Union
 from pydantic import BaseModel
+from typing import List, Union, Any
 from collections import defaultdict, Counter
 
 from sdialog.audio.utils import logger
@@ -130,7 +130,7 @@ class Voice(BaseModel):
     gender: str = ""
     age: int = 0
     identifier: str = ""
-    voice: str  # Can be a path or the voice string
+    voice: Any  # Can be a path, the voice string, or (array, sampling_rate) tuple
     language: str = "english"
     language_code: str = "a"
 
@@ -750,7 +750,13 @@ def populate(self) -> dict:
                 )
 
             if "audio" in d and d["audio"] is not None:
-                _voice = d["audio"]["path"]
+                if "array" in d["audio"] and "sampling_rate" in d["audio"]:
+                    _voice = (d["audio"]["array"], d["audio"]["sampling_rate"])
+                elif "path" in d["audio"] and d["audio"]["path"] is not None:
+                    _voice = d["audio"]["path"]
+                else:
+                    raise ValueError("Audio field found but does not contain valid audio data "
+                                     "(missing 'array'/'sampling_rate' or 'path')")
             elif "voice" in d and d["voice"] is not None:
                 _voice = d["voice"]
             else: