3 files changed: +18 -9 lines changed
Original file line number Diff line number Diff line change @@ -506,7 +506,9 @@ def persona_to_voice(
506506 persona ["voice" ] = voices [role ]
507507
508508 # If the voice of the speaker is provided as an identifier (like "am_echo")
509- elif isinstance (voices [role ], tuple ):
509+ # check if first item is str to avoid (array, sampling_rate) tuple cases
510+ elif (isinstance (voices [role ], tuple ) and isinstance (voices [role ][0 ], str )
511+ and voice_database is not None ):
510512 _identifier , _language = voices [role ]
511513 persona ["voice" ] = voice_database .get_voice_by_identifier (
512514 _identifier ,
Original file line number Diff line number Diff line change @@ -99,11 +99,12 @@ def generate(self,
9999 if "language" not in tts_pipeline_kwargs :
100100 tts_pipeline_kwargs ["language" ] = "English"
101101
102- if type (speaker_voice ) is str :
103- tts_pipeline_kwargs ["ref_audio" ] = speaker_voice # Path to reference audio
104- tts_pipeline_kwargs ["ref_text" ] = text # TODO: should be the transcription of ref_audio
105- elif speaker_voice is not None :
106- tts_pipeline_kwargs ["voice_clone_prompt" ] = speaker_voice
102+ if speaker_voice is not None :
103+ tts_pipeline_kwargs ["ref_audio" ] = speaker_voice # Path to reference audio or (array, sampling_rate) tuple
104+ # tts_pipeline_kwargs["ref_text"] = ref_text # TODO: should be the transcription of ref_audio
105+ tts_pipeline_kwargs ["x_vector_only_mode" ] = True
106+ else :
107+ raise ValueError ("speaker_voice must be provided for voice cloning in Qwen3TTSVoiceClone" )
107108
108109 wavs , sr = self .model .generate_voice_clone (
109110 text = text ,
Original file line number Diff line number Diff line change 5151# SPDX-License-Identifier: MIT
5252import os
5353import random
54- from typing import List , Union
5554from pydantic import BaseModel
55+ from typing import List , Union , Any
5656from collections import defaultdict , Counter
5757
5858from sdialog .audio .utils import logger
@@ -130,7 +130,7 @@ class Voice(BaseModel):
130130 gender : str = ""
131131 age : int = 0
132132 identifier : str = ""
133- voice : str # Can be a path or the voice string
133+ voice : Any # Can be a path, the voice string, or (array, sampling_rate) tuple
134134 language : str = "english"
135135 language_code : str = "a"
136136
@@ -750,7 +750,13 @@ def populate(self) -> dict:
750750 )
751751
752752 if "audio" in d and d ["audio" ] is not None :
753- _voice = d ["audio" ]["path" ]
753+ if "array" in d ["audio" ] and "sampling_rate" in d ["audio" ]:
754+ _voice = (d ["audio" ]["array" ], d ["audio" ]["sampling_rate" ])
755+ elif "path" in d ["audio" ] and d ["audio" ]["path" ] is not None :
756+ _voice = d ["audio" ]["path" ]
757+ else :
758+ raise ValueError ("Audio field found but does not contain valid audio data "
759+ "(missing 'array'/'sampling_rate' or 'path')" )
754760 elif "voice" in d and d ["voice" ] is not None :
755761 _voice = d ["voice" ]
756762 else :
You can’t perform that action at this time.
0 commit comments