33Supports all 3 task types:
44 - CustomVoice: Predefined speaker with optional style instructions
55 - VoiceDesign: Natural language voice description
6- - Base: Voice cloning from reference audio
6+ - Base: Voice cloning from reference audio (upload or URL)
7+
8+ Features:
9+ - Streaming audio output (progressive playback via PCM chunks)
10+ - Online voice cloning via reference audio URL
711
812Usage:
913 # Start the server first (see run_server.sh), then:
3741
3842TASK_TYPES = ["CustomVoice" , "VoiceDesign" , "Base" ]
3943
44+ PCM_SAMPLE_RATE = 24000
45+
4046
4147def fetch_voices (api_base : str ) -> list [str ]:
4248 """Fetch available voices from the server."""
@@ -71,35 +77,36 @@ def encode_audio_to_base64(audio_data: tuple) -> str:
7177 return f"data:audio/wav;base64,{ wav_b64 } "
7278
7379
74- def generate_speech (
75- api_base : str ,
80+ def _build_payload (
7681 text : str ,
7782 task_type : str ,
7883 voice : str ,
7984 language : str ,
8085 instructions : str ,
8186 ref_audio : tuple | None ,
87+ ref_audio_url : str ,
8288 ref_text : str ,
8389 response_format : str ,
8490 speed : float ,
85- ):
86- """Call /v1/audio/speech and return audio for Gradio."""
91+ stream : bool ,
92+ ) -> dict :
93+ """Build the /v1/audio/speech request payload."""
8794 if not text or not text .strip ():
8895 raise gr .Error ("Please enter text to synthesize." )
8996
90- # Build request payload
9197 payload = {
9298 "input" : text .strip (),
93- "response_format" : response_format ,
94- "speed " : speed ,
99+ "response_format" : "pcm" if stream else response_format ,
100+ "stream " : stream ,
95101 }
102+ if not stream :
103+ payload ["speed" ] = speed
96104
97105 if task_type :
98106 payload ["task_type" ] = task_type
99107 if language :
100108 payload ["language" ] = language
101109
102- # Task-specific parameters
103110 if task_type == "CustomVoice" :
104111 if voice :
105112 payload ["voice" ] = voice
@@ -112,13 +119,42 @@ def generate_speech(
112119 payload ["instructions" ] = instructions .strip ()
113120
114121 elif task_type == "Base" :
115- if ref_audio is None :
116- raise gr .Error ("Base (voice clone) task requires reference audio." )
117- payload ["ref_audio" ] = encode_audio_to_base64 (ref_audio )
122+ ref_audio_url_stripped = ref_audio_url .strip () if ref_audio_url else ""
123+ if ref_audio_url_stripped :
124+ payload ["ref_audio" ] = ref_audio_url_stripped
125+ elif ref_audio is not None :
126+ payload ["ref_audio" ] = encode_audio_to_base64 (ref_audio )
127+ else :
128+ raise gr .Error (
129+ "Base (voice clone) task requires reference audio. "
130+ "Upload a file or provide a URL."
131+ )
118132 if ref_text and ref_text .strip ():
119133 payload ["ref_text" ] = ref_text .strip ()
120134
121- # Call the API
135+ return payload
136+
137+
138+ def generate_speech (
139+ api_base : str ,
140+ text : str ,
141+ task_type : str ,
142+ voice : str ,
143+ language : str ,
144+ instructions : str ,
145+ ref_audio : tuple | None ,
146+ ref_audio_url : str ,
147+ ref_text : str ,
148+ response_format : str ,
149+ speed : float ,
150+ ):
151+ """Non-streaming: call /v1/audio/speech and return audio."""
152+ payload = _build_payload (
153+ text , task_type , voice , language , instructions ,
154+ ref_audio , ref_audio_url , ref_text , response_format , speed ,
155+ stream = False ,
156+ )
157+
122158 try :
123159 with httpx .Client (timeout = 300.0 ) as client :
124160 resp = client .post (
@@ -132,12 +168,14 @@ def generate_speech(
132168 except httpx .TimeoutException :
133169 raise gr .Error ("Request timed out. The server may be busy." )
134170 except httpx .ConnectError :
135- raise gr .Error (f"Cannot connect to server at { api_base } . Make sure the vLLM server is running." )
171+ raise gr .Error (
172+ f"Cannot connect to server at { api_base } . "
173+ "Make sure the vLLM server is running."
174+ )
136175
137176 if resp .status_code != 200 :
138177 raise gr .Error (f"Server error ({ resp .status_code } ): { resp .text } " )
139178
140- # Check for JSON error response
141179 content_type = resp .headers .get ("content-type" , "" )
142180 if "application/json" in content_type :
143181 try :
@@ -146,8 +184,13 @@ def generate_speech(
146184 except ValueError :
147185 pass
148186
149- # Decode audio response
150187 try :
188+ if response_format == "pcm" :
189+ audio_np = (
190+ np .frombuffer (resp .content , dtype = np .int16 ).astype (np .float32 )
191+ / 32767.0
192+ )
193+ return (PCM_SAMPLE_RATE , audio_np )
151194 audio_np , sample_rate = sf .read (io .BytesIO (resp .content ))
152195 if audio_np .ndim > 1 :
153196 audio_np = audio_np [:, 0 ]
@@ -156,34 +199,107 @@ def generate_speech(
156199 raise gr .Error (f"Failed to decode audio response: { e } " )
157200
158201
def generate_speech_stream(
    api_base: str,
    text: str,
    task_type: str,
    voice: str,
    language: str,
    instructions: str,
    ref_audio: tuple | None,
    ref_audio_url: str,
    ref_text: str,
    response_format: str,
    speed: float,
):
    """Streaming: yield progressive audio as PCM chunks arrive.

    Builds the request with ``_build_payload(..., stream=True)`` and POSTs it
    to ``{api_base}/v1/audio/speech``, reading the body incrementally. The
    body is interpreted as 16-bit integer PCM and scaled to float32.

    Yields:
        ``(PCM_SAMPLE_RATE, samples)`` after every decodable chunk, where
        ``samples`` is a float32 array holding ALL audio received so far
        (not just the newest chunk).

    Raises:
        gr.Error: if the server answers with a non-200 status, the request
            times out, or the connection cannot be established. Payload
            validation errors from ``_build_payload`` propagate as well.
    """
    payload = _build_payload(
        text, task_type, voice, language, instructions,
        ref_audio, ref_audio_url, ref_text, response_format, speed,
        stream=True,
    )

    all_samples = []
    # Transport chunk boundaries are arbitrary: a chunk may end in the middle
    # of a 2-byte sample. Feeding an odd-length buffer to np.frombuffer with
    # dtype=int16 raises ValueError, so the dangling byte is carried over and
    # prepended to the next chunk.
    leftover = b""
    try:
        with httpx.Client(timeout=300.0) as client:
            with client.stream(
                "POST",
                f"{api_base}/v1/audio/speech",
                json=payload,
                headers={
                    "Content-Type": "application/json",
                    "Authorization": "Bearer EMPTY",
                },
            ) as resp:
                if resp.status_code != 200:
                    # The body of a streamed response must be read
                    # explicitly before resp.text becomes available.
                    resp.read()
                    raise gr.Error(
                        f"Server error ({resp.status_code}): {resp.text}"
                    )
                for chunk in resp.iter_bytes():
                    if not chunk:
                        continue
                    data = leftover + chunk
                    n_samples = len(data) // 2
                    leftover = data[n_samples * 2:]
                    if n_samples == 0:
                        # Only a single byte so far; wait for its pair.
                        continue
                    samples = (
                        np.frombuffer(
                            data, dtype=np.int16, count=n_samples
                        ).astype(np.float32)
                        / 32767.0
                    )
                    all_samples.append(samples)
                    # NOTE(review): every yield carries the full audio so
                    # far; confirm this is what the output component expects
                    # rather than only the newly received samples.
                    combined = np.concatenate(all_samples)
                    yield (PCM_SAMPLE_RATE, combined)
    except httpx.TimeoutException:
        raise gr.Error("Request timed out. The server may be busy.")
    except httpx.ConnectError:
        raise gr.Error(
            f"Cannot connect to server at {api_base}. "
            "Make sure the vLLM server is running."
        )
256+
257+
def on_task_type_change(task_type: str):
    """Return visibility updates for the task-specific input widgets.

    The result is a 5-tuple of ``gr.update`` values, in this order:
    voice dropdown, instructions, ref_audio, ref_audio_url, ref_text.
    An unrecognized task type falls back to showing the voice dropdown and
    the instructions box while hiding all reference-audio inputs.
    """
    show = {"visible": True}
    hide = {"visible": False}

    # One row per task type; columns follow the order documented above.
    layouts = {
        "CustomVoice": (
            show,
            {"visible": True, "info": "Optional style/emotion instructions"},
            hide,
            hide,
            hide,
        ),
        "VoiceDesign": (
            hide,
            {"visible": True, "info": "Required: describe the voice style"},
            hide,
            hide,
            hide,
        ),
        "Base": (hide, hide, show, show, show),
    }
    fallback = (show, show, hide, hide, hide)

    chosen = layouts.get(task_type, fallback)
    return tuple(gr.update(**kwargs) for kwargs in chosen)
291+
292+
def on_stream_change(stream: bool):
    """Return updates for the format and speed controls when streaming toggles.

    With streaming on, the first update sets the value to "pcm" and both
    controls become non-interactive. With streaming off, the first update
    resets the value to "wav" and both controls become interactive again.
    """
    editable = not stream
    format_value = "wav" if editable else "pcm"
    format_update = gr.update(value=format_value, interactive=editable)
    speed_update = gr.update(interactive=editable)
    return (format_update, speed_update)
188304
189305
@@ -235,22 +351,31 @@ def build_interface(api_base: str):
235351 # Instructions (CustomVoice optional, VoiceDesign required)
236352 instructions = gr .Textbox (
237353 label = "Instructions" ,
238- placeholder = ("e.g., Speak with excitement / A warm, friendly female voice" ),
354+ placeholder = "e.g., Speak with excitement / "
355+ "A warm, friendly female voice" ,
239356 lines = 2 ,
240357 visible = True ,
241358 info = "Optional style/emotion instructions" ,
242359 )
243360
244361 # Base (voice clone) controls
245362 ref_audio = gr .Audio (
246- label = "Reference Audio (for voice cloning)" ,
363+ label = "Reference Audio (upload for voice cloning)" ,
247364 type = "numpy" ,
248365 sources = ["upload" , "microphone" ],
249366 visible = False ,
250367 )
368+ ref_audio_url = gr .Textbox (
369+ label = "Reference Audio URL" ,
370+ placeholder = "https://example.com/reference.wav "
371+ "(alternative to uploading)" ,
372+ lines = 1 ,
373+ visible = False ,
374+ )
251375 ref_text = gr .Textbox (
252376 label = "Reference Audio Transcript" ,
253- placeholder = "Transcript of the reference audio (optional, improves quality)" ,
377+ placeholder = "Transcript of the reference audio "
378+ "(optional, improves quality)" ,
254379 lines = 2 ,
255380 visible = False ,
256381 )
@@ -270,6 +395,13 @@ def build_interface(api_base: str):
270395 label = "Speed" ,
271396 scale = 1 ,
272397 )
398+ stream_checkbox = gr .Checkbox (
399+ label = "Stream output" ,
400+ value = False ,
401+ info = "Enable streaming (uses PCM format, "
402+ "speed control disabled)" ,
403+ scale = 1 ,
404+ )
273405
274406 generate_btn = gr .Button (
275407 "Generate Speech" ,
@@ -283,37 +415,53 @@ def build_interface(api_base: str):
283415 audio_output = gr .Audio (
284416 label = "Generated Audio" ,
285417 interactive = False ,
418+ streaming = True ,
286419 )
287420 gr .Markdown (
288421 "### Task Types\n "
289422 "- **CustomVoice**: Use a predefined speaker "
290423 "(Vivian, Ryan, etc.) with optional style instructions\n "
291424 "- **VoiceDesign**: Describe the desired voice in natural "
292425 "language (instructions required)\n "
293- "- **Base**: Clone a voice from reference audio"
426+ "- **Base**: Clone a voice from reference audio "
427+ "(upload a file or provide a URL)"
294428 )
295429
296430 # Dynamic UI updates
297431 task_type .change (
298432 fn = on_task_type_change ,
299433 inputs = [task_type ],
300- outputs = [voice , instructions , ref_audio , ref_text ],
434+ outputs = [voice , instructions , ref_audio , ref_audio_url , ref_text ],
301435 )
302436
303- # Generate button
437+ stream_checkbox .change (
438+ fn = on_stream_change ,
439+ inputs = [stream_checkbox ],
440+ outputs = [response_format , speed ],
441+ )
442+
443+ all_inputs = [
444+ text_input ,
445+ task_type ,
446+ voice ,
447+ language ,
448+ instructions ,
449+ ref_audio ,
450+ ref_audio_url ,
451+ ref_text ,
452+ response_format ,
453+ speed ,
454+ ]
455+
456+ def dispatch (stream_enabled , * args ):
457+ if stream_enabled :
458+ yield from generate_speech_stream (api_base , * args )
459+ else :
460+ yield generate_speech (api_base , * args )
461+
304462 generate_btn .click (
305- fn = lambda * args : generate_speech (api_base , * args ),
306- inputs = [
307- text_input ,
308- task_type ,
309- voice ,
310- language ,
311- instructions ,
312- ref_audio ,
313- ref_text ,
314- response_format ,
315- speed ,
316- ],
463+ fn = dispatch ,
464+ inputs = [stream_checkbox ] + all_inputs ,
317465 outputs = [audio_output ],
318466 )
319467
@@ -322,7 +470,9 @@ def build_interface(api_base: str):
322470
323471
324472def parse_args ():
325- parser = argparse .ArgumentParser (description = "Gradio demo for Qwen3-TTS online serving." )
473+ parser = argparse .ArgumentParser (
474+ description = "Gradio demo for Qwen3-TTS online serving."
475+ )
326476 parser .add_argument (
327477 "--api-base" ,
328478 default = "http://localhost:8000" ,
0 commit comments