Skip to content

Commit 9af4eb0

Browse files
committed
feat: add streaming output and online voice clone to gradio demo
Signed-off-by: lishunyang <lishunyang12@163.com>
1 parent 31974c5 commit 9af4eb0

File tree

1 file changed

+193
-43
lines changed

1 file changed

+193
-43
lines changed

examples/online_serving/qwen3_tts/gradio_demo.py

Lines changed: 193 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
Supports all 3 task types:
44
- CustomVoice: Predefined speaker with optional style instructions
55
- VoiceDesign: Natural language voice description
6-
- Base: Voice cloning from reference audio
6+
- Base: Voice cloning from reference audio (upload or URL)
7+
8+
Features:
9+
- Streaming audio output (progressive playback via PCM chunks)
10+
- Online voice cloning via reference audio URL
711
812
Usage:
913
# Start the server first (see run_server.sh), then:
@@ -37,6 +41,8 @@
3741

3842
TASK_TYPES = ["CustomVoice", "VoiceDesign", "Base"]
3943

44+
PCM_SAMPLE_RATE = 24000
45+
4046

4147
def fetch_voices(api_base: str) -> list[str]:
4248
"""Fetch available voices from the server."""
@@ -71,35 +77,36 @@ def encode_audio_to_base64(audio_data: tuple) -> str:
7177
return f"data:audio/wav;base64,{wav_b64}"
7278

7379

74-
def generate_speech(
75-
api_base: str,
80+
def _build_payload(
7681
text: str,
7782
task_type: str,
7883
voice: str,
7984
language: str,
8085
instructions: str,
8186
ref_audio: tuple | None,
87+
ref_audio_url: str,
8288
ref_text: str,
8389
response_format: str,
8490
speed: float,
85-
):
86-
"""Call /v1/audio/speech and return audio for Gradio."""
91+
stream: bool,
92+
) -> dict:
93+
"""Build the /v1/audio/speech request payload."""
8794
if not text or not text.strip():
8895
raise gr.Error("Please enter text to synthesize.")
8996

90-
# Build request payload
9197
payload = {
9298
"input": text.strip(),
93-
"response_format": response_format,
94-
"speed": speed,
99+
"response_format": "pcm" if stream else response_format,
100+
"stream": stream,
95101
}
102+
if not stream:
103+
payload["speed"] = speed
96104

97105
if task_type:
98106
payload["task_type"] = task_type
99107
if language:
100108
payload["language"] = language
101109

102-
# Task-specific parameters
103110
if task_type == "CustomVoice":
104111
if voice:
105112
payload["voice"] = voice
@@ -112,13 +119,42 @@ def generate_speech(
112119
payload["instructions"] = instructions.strip()
113120

114121
elif task_type == "Base":
115-
if ref_audio is None:
116-
raise gr.Error("Base (voice clone) task requires reference audio.")
117-
payload["ref_audio"] = encode_audio_to_base64(ref_audio)
122+
ref_audio_url_stripped = ref_audio_url.strip() if ref_audio_url else ""
123+
if ref_audio_url_stripped:
124+
payload["ref_audio"] = ref_audio_url_stripped
125+
elif ref_audio is not None:
126+
payload["ref_audio"] = encode_audio_to_base64(ref_audio)
127+
else:
128+
raise gr.Error(
129+
"Base (voice clone) task requires reference audio. "
130+
"Upload a file or provide a URL."
131+
)
118132
if ref_text and ref_text.strip():
119133
payload["ref_text"] = ref_text.strip()
120134

121-
# Call the API
135+
return payload
136+
137+
138+
def generate_speech(
139+
api_base: str,
140+
text: str,
141+
task_type: str,
142+
voice: str,
143+
language: str,
144+
instructions: str,
145+
ref_audio: tuple | None,
146+
ref_audio_url: str,
147+
ref_text: str,
148+
response_format: str,
149+
speed: float,
150+
):
151+
"""Non-streaming: call /v1/audio/speech and return audio."""
152+
payload = _build_payload(
153+
text, task_type, voice, language, instructions,
154+
ref_audio, ref_audio_url, ref_text, response_format, speed,
155+
stream=False,
156+
)
157+
122158
try:
123159
with httpx.Client(timeout=300.0) as client:
124160
resp = client.post(
@@ -132,12 +168,14 @@ def generate_speech(
132168
except httpx.TimeoutException:
133169
raise gr.Error("Request timed out. The server may be busy.")
134170
except httpx.ConnectError:
135-
raise gr.Error(f"Cannot connect to server at {api_base}. Make sure the vLLM server is running.")
171+
raise gr.Error(
172+
f"Cannot connect to server at {api_base}. "
173+
"Make sure the vLLM server is running."
174+
)
136175

137176
if resp.status_code != 200:
138177
raise gr.Error(f"Server error ({resp.status_code}): {resp.text}")
139178

140-
# Check for JSON error response
141179
content_type = resp.headers.get("content-type", "")
142180
if "application/json" in content_type:
143181
try:
@@ -146,8 +184,13 @@ def generate_speech(
146184
except ValueError:
147185
pass
148186

149-
# Decode audio response
150187
try:
188+
if response_format == "pcm":
189+
audio_np = (
190+
np.frombuffer(resp.content, dtype=np.int16).astype(np.float32)
191+
/ 32767.0
192+
)
193+
return (PCM_SAMPLE_RATE, audio_np)
151194
audio_np, sample_rate = sf.read(io.BytesIO(resp.content))
152195
if audio_np.ndim > 1:
153196
audio_np = audio_np[:, 0]
@@ -156,34 +199,107 @@ def generate_speech(
156199
raise gr.Error(f"Failed to decode audio response: {e}")
157200

158201

202+
def generate_speech_stream(
203+
api_base: str,
204+
text: str,
205+
task_type: str,
206+
voice: str,
207+
language: str,
208+
instructions: str,
209+
ref_audio: tuple | None,
210+
ref_audio_url: str,
211+
ref_text: str,
212+
response_format: str,
213+
speed: float,
214+
):
215+
"""Streaming: yield progressive audio as PCM chunks arrive."""
216+
payload = _build_payload(
217+
text, task_type, voice, language, instructions,
218+
ref_audio, ref_audio_url, ref_text, response_format, speed,
219+
stream=True,
220+
)
221+
222+
all_samples = []
223+
try:
224+
with httpx.Client(timeout=300.0) as client:
225+
with client.stream(
226+
"POST",
227+
f"{api_base}/v1/audio/speech",
228+
json=payload,
229+
headers={
230+
"Content-Type": "application/json",
231+
"Authorization": "Bearer EMPTY",
232+
},
233+
) as resp:
234+
if resp.status_code != 200:
235+
resp.read()
236+
raise gr.Error(
237+
f"Server error ({resp.status_code}): {resp.text}"
238+
)
239+
for chunk in resp.iter_bytes():
240+
if not chunk:
241+
continue
242+
samples = (
243+
np.frombuffer(chunk, dtype=np.int16).astype(np.float32)
244+
/ 32767.0
245+
)
246+
all_samples.append(samples)
247+
combined = np.concatenate(all_samples)
248+
yield (PCM_SAMPLE_RATE, combined)
249+
except httpx.TimeoutException:
250+
raise gr.Error("Request timed out. The server may be busy.")
251+
except httpx.ConnectError:
252+
raise gr.Error(
253+
f"Cannot connect to server at {api_base}. "
254+
"Make sure the vLLM server is running."
255+
)
256+
257+
159258
def on_task_type_change(task_type: str):
160259
"""Update UI visibility based on selected task type."""
161260
if task_type == "CustomVoice":
162261
return (
163-
gr.update(visible=True), # voice dropdown
262+
gr.update(visible=True), # voice dropdown
164263
gr.update(visible=True, info="Optional style/emotion instructions"),
165264
gr.update(visible=False), # ref_audio
265+
gr.update(visible=False), # ref_audio_url
166266
gr.update(visible=False), # ref_text
167267
)
168268
elif task_type == "VoiceDesign":
169269
return (
170-
gr.update(visible=False), # voice dropdown
270+
gr.update(visible=False),
171271
gr.update(visible=True, info="Required: describe the voice style"),
172-
gr.update(visible=False), # ref_audio
173-
gr.update(visible=False), # ref_text
272+
gr.update(visible=False),
273+
gr.update(visible=False),
274+
gr.update(visible=False),
174275
)
175276
elif task_type == "Base":
176277
return (
177-
gr.update(visible=False), # voice dropdown
178-
gr.update(visible=False), # instructions
179-
gr.update(visible=True), # ref_audio
180-
gr.update(visible=True), # ref_text
278+
gr.update(visible=False),
279+
gr.update(visible=False),
280+
gr.update(visible=True),
281+
gr.update(visible=True),
282+
gr.update(visible=True),
181283
)
182284
return (
183285
gr.update(visible=True),
184286
gr.update(visible=True),
185287
gr.update(visible=False),
186288
gr.update(visible=False),
289+
gr.update(visible=False),
290+
)
291+
292+
293+
def on_stream_change(stream: bool):
294+
"""When streaming is enabled, lock format to PCM and disable speed."""
295+
if stream:
296+
return (
297+
gr.update(value="pcm", interactive=False),
298+
gr.update(interactive=False),
299+
)
300+
return (
301+
gr.update(value="wav", interactive=True),
302+
gr.update(interactive=True),
187303
)
188304

189305

@@ -235,22 +351,31 @@ def build_interface(api_base: str):
235351
# Instructions (CustomVoice optional, VoiceDesign required)
236352
instructions = gr.Textbox(
237353
label="Instructions",
238-
placeholder=("e.g., Speak with excitement / A warm, friendly female voice"),
354+
placeholder="e.g., Speak with excitement / "
355+
"A warm, friendly female voice",
239356
lines=2,
240357
visible=True,
241358
info="Optional style/emotion instructions",
242359
)
243360

244361
# Base (voice clone) controls
245362
ref_audio = gr.Audio(
246-
label="Reference Audio (for voice cloning)",
363+
label="Reference Audio (upload for voice cloning)",
247364
type="numpy",
248365
sources=["upload", "microphone"],
249366
visible=False,
250367
)
368+
ref_audio_url = gr.Textbox(
369+
label="Reference Audio URL",
370+
placeholder="https://example.com/reference.wav "
371+
"(alternative to uploading)",
372+
lines=1,
373+
visible=False,
374+
)
251375
ref_text = gr.Textbox(
252376
label="Reference Audio Transcript",
253-
placeholder="Transcript of the reference audio (optional, improves quality)",
377+
placeholder="Transcript of the reference audio "
378+
"(optional, improves quality)",
254379
lines=2,
255380
visible=False,
256381
)
@@ -270,6 +395,13 @@ def build_interface(api_base: str):
270395
label="Speed",
271396
scale=1,
272397
)
398+
stream_checkbox = gr.Checkbox(
399+
label="Stream output",
400+
value=False,
401+
info="Enable streaming (uses PCM format, "
402+
"speed control disabled)",
403+
scale=1,
404+
)
273405

274406
generate_btn = gr.Button(
275407
"Generate Speech",
@@ -283,37 +415,53 @@ def build_interface(api_base: str):
283415
audio_output = gr.Audio(
284416
label="Generated Audio",
285417
interactive=False,
418+
streaming=True,
286419
)
287420
gr.Markdown(
288421
"### Task Types\n"
289422
"- **CustomVoice**: Use a predefined speaker "
290423
"(Vivian, Ryan, etc.) with optional style instructions\n"
291424
"- **VoiceDesign**: Describe the desired voice in natural "
292425
"language (instructions required)\n"
293-
"- **Base**: Clone a voice from reference audio"
426+
"- **Base**: Clone a voice from reference audio "
427+
"(upload a file or provide a URL)"
294428
)
295429

296430
# Dynamic UI updates
297431
task_type.change(
298432
fn=on_task_type_change,
299433
inputs=[task_type],
300-
outputs=[voice, instructions, ref_audio, ref_text],
434+
outputs=[voice, instructions, ref_audio, ref_audio_url, ref_text],
301435
)
302436

303-
# Generate button
437+
stream_checkbox.change(
438+
fn=on_stream_change,
439+
inputs=[stream_checkbox],
440+
outputs=[response_format, speed],
441+
)
442+
443+
all_inputs = [
444+
text_input,
445+
task_type,
446+
voice,
447+
language,
448+
instructions,
449+
ref_audio,
450+
ref_audio_url,
451+
ref_text,
452+
response_format,
453+
speed,
454+
]
455+
456+
def dispatch(stream_enabled, *args):
457+
if stream_enabled:
458+
yield from generate_speech_stream(api_base, *args)
459+
else:
460+
yield generate_speech(api_base, *args)
461+
304462
generate_btn.click(
305-
fn=lambda *args: generate_speech(api_base, *args),
306-
inputs=[
307-
text_input,
308-
task_type,
309-
voice,
310-
language,
311-
instructions,
312-
ref_audio,
313-
ref_text,
314-
response_format,
315-
speed,
316-
],
463+
fn=dispatch,
464+
inputs=[stream_checkbox] + all_inputs,
317465
outputs=[audio_output],
318466
)
319467

@@ -322,7 +470,9 @@ def build_interface(api_base: str):
322470

323471

324472
def parse_args():
325-
parser = argparse.ArgumentParser(description="Gradio demo for Qwen3-TTS online serving.")
473+
parser = argparse.ArgumentParser(
474+
description="Gradio demo for Qwen3-TTS online serving."
475+
)
326476
parser.add_argument(
327477
"--api-base",
328478
default="http://localhost:8000",

0 commit comments

Comments (0)