Skip to content

Commit 86b5291

Browse files
committed
feat: add streaming output and online voice clone to gradio demo
Signed-off-by: lishunyang <lishunyang12@163.com>
1 parent 31974c5 commit 86b5291

File tree

1 file changed

+182
-39
lines changed

1 file changed

+182
-39
lines changed

examples/online_serving/qwen3_tts/gradio_demo.py

Lines changed: 182 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,11 @@
33
Supports all 3 task types:
44
- CustomVoice: Predefined speaker with optional style instructions
55
- VoiceDesign: Natural language voice description
6-
- Base: Voice cloning from reference audio
6+
- Base: Voice cloning from reference audio (upload or URL)
7+
8+
Features:
9+
- Streaming audio output (progressive playback via PCM chunks)
10+
- Online voice cloning via reference audio URL
711
812
Usage:
913
# Start the server first (see run_server.sh), then:
@@ -37,6 +41,8 @@
3741

3842
TASK_TYPES = ["CustomVoice", "VoiceDesign", "Base"]
3943

44+
PCM_SAMPLE_RATE = 24000
45+
4046

4147
def fetch_voices(api_base: str) -> list[str]:
4248
"""Fetch available voices from the server."""
@@ -71,35 +77,36 @@ def encode_audio_to_base64(audio_data: tuple) -> str:
7177
return f"data:audio/wav;base64,{wav_b64}"
7278

7379

74-
def generate_speech(
75-
api_base: str,
80+
def _build_payload(
7681
text: str,
7782
task_type: str,
7883
voice: str,
7984
language: str,
8085
instructions: str,
8186
ref_audio: tuple | None,
87+
ref_audio_url: str,
8288
ref_text: str,
8389
response_format: str,
8490
speed: float,
85-
):
86-
"""Call /v1/audio/speech and return audio for Gradio."""
91+
stream: bool,
92+
) -> dict:
93+
"""Build the /v1/audio/speech request payload."""
8794
if not text or not text.strip():
8895
raise gr.Error("Please enter text to synthesize.")
8996

90-
# Build request payload
9197
payload = {
9298
"input": text.strip(),
93-
"response_format": response_format,
94-
"speed": speed,
99+
"response_format": "pcm" if stream else response_format,
100+
"stream": stream,
95101
}
102+
if not stream:
103+
payload["speed"] = speed
96104

97105
if task_type:
98106
payload["task_type"] = task_type
99107
if language:
100108
payload["language"] = language
101109

102-
# Task-specific parameters
103110
if task_type == "CustomVoice":
104111
if voice:
105112
payload["voice"] = voice
@@ -112,13 +119,47 @@ def generate_speech(
112119
payload["instructions"] = instructions.strip()
113120

114121
elif task_type == "Base":
115-
if ref_audio is None:
116-
raise gr.Error("Base (voice clone) task requires reference audio.")
117-
payload["ref_audio"] = encode_audio_to_base64(ref_audio)
122+
ref_audio_url_stripped = ref_audio_url.strip() if ref_audio_url else ""
123+
if ref_audio_url_stripped:
124+
payload["ref_audio"] = ref_audio_url_stripped
125+
elif ref_audio is not None:
126+
payload["ref_audio"] = encode_audio_to_base64(ref_audio)
127+
else:
128+
raise gr.Error("Base (voice clone) task requires reference audio. Upload a file or provide a URL.")
118129
if ref_text and ref_text.strip():
119130
payload["ref_text"] = ref_text.strip()
120131

121-
# Call the API
132+
return payload
133+
134+
135+
def generate_speech(
136+
api_base: str,
137+
text: str,
138+
task_type: str,
139+
voice: str,
140+
language: str,
141+
instructions: str,
142+
ref_audio: tuple | None,
143+
ref_audio_url: str,
144+
ref_text: str,
145+
response_format: str,
146+
speed: float,
147+
):
148+
"""Non-streaming: call /v1/audio/speech and return audio."""
149+
payload = _build_payload(
150+
text,
151+
task_type,
152+
voice,
153+
language,
154+
instructions,
155+
ref_audio,
156+
ref_audio_url,
157+
ref_text,
158+
response_format,
159+
speed,
160+
stream=False,
161+
)
162+
122163
try:
123164
with httpx.Client(timeout=300.0) as client:
124165
resp = client.post(
@@ -137,7 +178,6 @@ def generate_speech(
137178
if resp.status_code != 200:
138179
raise gr.Error(f"Server error ({resp.status_code}): {resp.text}")
139180

140-
# Check for JSON error response
141181
content_type = resp.headers.get("content-type", "")
142182
if "application/json" in content_type:
143183
try:
@@ -146,8 +186,10 @@ def generate_speech(
146186
except ValueError:
147187
pass
148188

149-
# Decode audio response
150189
try:
190+
if response_format == "pcm":
191+
audio_np = np.frombuffer(resp.content, dtype=np.int16).astype(np.float32) / 32767.0
192+
return (PCM_SAMPLE_RATE, audio_np)
151193
audio_np, sample_rate = sf.read(io.BytesIO(resp.content))
152194
if audio_np.ndim > 1:
153195
audio_np = audio_np[:, 0]
@@ -156,34 +198,107 @@ def generate_speech(
156198
raise gr.Error(f"Failed to decode audio response: {e}")
157199

158200

201+
def generate_speech_stream(
    api_base: str,
    text: str,
    task_type: str,
    voice: str,
    language: str,
    instructions: str,
    ref_audio: tuple | None,
    ref_audio_url: str,
    ref_text: str,
    response_format: str,
    speed: float,
):
    """Streaming: yield progressive audio as PCM chunks arrive.

    Builds the request with ``stream=True`` (``_build_payload`` then forces
    the ``pcm`` response format and omits ``speed``), POSTs it to
    ``{api_base}/v1/audio/speech`` and decodes the response body
    incrementally as 16-bit integer PCM.

    Yields:
        ``(PCM_SAMPLE_RATE, samples)`` tuples, where ``samples`` is a float32
        array holding all audio decoded so far (cumulative, not per-chunk).

    Raises:
        gr.Error: On invalid input (raised by ``_build_payload``), a non-200
            response, a request timeout, or a connection failure.
    """
    payload = _build_payload(
        text,
        task_type,
        voice,
        language,
        instructions,
        ref_audio,
        ref_audio_url,
        ref_text,
        response_format,
        speed,
        stream=True,
    )

    bytes_per_sample = np.dtype(np.int16).itemsize
    all_samples = []
    # Network chunk boundaries are not aligned to sample boundaries, so a
    # chunk may end in the middle of a 2-byte sample. Keep the dangling
    # byte(s) here and prepend them to the next chunk; decoding an odd-length
    # buffer with np.frombuffer would raise ValueError.
    pending = b""
    try:
        with httpx.Client(timeout=300.0) as client:
            with client.stream(
                "POST",
                f"{api_base}/v1/audio/speech",
                json=payload,
                headers={
                    "Content-Type": "application/json",
                    "Authorization": "Bearer EMPTY",
                },
            ) as resp:
                if resp.status_code != 200:
                    # A streamed response body must be read explicitly before
                    # resp.text can be accessed.
                    resp.read()
                    raise gr.Error(f"Server error ({resp.status_code}): {resp.text}")
                for chunk in resp.iter_bytes():
                    if not chunk:
                        continue
                    data = pending + chunk
                    usable = len(data) - len(data) % bytes_per_sample
                    pending = data[usable:]
                    if not usable:
                        # Not even one complete sample yet; wait for more data.
                        continue
                    samples = np.frombuffer(data[:usable], dtype=np.int16).astype(np.float32) / 32767.0
                    all_samples.append(samples)
                    combined = np.concatenate(all_samples)
                    # NOTE(review): the output component is created with
                    # streaming=True; confirm whether Gradio expects
                    # incremental chunks rather than this cumulative buffer.
                    yield (PCM_SAMPLE_RATE, combined)
                # Any byte still in `pending` is an incomplete trailing sample
                # and is intentionally dropped.
    except httpx.TimeoutException:
        raise gr.Error("Request timed out. The server may be busy.")
    except httpx.ConnectError:
        raise gr.Error(f"Cannot connect to server at {api_base}. Make sure the vLLM server is running.")
255+
256+
159257
def on_task_type_change(task_type: str):
160258
"""Update UI visibility based on selected task type."""
161259
if task_type == "CustomVoice":
162260
return (
163261
gr.update(visible=True), # voice dropdown
164262
gr.update(visible=True, info="Optional style/emotion instructions"),
165263
gr.update(visible=False), # ref_audio
264+
gr.update(visible=False), # ref_audio_url
166265
gr.update(visible=False), # ref_text
167266
)
168267
elif task_type == "VoiceDesign":
169268
return (
170-
gr.update(visible=False), # voice dropdown
269+
gr.update(visible=False),
171270
gr.update(visible=True, info="Required: describe the voice style"),
172-
gr.update(visible=False), # ref_audio
173-
gr.update(visible=False), # ref_text
271+
gr.update(visible=False),
272+
gr.update(visible=False),
273+
gr.update(visible=False),
174274
)
175275
elif task_type == "Base":
176276
return (
177-
gr.update(visible=False), # voice dropdown
178-
gr.update(visible=False), # instructions
179-
gr.update(visible=True), # ref_audio
180-
gr.update(visible=True), # ref_text
277+
gr.update(visible=False),
278+
gr.update(visible=False),
279+
gr.update(visible=True),
280+
gr.update(visible=True),
281+
gr.update(visible=True),
181282
)
182283
return (
183284
gr.update(visible=True),
184285
gr.update(visible=True),
185286
gr.update(visible=False),
186287
gr.update(visible=False),
288+
gr.update(visible=False),
289+
)
290+
291+
292+
def on_stream_change(stream: bool):
    """Sync the format and speed controls with the streaming checkbox.

    Returns a pair of Gradio updates, in order: one for the response-format
    control and one for the speed control. With streaming on, the format is
    set to "pcm" and both controls become non-interactive; with streaming
    off, the format is reset to "wav" and both controls are interactive again.
    """
    editable = not stream
    selected_format = "pcm" if stream else "wav"
    format_update = gr.update(value=selected_format, interactive=editable)
    speed_update = gr.update(interactive=editable)
    return (format_update, speed_update)
188303

189304

@@ -235,19 +350,25 @@ def build_interface(api_base: str):
235350
# Instructions (CustomVoice optional, VoiceDesign required)
236351
instructions = gr.Textbox(
237352
label="Instructions",
238-
placeholder=("e.g., Speak with excitement / A warm, friendly female voice"),
353+
placeholder="e.g., Speak with excitement / A warm, friendly female voice",
239354
lines=2,
240355
visible=True,
241356
info="Optional style/emotion instructions",
242357
)
243358

244359
# Base (voice clone) controls
245360
ref_audio = gr.Audio(
246-
label="Reference Audio (for voice cloning)",
361+
label="Reference Audio (upload for voice cloning)",
247362
type="numpy",
248363
sources=["upload", "microphone"],
249364
visible=False,
250365
)
366+
ref_audio_url = gr.Textbox(
367+
label="Reference Audio URL",
368+
placeholder="https://example.com/reference.wav (alternative to uploading)",
369+
lines=1,
370+
visible=False,
371+
)
251372
ref_text = gr.Textbox(
252373
label="Reference Audio Transcript",
253374
placeholder="Transcript of the reference audio (optional, improves quality)",
@@ -270,6 +391,12 @@ def build_interface(api_base: str):
270391
label="Speed",
271392
scale=1,
272393
)
394+
stream_checkbox = gr.Checkbox(
395+
label="Stream output",
396+
value=False,
397+
info="Enable streaming (uses PCM format, speed control disabled)",
398+
scale=1,
399+
)
273400

274401
generate_btn = gr.Button(
275402
"Generate Speech",
@@ -283,37 +410,53 @@ def build_interface(api_base: str):
283410
audio_output = gr.Audio(
284411
label="Generated Audio",
285412
interactive=False,
413+
streaming=True,
286414
)
287415
gr.Markdown(
288416
"### Task Types\n"
289417
"- **CustomVoice**: Use a predefined speaker "
290418
"(Vivian, Ryan, etc.) with optional style instructions\n"
291419
"- **VoiceDesign**: Describe the desired voice in natural "
292420
"language (instructions required)\n"
293-
"- **Base**: Clone a voice from reference audio"
421+
"- **Base**: Clone a voice from reference audio "
422+
"(upload a file or provide a URL)"
294423
)
295424

296425
# Dynamic UI updates
297426
task_type.change(
298427
fn=on_task_type_change,
299428
inputs=[task_type],
300-
outputs=[voice, instructions, ref_audio, ref_text],
429+
outputs=[voice, instructions, ref_audio, ref_audio_url, ref_text],
430+
)
431+
432+
stream_checkbox.change(
433+
fn=on_stream_change,
434+
inputs=[stream_checkbox],
435+
outputs=[response_format, speed],
301436
)
302437

303-
# Generate button
438+
all_inputs = [
439+
text_input,
440+
task_type,
441+
voice,
442+
language,
443+
instructions,
444+
ref_audio,
445+
ref_audio_url,
446+
ref_text,
447+
response_format,
448+
speed,
449+
]
450+
451+
def dispatch(stream_enabled, *args):
452+
if stream_enabled:
453+
yield from generate_speech_stream(api_base, *args)
454+
else:
455+
yield generate_speech(api_base, *args)
456+
304457
generate_btn.click(
305-
fn=lambda *args: generate_speech(api_base, *args),
306-
inputs=[
307-
text_input,
308-
task_type,
309-
voice,
310-
language,
311-
instructions,
312-
ref_audio,
313-
ref_text,
314-
response_format,
315-
speed,
316-
],
458+
fn=dispatch,
459+
inputs=[stream_checkbox] + all_inputs,
317460
outputs=[audio_output],
318461
)
319462

0 commit comments

Comments
 (0)