33Supports all 3 task types:
44 - CustomVoice: Predefined speaker with optional style instructions
55 - VoiceDesign: Natural language voice description
6- - Base: Voice cloning from reference audio
6+ - Base: Voice cloning from reference audio (upload or URL)
7+
8+ Features:
9+ - Streaming audio output (progressive playback via PCM chunks)
10+ - Online voice cloning via reference audio URL
711
812Usage:
913 # Start the server first (see run_server.sh), then:
3741
3842TASK_TYPES = ["CustomVoice" , "VoiceDesign" , "Base" ]
3943
44+ PCM_SAMPLE_RATE = 24000
45+
4046
4147def fetch_voices (api_base : str ) -> list [str ]:
4248 """Fetch available voices from the server."""
@@ -71,35 +77,36 @@ def encode_audio_to_base64(audio_data: tuple) -> str:
7177 return f"data:audio/wav;base64,{ wav_b64 } "
7278
7379
74- def generate_speech (
75- api_base : str ,
80+ def _build_payload (
7681 text : str ,
7782 task_type : str ,
7883 voice : str ,
7984 language : str ,
8085 instructions : str ,
8186 ref_audio : tuple | None ,
87+ ref_audio_url : str ,
8288 ref_text : str ,
8389 response_format : str ,
8490 speed : float ,
85- ):
86- """Call /v1/audio/speech and return audio for Gradio."""
91+ stream : bool ,
92+ ) -> dict :
93+ """Build the /v1/audio/speech request payload."""
8794 if not text or not text .strip ():
8895 raise gr .Error ("Please enter text to synthesize." )
8996
90- # Build request payload
9197 payload = {
9298 "input" : text .strip (),
93- "response_format" : response_format ,
94- "speed " : speed ,
99+ "response_format" : "pcm" if stream else response_format ,
100+ "stream " : stream ,
95101 }
102+ if not stream :
103+ payload ["speed" ] = speed
96104
97105 if task_type :
98106 payload ["task_type" ] = task_type
99107 if language :
100108 payload ["language" ] = language
101109
102- # Task-specific parameters
103110 if task_type == "CustomVoice" :
104111 if voice :
105112 payload ["voice" ] = voice
@@ -112,13 +119,47 @@ def generate_speech(
112119 payload ["instructions" ] = instructions .strip ()
113120
114121 elif task_type == "Base" :
115- if ref_audio is None :
116- raise gr .Error ("Base (voice clone) task requires reference audio." )
117- payload ["ref_audio" ] = encode_audio_to_base64 (ref_audio )
122+ ref_audio_url_stripped = ref_audio_url .strip () if ref_audio_url else ""
123+ if ref_audio_url_stripped :
124+ payload ["ref_audio" ] = ref_audio_url_stripped
125+ elif ref_audio is not None :
126+ payload ["ref_audio" ] = encode_audio_to_base64 (ref_audio )
127+ else :
128+ raise gr .Error ("Base (voice clone) task requires reference audio. Upload a file or provide a URL." )
118129 if ref_text and ref_text .strip ():
119130 payload ["ref_text" ] = ref_text .strip ()
120131
121- # Call the API
132+ return payload
133+
134+
135+ def generate_speech (
136+ api_base : str ,
137+ text : str ,
138+ task_type : str ,
139+ voice : str ,
140+ language : str ,
141+ instructions : str ,
142+ ref_audio : tuple | None ,
143+ ref_audio_url : str ,
144+ ref_text : str ,
145+ response_format : str ,
146+ speed : float ,
147+ ):
148+ """Non-streaming: call /v1/audio/speech and return audio."""
149+ payload = _build_payload (
150+ text ,
151+ task_type ,
152+ voice ,
153+ language ,
154+ instructions ,
155+ ref_audio ,
156+ ref_audio_url ,
157+ ref_text ,
158+ response_format ,
159+ speed ,
160+ stream = False ,
161+ )
162+
122163 try :
123164 with httpx .Client (timeout = 300.0 ) as client :
124165 resp = client .post (
@@ -137,7 +178,6 @@ def generate_speech(
137178 if resp .status_code != 200 :
138179 raise gr .Error (f"Server error ({ resp .status_code } ): { resp .text } " )
139180
140- # Check for JSON error response
141181 content_type = resp .headers .get ("content-type" , "" )
142182 if "application/json" in content_type :
143183 try :
@@ -146,8 +186,10 @@ def generate_speech(
146186 except ValueError :
147187 pass
148188
149- # Decode audio response
150189 try :
190+ if response_format == "pcm" :
191+ audio_np = np .frombuffer (resp .content , dtype = np .int16 ).astype (np .float32 ) / 32767.0
192+ return (PCM_SAMPLE_RATE , audio_np )
151193 audio_np , sample_rate = sf .read (io .BytesIO (resp .content ))
152194 if audio_np .ndim > 1 :
153195 audio_np = audio_np [:, 0 ]
@@ -156,34 +198,107 @@ def generate_speech(
156198 raise gr .Error (f"Failed to decode audio response: { e } " )
157199
158200
def generate_speech_stream(
    api_base: str,
    text: str,
    task_type: str,
    voice: str,
    language: str,
    instructions: str,
    ref_audio: tuple | None,
    ref_audio_url: str,
    ref_text: str,
    response_format: str,
    speed: float,
):
    """Streaming: yield progressive audio as PCM chunks arrive.

    Builds the request with ``stream=True``, POSTs it to
    ``{api_base}/v1/audio/speech`` and decodes the response body as 16-bit
    integer PCM, scaled to float32.

    Yields:
        ``(PCM_SAMPLE_RATE, samples)`` after every decodable chunk, where
        ``samples`` is the concatenation of all audio received so far.

    Raises:
        gr.Error: if the payload cannot be built, the server answers with a
            non-200 status, the request times out, or the connection fails.
    """
    payload = _build_payload(
        text,
        task_type,
        voice,
        language,
        instructions,
        ref_audio,
        ref_audio_url,
        ref_text,
        response_format,
        speed,
        stream=True,
    )

    all_samples = []
    # HTTP chunk boundaries are arbitrary and need not fall on a 2-byte sample
    # boundary. Keep the dangling byte (if any) and prepend it to the next
    # chunk; otherwise np.frombuffer raises ValueError on odd-length input and
    # a sample split across two chunks would be decoded incorrectly.
    pending = b""
    try:
        with httpx.Client(timeout=300.0) as client:
            with client.stream(
                "POST",
                f"{api_base}/v1/audio/speech",
                json=payload,
                headers={
                    "Content-Type": "application/json",
                    "Authorization": "Bearer EMPTY",
                },
            ) as resp:
                if resp.status_code != 200:
                    # A streamed body must be read before resp.text is usable.
                    resp.read()
                    raise gr.Error(f"Server error ({resp.status_code}): {resp.text}")
                for chunk in resp.iter_bytes():
                    if not chunk:
                        continue
                    data = pending + chunk
                    usable = len(data) - (len(data) % 2)
                    pending = data[usable:]
                    if not usable:
                        # Only a single byte so far; wait for its other half.
                        continue
                    samples = np.frombuffer(data[:usable], dtype=np.int16).astype(np.float32) / 32767.0
                    all_samples.append(samples)
                    # NOTE(review): every yield carries the full audio so far,
                    # not just the new chunk - confirm this is what the
                    # consuming output component expects.
                    combined = np.concatenate(all_samples)
                    yield (PCM_SAMPLE_RATE, combined)
    except httpx.TimeoutException:
        raise gr.Error("Request timed out. The server may be busy.")
    except httpx.ConnectError:
        raise gr.Error(f"Cannot connect to server at {api_base}. Make sure the vLLM server is running.")
255+
256+
def on_task_type_change(task_type: str):
    """Return visibility updates for the task-specific input widgets.

    The returned 5-tuple is ordered as: voice dropdown, instructions,
    ref_audio, ref_audio_url, ref_text. Unknown task types fall back to
    showing the voice dropdown and instructions only.
    """
    instructions_info = None
    if task_type == "CustomVoice":
        shown = (True, True, False, False, False)
        instructions_info = "Optional style/emotion instructions"
    elif task_type == "VoiceDesign":
        shown = (False, True, False, False, False)
        instructions_info = "Required: describe the voice style"
    elif task_type == "Base":
        shown = (False, False, True, True, True)
    else:
        shown = (True, True, False, False, False)

    show_voice, show_instructions, show_ref_audio, show_ref_url, show_ref_text = shown

    # Only the CustomVoice / VoiceDesign branches override the info text.
    instructions_kwargs = {"visible": show_instructions}
    if instructions_info is not None:
        instructions_kwargs["info"] = instructions_info

    return (
        gr.update(visible=show_voice),  # voice dropdown
        gr.update(**instructions_kwargs),  # instructions
        gr.update(visible=show_ref_audio),  # ref_audio
        gr.update(visible=show_ref_url),  # ref_audio_url
        gr.update(visible=show_ref_text),  # ref_text
    )
290+
291+
def on_stream_change(stream: bool):
    """Sync the format and speed controls with the streaming checkbox.

    With streaming on, the response format is forced to "pcm" and both
    controls become read-only; with streaming off, the format is reset to
    "wav" and both controls are editable again.

    Returns a 2-tuple of updates: (response_format, speed).
    """
    editable = not stream
    format_value = "wav" if editable else "pcm"
    return (
        gr.update(value=format_value, interactive=editable),
        gr.update(interactive=editable),
    )
188303
189304
@@ -235,19 +350,25 @@ def build_interface(api_base: str):
235350 # Instructions (CustomVoice optional, VoiceDesign required)
236351 instructions = gr .Textbox (
237352 label = "Instructions" ,
238- placeholder = ( "e.g., Speak with excitement / A warm, friendly female voice" ) ,
353+ placeholder = "e.g., Speak with excitement / A warm, friendly female voice" ,
239354 lines = 2 ,
240355 visible = True ,
241356 info = "Optional style/emotion instructions" ,
242357 )
243358
244359 # Base (voice clone) controls
245360 ref_audio = gr .Audio (
246- label = "Reference Audio (for voice cloning)" ,
361+ label = "Reference Audio (upload for voice cloning)" ,
247362 type = "numpy" ,
248363 sources = ["upload" , "microphone" ],
249364 visible = False ,
250365 )
366+ ref_audio_url = gr .Textbox (
367+ label = "Reference Audio URL" ,
368+ placeholder = "https://example.com/reference.wav (alternative to uploading)" ,
369+ lines = 1 ,
370+ visible = False ,
371+ )
251372 ref_text = gr .Textbox (
252373 label = "Reference Audio Transcript" ,
253374 placeholder = "Transcript of the reference audio (optional, improves quality)" ,
@@ -270,6 +391,12 @@ def build_interface(api_base: str):
270391 label = "Speed" ,
271392 scale = 1 ,
272393 )
394+ stream_checkbox = gr .Checkbox (
395+ label = "Stream output" ,
396+ value = False ,
397+ info = "Enable streaming (uses PCM format, speed control disabled)" ,
398+ scale = 1 ,
399+ )
273400
274401 generate_btn = gr .Button (
275402 "Generate Speech" ,
@@ -283,37 +410,53 @@ def build_interface(api_base: str):
283410 audio_output = gr .Audio (
284411 label = "Generated Audio" ,
285412 interactive = False ,
413+ streaming = True ,
286414 )
287415 gr .Markdown (
288416 "### Task Types\n "
289417 "- **CustomVoice**: Use a predefined speaker "
290418 "(Vivian, Ryan, etc.) with optional style instructions\n "
291419 "- **VoiceDesign**: Describe the desired voice in natural "
292420 "language (instructions required)\n "
293- "- **Base**: Clone a voice from reference audio"
421+ "- **Base**: Clone a voice from reference audio "
422+ "(upload a file or provide a URL)"
294423 )
295424
296425 # Dynamic UI updates
297426 task_type .change (
298427 fn = on_task_type_change ,
299428 inputs = [task_type ],
300- outputs = [voice , instructions , ref_audio , ref_text ],
429+ outputs = [voice , instructions , ref_audio , ref_audio_url , ref_text ],
430+ )
431+
432+ stream_checkbox .change (
433+ fn = on_stream_change ,
434+ inputs = [stream_checkbox ],
435+ outputs = [response_format , speed ],
301436 )
302437
303- # Generate button
438+ all_inputs = [
439+ text_input ,
440+ task_type ,
441+ voice ,
442+ language ,
443+ instructions ,
444+ ref_audio ,
445+ ref_audio_url ,
446+ ref_text ,
447+ response_format ,
448+ speed ,
449+ ]
450+
451+ def dispatch (stream_enabled , * args ):
452+ if stream_enabled :
453+ yield from generate_speech_stream (api_base , * args )
454+ else :
455+ yield generate_speech (api_base , * args )
456+
304457 generate_btn .click (
305- fn = lambda * args : generate_speech (api_base , * args ),
306- inputs = [
307- text_input ,
308- task_type ,
309- voice ,
310- language ,
311- instructions ,
312- ref_audio ,
313- ref_text ,
314- response_format ,
315- speed ,
316- ],
458+ fn = dispatch ,
459+ inputs = [stream_checkbox ] + all_inputs ,
317460 outputs = [audio_output ],
318461 )
319462
0 commit comments