@@ -216,7 +216,7 @@ async fn gsv_stable_tts(
216216 text : & str ,
217217 tts_resp_tx : & TTSResponseTx ,
218218) -> anyhow:: Result < ( ) > {
219- let bytes = retry_gsv_tts (
219+ let wav_data = retry_gsv_tts (
220220 client,
221221 & tts. url ,
222222 & tts. speaker ,
@@ -227,7 +227,7 @@ async fn gsv_stable_tts(
227227 )
228228 . await ?;
229229
230- tts_resp_tx . send ( bytes . to_vec ( ) ) ?;
230+ send_wav ( tts_resp_tx , wav_data ) . await ?;
231231 Ok ( ( ) )
232232}
233233
@@ -317,17 +317,17 @@ async fn groq_tts(
317317 text : & str ,
318318 tts_resp_tx : & TTSResponseTx ,
319319) -> anyhow:: Result < ( ) > {
320- let bytes =
320+ let wav_data =
321321 crate :: ai:: tts:: groq ( client, & tts. url , & tts. model , & tts. api_key , & tts. voice , text) . await ?;
322322
323- tts_resp_tx . send ( bytes . to_vec ( ) ) ?;
323+ send_wav ( tts_resp_tx , wav_data ) . await ?;
324324 Ok ( ( ) )
325325}
326326
327327async fn fish_tts ( tts : & FishTTS , text : & str , tts_resp_tx : & TTSResponseTx ) -> anyhow:: Result < ( ) > {
328- let bytes = crate :: ai:: tts:: fish_tts ( & tts. api_key , & tts. speaker , text) . await ?;
328+ let wav_data = crate :: ai:: tts:: fish_tts ( & tts. api_key , & tts. speaker , text) . await ?;
329329
330- tts_resp_tx . send ( bytes . to_vec ( ) ) ?;
330+ send_wav ( tts_resp_tx , wav_data ) . await ?;
331331 Ok ( ( ) )
332332}
333333
@@ -378,3 +378,40 @@ async fn elevenlabs_tts(
378378
379379 Ok ( ( ) )
380380}
381+
382+ async fn send_wav ( tts_resp_tx : & TTSResponseTx , wav_data : Bytes ) -> anyhow:: Result < ( ) > {
383+ let mut reader = wav_io:: reader:: Reader :: from_vec ( wav_data. into ( ) )
384+ . map_err ( |e| anyhow:: anyhow!( "wav_io reader error: {e}" ) ) ?;
385+
386+ let header = reader. read_header ( ) ?;
387+ let mut samples = crate :: util:: get_samples_f32 ( & mut reader)
388+ . map_err ( |e| anyhow:: anyhow!( "get_samples_f32 error: {e}" ) ) ?;
389+
390+ let out_hz = 16000 ;
391+
392+ if header. sample_rate != out_hz {
393+ // resample to 16000
394+ log:: debug!( "resampling from {} to 16000" , header. sample_rate) ;
395+ samples = wav_io:: resample:: linear ( samples, header. channels , header. sample_rate , out_hz) ;
396+ }
397+ let audio_16k = wav_io:: convert_samples_f32_to_i16 ( & samples) ;
398+
399+ for chunk in audio_16k. chunks ( 5 * out_hz as usize / 10 ) {
400+ let buff = if cfg ! ( target_endian = "big" ) {
401+ let mut buff = Vec :: with_capacity ( chunk. len ( ) * 2 ) ;
402+ for i in chunk {
403+ buff. extend_from_slice ( & i. to_le_bytes ( ) ) ;
404+ }
405+ buff
406+ } else {
407+ let chunk_bytes =
408+ unsafe { std:: slice:: from_raw_parts ( chunk. as_ptr ( ) as * const u8 , chunk. len ( ) * 2 ) } ;
409+ chunk_bytes. to_vec ( )
410+ } ;
411+
412+ // std::mem::swap(&mut send_data, &mut buff);
413+ tts_resp_tx. send ( buff) ?;
414+ }
415+
416+ Ok ( ( ) )
417+ }
0 commit comments