1- #api for 240604 release version by Xiaokai
1+ # API for the 240604 release version, by Xiaokai
22import os
33import sys
44import json
2525# Define FastAPI app
2626app = FastAPI ()
2727
28+
2829class GUIConfig :
2930 def __init__ (self ) -> None :
3031 self .pth_path : str = ""
@@ -46,6 +47,7 @@ def __init__(self) -> None:
4647 self .sg_input_device : str = ""
4748 self .sg_output_device : str = ""
4849
50+
4951class ConfigData (BaseModel ):
5052 pth_path : str
5153 index_path : str
@@ -65,6 +67,7 @@ class ConfigData(BaseModel):
6567 use_pv : bool = False
6668 f0method : str = "fcpe"
6769
70+
6871class Harvest (Process ):
6972 def __init__ (self , inp_q , opt_q ):
7073 super (Harvest , self ).__init__ ()
@@ -74,6 +77,7 @@ def __init__(self, inp_q, opt_q):
7477 def run (self ):
7578 import numpy as np
7679 import pyworld
80+
7781 while True :
7882 idx , x , res_f0 , n_cpu , ts = self .inp_q .get ()
7983 f0 , t = pyworld .harvest (
@@ -87,6 +91,7 @@ def run(self):
8791 if len (res_f0 .keys ()) >= n_cpu :
8892 self .opt_q .put (ts )
8993
94+
9095class AudioAPI :
9196 def __init__ (self ) -> None :
9297 self .gui_config = GUIConfig ()
@@ -110,15 +115,15 @@ def initialize_queues(self):
110115 def load (self ):
111116 input_devices , output_devices , _ , _ = self .get_devices ()
112117 try :
113- with open ("configs/config.json" , "r" , encoding = ' utf-8' ) as j :
118+ with open ("configs/config.json" , "r" , encoding = " utf-8" ) as j :
114119 data = json .load (j )
115120 if data ["sg_input_device" ] not in input_devices :
116121 data ["sg_input_device" ] = input_devices [sd .default .device [0 ]]
117122 if data ["sg_output_device" ] not in output_devices :
118123 data ["sg_output_device" ] = output_devices [sd .default .device [1 ]]
119124 except Exception as e :
120125 logger .error (f"Failed to load configuration: { e } " )
121- with open ("configs/config.json" , "w" , encoding = ' utf-8' ) as j :
126+ with open ("configs/config.json" , "w" , encoding = " utf-8" ) as j :
122127 data = {
123128 "pth_path" : "" ,
124129 "index_path" : "" ,
@@ -188,9 +193,7 @@ def start_vc(self):
188193 self .block_frame = (
189194 int (
190195 np .round (
191- self .gui_config .block_time
192- * self .gui_config .samplerate
193- / self .zc
196+ self .gui_config .block_time * self .gui_config .samplerate / self .zc
194197 )
195198 )
196199 * self .zc
@@ -211,9 +214,7 @@ def start_vc(self):
211214 self .extra_frame = (
212215 int (
213216 np .round (
214- self .gui_config .extra_time
215- * self .gui_config .samplerate
216- / self .zc
217+ self .gui_config .extra_time * self .gui_config .samplerate / self .zc
217218 )
218219 )
219220 * self .zc
@@ -292,12 +293,16 @@ def soundinput(self):
292293 logger .info ("Audio block passed." )
293294 logger .info ("Ending VC" )
294295
295- def audio_callback (self , indata : np .ndarray , outdata : np .ndarray , frames , times , status ):
296+ def audio_callback (
297+ self , indata : np .ndarray , outdata : np .ndarray , frames , times , status
298+ ):
296299 start_time = time .perf_counter ()
297300 indata = librosa .to_mono (indata .T )
298301 if self .gui_config .threhold > - 60 :
299302 indata = np .append (self .rms_buffer , indata )
300- rms = librosa .feature .rms (y = indata , frame_length = 4 * self .zc , hop_length = self .zc )[:, 2 :]
303+ rms = librosa .feature .rms (
304+ y = indata , frame_length = 4 * self .zc , hop_length = self .zc
305+ )[:, 2 :]
301306 self .rms_buffer [:] = indata [- 4 * self .zc :]
302307 indata = indata [2 * self .zc - self .zc // 2 :]
303308 db_threhold = (
@@ -308,13 +313,21 @@ def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times,
308313 indata [i * self .zc : (i + 1 ) * self .zc ] = 0
309314 indata = indata [self .zc // 2 :]
310315 self .input_wav [: - self .block_frame ] = self .input_wav [self .block_frame :].clone ()
311- self .input_wav [- indata .shape [0 ] :] = torch .from_numpy (indata ).to (self .config .device )
312- self .input_wav_res [: - self .block_frame_16k ] = self .input_wav_res [self .block_frame_16k :].clone ()
316+ self .input_wav [- indata .shape [0 ] :] = torch .from_numpy (indata ).to (
317+ self .config .device
318+ )
319+ self .input_wav_res [: - self .block_frame_16k ] = self .input_wav_res [
320+ self .block_frame_16k :
321+ ].clone ()
313322 # input noise reduction and resampling
314323 if self .gui_config .I_noise_reduce :
315- self .input_wav_denoise [: - self .block_frame ] = self .input_wav_denoise [self .block_frame :].clone ()
324+ self .input_wav_denoise [: - self .block_frame ] = self .input_wav_denoise [
325+ self .block_frame :
326+ ].clone ()
316327 input_wav = self .input_wav [- self .sola_buffer_frame - self .block_frame :]
317- input_wav = self .tg (input_wav .unsqueeze (0 ), self .input_wav .unsqueeze (0 )).squeeze (0 )
328+ input_wav = self .tg (
329+ input_wav .unsqueeze (0 ), self .input_wav .unsqueeze (0 )
330+ ).squeeze (0 )
318331 input_wav [: self .sola_buffer_frame ] *= self .fade_in_window
319332 input_wav [: self .sola_buffer_frame ] += self .nr_buffer * self .fade_out_window
320333 self .input_wav_denoise [- self .block_frame :] = input_wav [: self .block_frame ]
@@ -343,9 +356,13 @@ def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times,
343356 infer_wav = self .input_wav [self .extra_frame :].clone ()
344357 # output noise reduction
345358 if self .gui_config .O_noise_reduce and self .function == "vc" :
346- self .output_buffer [: - self .block_frame ] = self .output_buffer [self .block_frame :].clone ()
359+ self .output_buffer [: - self .block_frame ] = self .output_buffer [
360+ self .block_frame :
361+ ].clone ()
347362 self .output_buffer [- self .block_frame :] = infer_wav [- self .block_frame :]
348- infer_wav = self .tg (infer_wav .unsqueeze (0 ), self .output_buffer .unsqueeze (0 )).squeeze (0 )
363+ infer_wav = self .tg (
364+ infer_wav .unsqueeze (0 ), self .output_buffer .unsqueeze (0 )
365+ ).squeeze (0 )
349366 # volume envelope mixing
350367 if self .gui_config .rms_mix_rate < 1 and self .function == "vc" :
351368 if self .gui_config .I_noise_reduce :
@@ -381,7 +398,9 @@ def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times,
381398 rms1 / rms2 , torch .tensor (1 - self .gui_config .rms_mix_rate )
382399 )
383400 # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
384- conv_input = infer_wav [None , None , : self .sola_buffer_frame + self .sola_search_frame ]
401+ conv_input = infer_wav [
402+ None , None , : self .sola_buffer_frame + self .sola_search_frame
403+ ]
385404 cor_nom = F .conv1d (conv_input , self .sola_buffer [None , None , :])
386405 cor_den = torch .sqrt (
387406 F .conv1d (
@@ -399,7 +418,9 @@ def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times,
399418 infer_wav = infer_wav [sola_offset :]
400419 if "privateuseone" in str (self .config .device ) or not self .gui_config .use_pv :
401420 infer_wav [: self .sola_buffer_frame ] *= self .fade_in_window
402- infer_wav [: self .sola_buffer_frame ] += self .sola_buffer * self .fade_out_window
421+ infer_wav [: self .sola_buffer_frame ] += (
422+ self .sola_buffer * self .fade_out_window
423+ )
403424 else :
404425 infer_wav [: self .sola_buffer_frame ] = phase_vocoder (
405426 self .sola_buffer ,
@@ -466,20 +487,34 @@ def set_devices(self, input_device, output_device):
466487 logger .debug (f"Selected output device: { output_device } " )
467488
468489 if input_device not in input_devices :
469- logger .error (f"Input device '{ input_device } ' is not in the list of available devices" )
470- raise HTTPException (status_code = 400 , detail = f"Input device '{ input_device } ' is not available" )
471-
490+ logger .error (
491+ f"Input device '{ input_device } ' is not in the list of available devices"
492+ )
493+ raise HTTPException (
494+ status_code = 400 ,
495+ detail = f"Input device '{ input_device } ' is not available" ,
496+ )
497+
472498 if output_device not in output_devices :
473- logger .error (f"Output device '{ output_device } ' is not in the list of available devices" )
474- raise HTTPException (status_code = 400 , detail = f"Output device '{ output_device } ' is not available" )
499+ logger .error (
500+ f"Output device '{ output_device } ' is not in the list of available devices"
501+ )
502+ raise HTTPException (
503+ status_code = 400 ,
504+ detail = f"Output device '{ output_device } ' is not available" ,
505+ )
475506
476507 sd .default .device [0 ] = input_device_indices [input_devices .index (input_device )]
477- sd .default .device [1 ] = output_device_indices [output_devices .index (output_device )]
508+ sd .default .device [1 ] = output_device_indices [
509+ output_devices .index (output_device )
510+ ]
478511 logger .info (f"Input device set to { sd .default .device [0 ]} : { input_device } " )
479512 logger .info (f"Output device set to { sd .default .device [1 ]} : { output_device } " )
480513
514+
481515audio_api = AudioAPI ()
482516
517+
483518@app .get ("/inputDevices" , response_model = list )
484519def get_input_devices ():
485520 try :
@@ -489,6 +524,7 @@ def get_input_devices():
489524 logger .error (f"Failed to get input devices: { e } " )
490525 raise HTTPException (status_code = 500 , detail = "Failed to get input devices" )
491526
527+
492528@app .get ("/outputDevices" , response_model = list )
493529def get_output_devices ():
494530 try :
@@ -498,14 +534,15 @@ def get_output_devices():
498534 logger .error (f"Failed to get output devices: { e } " )
499535 raise HTTPException (status_code = 500 , detail = "Failed to get output devices" )
500536
537+
501538@app .post ("/config" )
502539def configure_audio (config_data : ConfigData ):
503540 try :
504541 logger .info (f"Configuring audio with data: { config_data } " )
505542 if audio_api .set_values (config_data ):
506543 settings = config_data .dict ()
507544 settings ["use_jit" ] = False
508- with open ("configs/config.json" , "w" , encoding = ' utf-8' ) as j :
545+ with open ("configs/config.json" , "w" , encoding = " utf-8" ) as j :
509546 json .dump (settings , j , ensure_ascii = False )
510547 logger .info ("Configuration set successfully" )
511548 return {"message" : "Configuration set successfully" }
@@ -516,6 +553,7 @@ def configure_audio(config_data: ConfigData):
516553 logger .error (f"Configuration failed: { e } " )
517554 raise HTTPException (status_code = 400 , detail = f"Configuration failed: { e } " )
518555
556+
519557@app .post ("/start" )
520558def start_conversion ():
521559 try :
@@ -524,14 +562,17 @@ def start_conversion():
524562 return {"message" : "Audio conversion started" }
525563 else :
526564 logger .warning ("Audio conversion already running" )
527- raise HTTPException (status_code = 400 , detail = "Audio conversion already running" )
565+ raise HTTPException (
566+ status_code = 400 , detail = "Audio conversion already running"
567+ )
528568 except HTTPException as e :
529569 logger .error (f"Start conversion error: { e .detail } " )
530570 raise
531571 except Exception as e :
532572 logger .error (f"Failed to start conversion: { e } " )
533573 raise HTTPException (status_code = 500 , detail = "Failed to start conversion: {e}" )
534574
575+
535576@app .post ("/stop" )
536577def stop_conversion ():
537578 try :
@@ -550,6 +591,7 @@ def stop_conversion():
550591 logger .error (f"Failed to stop conversion: { e } " )
551592 raise HTTPException (status_code = 500 , detail = "Failed to stop conversion: {e}" )
552593
594+
553595if __name__ == "__main__" :
554596 if sys .platform == "win32" :
555597 freeze_support ()
@@ -560,6 +602,7 @@ def stop_conversion():
560602 from tools .torchgate import TorchGate
561603 import tools .rvc_for_realtime as rvc_for_realtime
562604 from configs .config import Config
605+
563606 audio_api .config = Config ()
564607 audio_api .initialize_queues ()
565608 uvicorn .run (app , host = "0.0.0.0" , port = 6242 )
0 commit comments