14
14
import multiprocessing
15
15
16
16
logger = logging .getLogger (__name__ )
17
-
17
+ stream_latency = - 1
18
18
19
19
class Harvest (multiprocessing .Process ):
20
20
def __init__ (self , inp_q , opt_q ):
@@ -100,7 +100,8 @@ class GUI:
100
100
def __init__(self) -> None:
    """Set up the GUI's initial state and then start the launcher.

    Attributes initialised here:
        config: a fresh ``GUIConfig`` instance.
        flag_vc: ``False`` until a conversion is started; the event
            handler resets it to ``False`` on stop.
        function: the active mode key, ``'vc'`` by default; the event
            handler overwrites it with ``'vc'`` or ``'im'`` when the
            corresponding radio button fires.
        delay_time: the delay figure shown in the ``delay_time`` text
            element (the event handler multiplies it by 1000 for the
            display, so it is presumably in seconds -- confirm).
    """
    self.config = GUIConfig()

    # Plain state flags; all of them are mutated later by the event handler.
    self.delay_time = 0
    self.function = 'vc'
    self.flag_vc = False

    # Must run last: the attributes above have to exist before it starts.
    self.launcher()
105
106
106
107
def load (self ):
@@ -112,6 +113,10 @@ def load(self):
112
113
data ["harvest" ] = data ["f0method" ] == "harvest"
113
114
data ["crepe" ] = data ["f0method" ] == "crepe"
114
115
data ["rmvpe" ] = data ["f0method" ] == "rmvpe"
116
+ if data ["sg_input_device" ] not in input_devices :
117
+ data ["sg_input_device" ] = input_devices [sd .default .device [0 ]]
118
+ if data ["sg_output_device" ] not in output_devices :
119
+ data ["sg_output_device" ] = output_devices [sd .default .device [1 ]]
115
120
except :
116
121
with open ("configs/config.json" , "w" ) as j :
117
122
data = {
@@ -342,6 +347,22 @@ def launcher(self):
342
347
[
343
348
sg .Button (i18n ("开始音频转换" ), key = "start_vc" ),
344
349
sg .Button (i18n ("停止音频转换" ), key = "stop_vc" ),
350
+ sg .Radio (
351
+ i18n ("输入监听" ),
352
+ "function" ,
353
+ key = "im" ,
354
+ default = False ,
355
+ enable_events = True ,
356
+ ),
357
+ sg .Radio (
358
+ i18n ("输出变声" ),
359
+ "function" ,
360
+ key = "vc" ,
361
+ default = True ,
362
+ enable_events = True ,
363
+ ),
364
+ sg .Text (i18n ("算法延迟(ms):" )),
365
+ sg .Text ("0" , key = "delay_time" ),
345
366
sg .Text (i18n ("推理时间(ms):" )),
346
367
sg .Text ("0" , key = "infer_time" ),
347
368
],
@@ -403,9 +424,16 @@ def event_handler(self):
403
424
}
404
425
with open ("configs/config.json" , "w" ) as j :
405
426
json .dump (settings , j )
427
+ global stream_latency
428
+ while stream_latency < 0 :
429
+ time .sleep (0.01 )
430
+ self .delay_time = stream_latency + values ["block_time" ] + values ["crossfade_length" ] + 0.01
431
+ if values ["I_noise_reduce" ]:
432
+ self .delay_time += values ["crossfade_length" ]
433
+ self .window ["delay_time" ].update (int (self .delay_time * 1000 ))
406
434
if event == "stop_vc" and self .flag_vc == True :
407
435
self .flag_vc = False
408
-
436
+ stream_latency = - 1
409
437
# Parameter hot update
410
438
if event == "threhold" :
411
439
self .config .threhold = values ["threhold" ]
@@ -423,11 +451,17 @@ def event_handler(self):
423
451
self .config .f0method = event
424
452
elif event == "I_noise_reduce" :
425
453
self .config .I_noise_reduce = values ["I_noise_reduce" ]
454
+ if stream_latency > 0 :
455
+ self .delay_time += (1 if values ["I_noise_reduce" ] else - 1 ) * values ["crossfade_length" ]
456
+ self .window ["delay_time" ].update (int (self .delay_time * 1000 ))
426
457
elif event == "O_noise_reduce" :
427
458
self .config .O_noise_reduce = values ["O_noise_reduce" ]
459
+ elif event in ["vc" , "im" ]:
460
+ self .function = event
428
461
elif event != "start_vc" and self .flag_vc == True :
429
462
# Other parameters do not support hot update
430
463
self .flag_vc = False
464
+ stream_latency = - 1
431
465
432
466
def set_values (self , values ):
433
467
if len (values ["pth_path" ].strip ()) == 0 :
@@ -565,7 +599,9 @@ def soundinput(self):
565
599
blocksize = self .block_frame ,
566
600
samplerate = self .config .samplerate ,
567
601
dtype = "float32" ,
568
- ):
602
+ ) as stream :
603
+ global stream_latency
604
+ stream_latency = stream .latency [- 1 ]
569
605
while self .flag_vc :
570
606
time .sleep (self .config .block_time )
571
607
logger .debug ("Audio block passed." )
@@ -597,7 +633,7 @@ def audio_callback(
597
633
self .block_frame_16k :
598
634
].clone ()
599
635
# input noise reduction and resampling
600
- if self .config .I_noise_reduce :
636
+ if self .config .I_noise_reduce and self . function == 'vc' :
601
637
input_wav = self .input_wav [
602
638
- self .crossfade_frame - self .block_frame - 2 * self .zc :
603
639
]
@@ -621,23 +657,28 @@ def audio_callback(
621
657
self .input_wav [- self .block_frame - 2 * self .zc :]
622
658
)[160 :]
623
659
# infer
624
- f0_extractor_frame = self .block_frame_16k + 800
625
- if self .config .f0method == "rmvpe" :
626
- f0_extractor_frame = 5120 * ((f0_extractor_frame - 1 ) // 5120 + 1 ) - 160
627
- infer_wav = self .rvc .infer (
628
- self .input_wav_res ,
629
- self .input_wav_res [- f0_extractor_frame :].cpu ().numpy (),
630
- self .block_frame_16k ,
631
- self .valid_rate ,
632
- self .pitch ,
633
- self .pitchf ,
634
- self .config .f0method ,
635
- )
636
- infer_wav = infer_wav [
637
- - self .crossfade_frame - self .sola_search_frame - self .block_frame :
638
- ]
660
+ if self .function == 'vc' :
661
+ f0_extractor_frame = self .block_frame_16k + 800
662
+ if self .config .f0method == "rmvpe" :
663
+ f0_extractor_frame = 5120 * ((f0_extractor_frame - 1 ) // 5120 + 1 ) - 160
664
+ infer_wav = self .rvc .infer (
665
+ self .input_wav_res ,
666
+ self .input_wav_res [- f0_extractor_frame :].cpu ().numpy (),
667
+ self .block_frame_16k ,
668
+ self .valid_rate ,
669
+ self .pitch ,
670
+ self .pitchf ,
671
+ self .config .f0method ,
672
+ )
673
+ infer_wav = infer_wav [
674
+ - self .crossfade_frame - self .sola_search_frame - self .block_frame :
675
+ ]
676
+ else :
677
+ infer_wav = self .input_wav [
678
+ - self .crossfade_frame - self .sola_search_frame - self .block_frame :
679
+ ].clone ()
639
680
# output noise reduction
640
- if self .config .O_noise_reduce :
681
+ if ( self .config .O_noise_reduce and self . function == 'vc' ) or ( self . config . I_noise_reduce and self . function == 'im' ) :
641
682
self .output_buffer [: - self .block_frame ] = self .output_buffer [
642
683
self .block_frame :
643
684
].clone ()
@@ -646,7 +687,7 @@ def audio_callback(
646
687
infer_wav .unsqueeze (0 ), self .output_buffer .unsqueeze (0 )
647
688
).squeeze (0 )
648
689
# volume envelope mixing
649
- if self .config .rms_mix_rate < 1 :
690
+ if self .config .rms_mix_rate < 1 and self . function == 'vc' :
650
691
rms1 = librosa .feature .rms (
651
692
y = self .input_wav_res [- 160 * infer_wav .shape [0 ] // self .zc :]
652
693
.cpu ()
0 commit comments