@@ -478,15 +478,28 @@ def start_vc(self):
             inp_q,
             opt_q,
             device,
-            self.rvc if hasattr(self, "rvc") else None
+            self.rvc if hasattr(self, "rvc") else None,
         )
         self.config.samplerate = self.rvc.tgt_sr
         self.zc = self.rvc.tgt_sr // 100
-        self.block_frame = int(np.round(self.config.block_time * self.config.samplerate / self.zc)) * self.zc
+        self.block_frame = (
+            int(np.round(self.config.block_time * self.config.samplerate / self.zc))
+            * self.zc
+        )
         self.block_frame_16k = 160 * self.block_frame // self.zc
-        self.crossfade_frame = int(np.round(self.config.crossfade_time * self.config.samplerate / self.zc)) * self.zc
+        self.crossfade_frame = (
+            int(
+                np.round(
+                    self.config.crossfade_time * self.config.samplerate / self.zc
+                )
+            )
+            * self.zc
+        )
         self.sola_search_frame = self.zc
-        self.extra_frame = int(np.round(self.config.extra_time * self.config.samplerate / self.zc)) * self.zc
+        self.extra_frame = (
+            int(np.round(self.config.extra_time * self.config.samplerate / self.zc))
+            * self.zc
+        )
         self.input_wav: torch.Tensor = torch.zeros(
             self.extra_frame
             + self.crossfade_frame
@@ -495,7 +508,11 @@ def start_vc(self):
             device=device,
             dtype=torch.float32,
         )
-        self.input_wav_res: torch.Tensor = torch.zeros(160 * self.input_wav.shape[0] // self.zc, device=device,dtype=torch.float32)
+        self.input_wav_res: torch.Tensor = torch.zeros(
+            160 * self.input_wav.shape[0] // self.zc,
+            device=device,
+            dtype=torch.float32,
+        )
         self.pitch: np.ndarray = np.zeros(
             self.input_wav.shape[0] // self.zc,
             dtype="int32",
@@ -509,7 +526,9 @@ def start_vc(self):
         )
         self.nr_buffer: torch.Tensor = self.sola_buffer.clone()
         self.output_buffer: torch.Tensor = self.input_wav.clone()
-        self.res_buffer: torch.Tensor = torch.zeros(2 * self.zc, device=device,dtype=torch.float32)
+        self.res_buffer: torch.Tensor = torch.zeros(
+            2 * self.zc, device=device, dtype=torch.float32
+        )
         self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0]
         self.fade_in_window: torch.Tensor = (
             torch.sin(
@@ -529,7 +548,9 @@ def start_vc(self):
         self.resampler = tat.Resample(
             orig_freq=self.config.samplerate, new_freq=16000, dtype=torch.float32
         ).to(device)
-        self.tg = TorchGate(sr=self.config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9).to(device)
+        self.tg = TorchGate(
+            sr=self.config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9
+        ).to(device)
         thread_vc = threading.Thread(target=self.soundinput)
         thread_vc.start()
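For reference, every frame size set up in the hunks above is rounded to a whole multiple of self.zc (one centisecond of audio at the model's target sample rate), which keeps the 16 kHz mirror buffer aligned at 160 samples per centisecond. A minimal standalone sketch of that arithmetic, using assumed example values in place of the GUI's config object:

import numpy as np

# Assumed example values; the real ones come from the GUI config.
tgt_sr = 40000          # model target sample rate
block_time = 0.25       # seconds per audio callback block
crossfade_time = 0.05   # seconds of SOLA crossfade
extra_time = 2.5        # seconds of extra left context

zc = tgt_sr // 100      # samples per centisecond (400 here)

def to_zc_multiple(seconds: float) -> int:
    # Round a duration to the nearest whole number of centisecond frames,
    # then convert back to samples so every buffer length divides by zc.
    return int(np.round(seconds * tgt_sr / zc)) * zc

block_frame = to_zc_multiple(block_time)
crossfade_frame = to_zc_multiple(crossfade_time)
extra_frame = to_zc_multiple(extra_time)

# The 16 kHz shadow buffer advances 160 samples per centisecond frame.
block_frame_16k = 160 * block_frame // zc

print(block_frame, crossfade_frame, extra_frame, block_frame_16k)
# -> 10000 2000 100000 4000 with the example values above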
@@ -560,36 +581,52 @@ def audio_callback(
         indata = librosa.to_mono(indata.T)
         if self.config.threhold > -60:
             rms = librosa.feature.rms(
-                y=indata, frame_length=4 * self.zc, hop_length=self.zc
+                y=indata, frame_length=4 * self.zc, hop_length=self.zc
             )
             db_threhold = (
                 librosa.amplitude_to_db(rms, ref=1.0)[0] < self.config.threhold
             )
             for i in range(db_threhold.shape[0]):
                 if db_threhold[i]:
                     indata[i * self.zc : (i + 1) * self.zc] = 0
-        self.input_wav[: -self.block_frame] = self.input_wav[self.block_frame:].clone()
-        self.input_wav[-self.block_frame: ] = torch.from_numpy(indata).to(device)
-        self.input_wav_res[ : -self.block_frame_16k] = self.input_wav_res[self.block_frame_16k:].clone()
+        self.input_wav[: -self.block_frame] = self.input_wav[
+            self.block_frame :
+        ].clone()
+        self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to(device)
+        self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[
+            self.block_frame_16k :
+        ].clone()
         # input noise reduction and resampling
         if self.config.I_noise_reduce:
-            input_wav = self.input_wav[-self.crossfade_frame - self.block_frame - 2 * self.zc: ]
-            input_wav = self.tg(input_wav.unsqueeze(0), self.input_wav.unsqueeze(0))[0, 2 * self.zc:]
+            input_wav = self.input_wav[
+                -self.crossfade_frame - self.block_frame - 2 * self.zc :
+            ]
+            input_wav = self.tg(
+                input_wav.unsqueeze(0), self.input_wav.unsqueeze(0)
+            )[0, 2 * self.zc :]
             input_wav[: self.crossfade_frame] *= self.fade_in_window
-            input_wav[: self.crossfade_frame] += self.nr_buffer * self.fade_out_window
-            self.nr_buffer[:] = input_wav[-self.crossfade_frame: ]
-            input_wav = torch.cat((self.res_buffer[:], input_wav[: self.block_frame]))
-            self.res_buffer[:] = input_wav[-2 * self.zc: ]
-            self.input_wav_res[-self.block_frame_16k - 160: ] = self.resampler(input_wav)[160: ]
+            input_wav[: self.crossfade_frame] += (
+                self.nr_buffer * self.fade_out_window
+            )
+            self.nr_buffer[:] = input_wav[-self.crossfade_frame :]
+            input_wav = torch.cat(
+                (self.res_buffer[:], input_wav[: self.block_frame])
+            )
+            self.res_buffer[:] = input_wav[-2 * self.zc :]
+            self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler(
+                input_wav
+            )[160:]
         else:
-            self.input_wav_res[-self.block_frame_16k - 160: ] = self.resampler(self.input_wav[-self.block_frame - 2 * self.zc: ])[160: ]
+            self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler(
+                self.input_wav[-self.block_frame - 2 * self.zc :]
+            )[160:]
         # infer
         f0_extractor_frame = self.block_frame_16k + 800
-        if self.config.f0method == 'rmvpe':
+        if self.config.f0method == "rmvpe":
             f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1)
         infer_wav = self.rvc.infer(
             self.input_wav_res,
-            self.input_wav_res[-f0_extractor_frame:].cpu().numpy(),
+            self.input_wav_res[-f0_extractor_frame:].cpu().numpy(),
             self.block_frame_16k,
             self.valid_rate,
             self.pitch,
@@ -601,48 +638,77 @@ def audio_callback(
         ]
         # output noise reduction
         if self.config.O_noise_reduce:
-            self.output_buffer[: -self.block_frame] = self.output_buffer[self.block_frame:].clone()
-            self.output_buffer[-self.block_frame: ] = infer_wav[-self.block_frame:]
-            infer_wav = self.tg(infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)).squeeze(0)
+            self.output_buffer[: -self.block_frame] = self.output_buffer[
+                self.block_frame :
+            ].clone()
+            self.output_buffer[-self.block_frame :] = infer_wav[-self.block_frame :]
+            infer_wav = self.tg(
+                infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)
+            ).squeeze(0)
         # volume envelop mixing
         if self.config.rms_mix_rate < 1:
             rms1 = librosa.feature.rms(
-                y=self.input_wav_res[-160 * infer_wav.shape[0]// self.zc:].cpu().numpy(),
-                frame_length=640,
-                hop_length=160,
+                y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :]
+                .cpu()
+                .numpy(),
+                frame_length=640,
+                hop_length=160,
             )
             rms1 = torch.from_numpy(rms1).to(device)
             rms1 = F.interpolate(
-                rms1.unsqueeze(0), size=infer_wav.shape[0] + 1, mode="linear",align_corners=True,
-            )[0,0,:-1]
+                rms1.unsqueeze(0),
+                size=infer_wav.shape[0] + 1,
+                mode="linear",
+                align_corners=True,
+            )[0, 0, :-1]
             rms2 = librosa.feature.rms(
-                y=infer_wav[:].cpu().numpy(), frame_length=4 * self.zc, hop_length=self.zc
+                y=infer_wav[:].cpu().numpy(),
+                frame_length=4 * self.zc,
+                hop_length=self.zc,
             )
             rms2 = torch.from_numpy(rms2).to(device)
             rms2 = F.interpolate(
-                rms2.unsqueeze(0), size=infer_wav.shape[0] + 1, mode="linear",align_corners=True,
-            )[0,0,:-1]
+                rms2.unsqueeze(0),
+                size=infer_wav.shape[0] + 1,
+                mode="linear",
+                align_corners=True,
+            )[0, 0, :-1]
             rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3)
-            infer_wav *= torch.pow(rms1 / rms2, torch.tensor(1 - self.config.rms_mix_rate))
+            infer_wav *= torch.pow(
+                rms1 / rms2, torch.tensor(1 - self.config.rms_mix_rate)
+            )
         # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
-        conv_input = infer_wav[None, None, : self.crossfade_frame + self.sola_search_frame]
+        conv_input = infer_wav[
+            None, None, : self.crossfade_frame + self.sola_search_frame
+        ]
         cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
         cor_den = torch.sqrt(
-            F.conv1d(conv_input ** 2, torch.ones(1, 1, self.crossfade_frame, device=device)) + 1e-8)
+            F.conv1d(
+                conv_input**2,
+                torch.ones(1, 1, self.crossfade_frame, device=device),
+            )
+            + 1e-8
+        )
         if sys.platform == "darwin":
             _, sola_offset = torch.max(cor_nom[0, 0] / cor_den[0, 0])
             sola_offset = sola_offset.item()
         else:
             sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
         logger.debug("sola_offset = %d", int(sola_offset))
-        infer_wav = infer_wav[sola_offset : sola_offset + self.block_frame + self.crossfade_frame]
+        infer_wav = infer_wav[
+            sola_offset : sola_offset + self.block_frame + self.crossfade_frame
+        ]
         infer_wav[: self.crossfade_frame] *= self.fade_in_window
-        infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window
-        self.sola_buffer[:] = infer_wav[-self.crossfade_frame:]
+        infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window
+        self.sola_buffer[:] = infer_wav[-self.crossfade_frame :]
         if sys.platform == "darwin":
-            outdata[:] = infer_wav[:-self.crossfade_frame].cpu().numpy()[:, np.newaxis]
+            outdata[:] = (
+                infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis]
+            )
         else:
-            outdata[:] = infer_wav[:-self.crossfade_frame].repeat(2, 1).t().cpu().numpy()
+            outdata[:] = (
+                infer_wav[: -self.crossfade_frame].repeat(2, 1).t().cpu().numpy()
+            )
         total_time = time.perf_counter() - start_time
         self.window["infer_time"].update(int(total_time * 1000))
         logger.info("Infer time: %.2f", total_time)
@@ -698,9 +764,7 @@ def set_devices(self, input_device, output_device):
         sd.default.device[1] = output_device_indices[
             output_devices.index(output_device)
         ]
-        logger.info(
-            "Input device: %s:%s", str(sd.default.device[0]), input_device
-        )
+        logger.info("Input device: %s:%s", str(sd.default.device[0]), input_device)
         logger.info(
             "Output device: %s:%s", str(sd.default.device[1]), output_device
         )
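Likewise, the volume envelope mixing in the audio_callback hunk rescales the converted audio so its loudness contour follows the input: frame-wise RMS of the 16 kHz input and of the model output are interpolated back to per-sample resolution and blended with rms_mix_rate controlling the exponent weight. A rough standalone sketch under assumed sizes, not the GUI's actual buffers:

import numpy as np
import torch
import torch.nn.functional as F
import librosa

# Assumed stand-ins: zc samples per centisecond at the output rate, a 16 kHz
# copy of the input, and the converted output block from the model.
zc = 400
rms_mix_rate = 0.25
infer_wav = torch.randn(12000)
input_wav_16k = np.random.randn(160 * infer_wav.shape[0] // zc).astype(np.float32)

def frame_rms_to_samples(y, frame_length, hop_length, n_samples):
    # librosa gives one RMS value per hop; stretch it back to per-sample scale.
    rms = torch.from_numpy(
        librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)
    )
    rms = F.interpolate(
        rms.unsqueeze(0), size=n_samples + 1, mode="linear", align_corners=True
    )
    return rms[0, 0, :-1]

rms1 = frame_rms_to_samples(input_wav_16k, 640, 160, infer_wav.shape[0])        # input envelope
rms2 = frame_rms_to_samples(infer_wav.numpy(), 4 * zc, zc, infer_wav.shape[0])  # output envelope
rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3)  # avoid division by ~0

# rms_mix_rate = 1 keeps the model's own dynamics; 0 copies the input's.
mixed = infer_wav * torch.pow(rms1 / rms2, torch.tensor(1 - rms_mix_rate))
print(mixed.shape)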