7
7
import noisereduce as nr
8
8
import numpy as np
9
9
from fairseq import checkpoint_utils
10
- import librosa , torch , parselmouth , faiss , time , threading
10
+ import librosa , torch , pyworld , faiss , time , threading
11
11
import torch .nn .functional as F
12
12
import torchaudio .transforms as tat
13
+ import scipy .signal as signal
13
14
14
15
# import matplotlib.pyplot as plt
15
16
from infer_pack .models import SynthesizerTrnMs256NSFsid , SynthesizerTrnMs256NSFsid_nono
@@ -26,71 +27,82 @@ def __init__(
26
27
"""
27
28
初始化
28
29
"""
29
- self .f0_up_key = key
30
- self .time_step = 160 / 16000 * 1000
31
- self .f0_min = 50
32
- self .f0_max = 1100
33
- self .f0_mel_min = 1127 * np .log (1 + self .f0_min / 700 )
34
- self .f0_mel_max = 1127 * np .log (1 + self .f0_max / 700 )
35
- if index_rate != 0 :
36
- self .index = faiss .read_index (index_path )
37
- self .big_npy = np .load (npy_path )
38
- print ("index search enabled" )
39
- self .index_rate = index_rate
40
- model_path = hubert_path
41
- print ("load model(s) from {}" .format (model_path ))
42
- models , saved_cfg , task = checkpoint_utils .load_model_ensemble_and_task (
43
- [model_path ],
44
- suffix = "" ,
30
+ try :
31
+ self .f0_up_key = key
32
+ self .time_step = 160 / 16000 * 1000
33
+ self .f0_min = 50
34
+ self .f0_max = 1100
35
+ self .f0_mel_min = 1127 * np .log (1 + self .f0_min / 700 )
36
+ self .f0_mel_max = 1127 * np .log (1 + self .f0_max / 700 )
37
+ self .sr = 16000
38
+ self .window = 160
39
+ if index_rate != 0 :
40
+ self .index = faiss .read_index (index_path )
41
+ self .big_npy = np .load (npy_path )
42
+ print ("index search enabled" )
43
+ self .index_rate = index_rate
44
+ model_path = hubert_path
45
+ print ("load model(s) from {}" .format (model_path ))
46
+ models , saved_cfg , task = checkpoint_utils .load_model_ensemble_and_task (
47
+ [model_path ],
48
+ suffix = "" ,
49
+ )
50
+ self .model = models [0 ]
51
+ self .model = self .model .to (device )
52
+ self .model = self .model .half ()
53
+ self .model .eval ()
54
+ cpt = torch .load (pth_path , map_location = "cpu" )
55
+ tgt_sr = cpt ["config" ][- 1 ]
56
+ cpt ["config" ][- 3 ] = cpt ["weight" ]["emb_g.weight" ].shape [0 ] # n_spk
57
+ if_f0 = cpt .get ("f0" , 1 )
58
+ if if_f0 == 1 :
59
+ self .net_g = SynthesizerTrnMs256NSFsid (* cpt ["config" ], is_half = True )
60
+ else :
61
+ self .net_g = SynthesizerTrnMs256NSFsid_nono (* cpt ["config" ])
62
+ del self .net_g .enc_q
63
+ print (self .net_g .load_state_dict (cpt ["weight" ], strict = False ))
64
+ self .net_g .eval ().to (device )
65
+ self .net_g .half ()
66
+ except Exception as e :
67
+ print (e )
68
+
69
def get_f0(self, x, f0_up_key, inp_f0=None):
    """Estimate the pitch contour of ``x`` and quantise it to coarse bins.

    The contour is extracted with ``pyworld.harvest`` (10 ms frame period),
    refined with ``pyworld.stonemask``, smoothed with a 3-point median
    filter and then transposed by ``f0_up_key`` semitones.

    Args:
        x: 1-D NumPy array of audio samples; it is converted to float64
            before analysis and is analysed at ``self.sr`` Hz.
        f0_up_key: Transposition in semitones (the contour is multiplied
            by ``2 ** (f0_up_key / 12)``).
        inp_f0: Optional 2-D array whose column 0 holds time stamps and
            column 1 holds f0 values. When given, it is resampled onto the
            frame grid and written over the extracted contour starting
            ``x_pad`` seconds' worth of frames in.
            NOTE(review): column 0 is presumably in seconds (it is scaled
            by 100 to obtain frame positions) -- confirm against callers.

    Returns:
        Tuple ``(f0_coarse, f0bak)``: ``f0_coarse`` is an integer array with
        values in ``[1, 255]`` obtained by mapping f0 onto a mel scale
        between 50 Hz and 1100 Hz; ``f0bak`` is the float contour (in the
        same units as returned by pyworld) before quantisation.
    """
    x_pad = 1
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    # Convert once; both pyworld calls below need the float64 signal.
    wav = x.astype(np.double)
    f0, t = pyworld.harvest(
        wav,
        fs=self.sr,
        f0_ceil=f0_max,
        f0_floor=f0_min,
        frame_period=10,
    )
    f0 = pyworld.stonemask(wav, f0, t, self.sr)
    f0 = signal.medfilt(f0, 3)
    f0 *= pow(2, f0_up_key / 12)

    tf0 = self.sr // self.window  # number of f0 points per second
    if inp_f0 is not None:
        # Plain Python int: the previous int16 cast wrapped around for
        # curves spanning more than 32767 frames.
        delta_t = int(
            np.round((inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1)
        )
        replace_f0 = np.interp(
            np.arange(delta_t), inp_f0[:, 0] * 100, inp_f0[:, 1]
        )
        start = x_pad * tf0
        stop = start + len(replace_f0)
        # The target slice may be shorter than replace_f0 when the curve
        # runs past the end of the extracted contour; truncate to fit.
        shape = f0[start:stop].shape[0]
        f0[start:stop] = replace_f0[:shape]

    f0bak = f0.copy()
    f0_mel = 1127 * np.log(1 + f0 / 700)
    voiced = f0_mel > 0
    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the exact
    # type that alias used to refer to.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0bak
94
106
95
107
def infer (self , feats : torch .Tensor ) -> np .ndarray :
96
108
"""
@@ -127,7 +139,7 @@ def infer(self, feats: torch.Tensor) -> np.ndarray:
127
139
# p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
128
140
p_len = min (feats .shape [1 ], 12000 ) #
129
141
print (feats .shape )
130
- pitch , pitchf = self .get_f0 (audio , p_len , self .f0_up_key )
142
+ pitch , pitchf = self .get_f0 (audio , self .f0_up_key )
131
143
p_len = min (feats .shape [1 ], 12000 , pitch .shape [0 ]) # 太大了爆显存
132
144
torch .cuda .synchronize ()
133
145
# print(feats.shape,pitch.shape)
@@ -365,7 +377,7 @@ def start_vc(self):
365
377
self .config .pth_path ,
366
378
self .config .index_path ,
367
379
self .config .npy_path ,
368
- self .config .index_rate ,
380
+ self .config .index_rate
369
381
)
370
382
self .input_wav : np .ndarray = np .zeros (
371
383
self .extra_frame
@@ -487,8 +499,9 @@ def audio_callback(
487
499
else :
488
500
outdata [:] = self .output_wav [:].repeat (2 , 1 ).t ().cpu ().numpy ()
489
501
total_time = time .perf_counter () - start_time
490
- print ("infer time:" + str (total_time ))
491
502
self .window ["infer_time" ].update (int (total_time * 1000 ))
503
+ print ("infer time:" + str (total_time ))
504
+
492
505
493
506
def get_devices (self , update : bool = True ):
494
507
"""获取设备列表"""
0 commit comments