1- import numpy as np
2- import parselmouth
3- import torch
4- import pdb
1+ import numpy as np , parselmouth , torch , pdb
52from time import time as ttime
63import torch .nn .functional as F
74from config import x_pad , x_query , x_center , x_max
85import scipy .signal as signal
9- import pyworld
10- import os
11- import traceback
12- import faiss
6+ import pyworld , os , traceback , faiss
137from scipy import signal
148
159bh , ah = signal .butter (N = 5 , Wn = 48 , btype = "high" , fs = 16000 )
@@ -70,8 +64,8 @@ def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
7064 replace_f0 = np .interp (
7165 list (range (delta_t )), inp_f0 [:, 0 ] * 100 , inp_f0 [:, 1 ]
7266 )
73- shape = f0 [x_pad * tf0 : x_pad * tf0 + len (replace_f0 )].shape [0 ]
74- f0 [x_pad * tf0 : x_pad * tf0 + len (replace_f0 )] = replace_f0 [:shape ]
67+ shape = f0 [x_pad * tf0 : x_pad * tf0 + len (replace_f0 )].shape [0 ]
68+ f0 [x_pad * tf0 : x_pad * tf0 + len (replace_f0 )] = replace_f0 [:shape ]
7569 # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
7670 f0bak = f0 .copy ()
7771 f0_mel = 1127 * np .log (1 + f0 / 700 )
@@ -105,8 +99,7 @@ def vc(
10599 feats = feats .mean (- 1 )
106100 assert feats .dim () == 1 , feats .dim ()
107101 feats = feats .view (1 , - 1 )
108- padding_mask = torch .BoolTensor (
109- feats .shape ).to (self .device ).fill_ (False )
102+ padding_mask = torch .BoolTensor (feats .shape ).to (self .device ).fill_ (False )
110103
111104 inputs = {
112105 "source" : feats .to (self .device ),
@@ -126,17 +119,23 @@ def vc(
126119 npy = feats [0 ].cpu ().numpy ()
127120 if self .is_half :
128121 npy = npy .astype ("float32" )
129- _ , I = index .search (npy , 1 )
130- npy = big_npy [I .squeeze ()]
122+
123+ # _, I = index.search(npy, 1)
124+ # npy = big_npy[I.squeeze()]
125+
126+ score , ix = index .search (npy , k = 8 )
127+ weight = np .square (1 / score )
128+ weight /= weight .sum (axis = 1 , keepdims = True )
129+ npy = np .sum (big_npy [ix ] * np .expand_dims (weight , axis = 2 ), axis = 1 )
130+
131131 if self .is_half :
132132 npy = npy .astype ("float16" )
133133 feats = (
134134 torch .from_numpy (npy ).unsqueeze (0 ).to (self .device ) * index_rate
135135 + (1 - index_rate ) * feats
136136 )
137137
138- feats = F .interpolate (feats .permute (0 , 2 , 1 ),
139- scale_factor = 2 ).permute (0 , 2 , 1 )
138+ feats = F .interpolate (feats .permute (0 , 2 , 1 ), scale_factor = 2 ).permute (0 , 2 , 1 )
140139 t1 = ttime ()
141140 p_len = audio0 .shape [0 ] // self .window
142141 if feats .shape [1 ] < p_len :
@@ -148,8 +147,7 @@ def vc(
148147 with torch .no_grad ():
149148 if pitch != None and pitchf != None :
150149 audio1 = (
151- (net_g .infer (feats , p_len , pitch ,
152- pitchf , sid )[0 ][0 , 0 ] * 32768 )
150+ (net_g .infer (feats , p_len , pitch , pitchf , sid )[0 ][0 , 0 ] * 32768 )
153151 .data .cpu ()
154152 .float ()
155153 .numpy ()
@@ -181,41 +179,41 @@ def pipeline(
181179 f0_up_key ,
182180 f0_method ,
183181 file_index ,
184- file_big_npy ,
182+ # file_big_npy,
185183 index_rate ,
186184 if_f0 ,
187185 f0_file = None ,
188186 ):
189187 if (
190- file_big_npy != ""
191- and file_index != ""
192- and os .path .exists (file_big_npy ) == True
188+ file_index != ""
189+ # and file_big_npy != ""
190+ # and os.path.exists(file_big_npy) == True
193191 and os .path .exists (file_index ) == True
194192 and index_rate != 0
195193 ):
196194 try :
197195 index = faiss .read_index (file_index )
198- big_npy = np .load (file_big_npy )
196+ # big_npy = np.load(file_big_npy)
197+ big_npy = index .reconstruct_n (0 , index .ntotal )
199198 except :
200199 traceback .print_exc ()
201200 index = big_npy = None
202201 else :
203202 index = big_npy = None
204203 audio = signal .filtfilt (bh , ah , audio )
205- audio_pad = np .pad (
206- audio , (self .window // 2 , self .window // 2 ), mode = "reflect" )
204+ audio_pad = np .pad (audio , (self .window // 2 , self .window // 2 ), mode = "reflect" )
207205 opt_ts = []
208206 if audio_pad .shape [0 ] > self .t_max :
209207 audio_sum = np .zeros_like (audio )
210208 for i in range (self .window ):
211- audio_sum += audio_pad [i : i - self .window ]
209+ audio_sum += audio_pad [i : i - self .window ]
212210 for t in range (self .t_center , audio .shape [0 ], self .t_center ):
213211 opt_ts .append (
214212 t
215213 - self .t_query
216214 + np .where (
217- np .abs (audio_sum [t - self .t_query : t + self .t_query ])
218- == np .abs (audio_sum [t - self .t_query : t + self .t_query ]).min ()
215+ np .abs (audio_sum [t - self .t_query : t + self .t_query ])
216+ == np .abs (audio_sum [t - self .t_query : t + self .t_query ]).min ()
219217 )[0 ][0 ]
220218 )
221219 s = 0
@@ -238,13 +236,11 @@ def pipeline(
238236 sid = torch .tensor (sid , device = self .device ).unsqueeze (0 ).long ()
239237 pitch , pitchf = None , None
240238 if if_f0 == 1 :
241- pitch , pitchf = self .get_f0 (
242- audio_pad , p_len , f0_up_key , f0_method , inp_f0 )
239+ pitch , pitchf = self .get_f0 (audio_pad , p_len , f0_up_key , f0_method , inp_f0 )
243240 pitch = pitch [:p_len ]
244241 pitchf = pitchf [:p_len ]
245242 pitch = torch .tensor (pitch , device = self .device ).unsqueeze (0 ).long ()
246- pitchf = torch .tensor (
247- pitchf , device = self .device ).unsqueeze (0 ).float ()
243+ pitchf = torch .tensor (pitchf , device = self .device ).unsqueeze (0 ).float ()
248244 t2 = ttime ()
249245 times [1 ] += t2 - t1
250246 for t in opt_ts :
@@ -255,31 +251,29 @@ def pipeline(
255251 model ,
256252 net_g ,
257253 sid ,
258- audio_pad [s : t + self .t_pad2 + self .window ],
259- pitch [:, s //
260- self .window : (t + self .t_pad2 ) // self .window ],
261- pitchf [:, s //
262- self .window : (t + self .t_pad2 ) // self .window ],
254+ audio_pad [s : t + self .t_pad2 + self .window ],
255+ pitch [:, s // self .window : (t + self .t_pad2 ) // self .window ],
256+ pitchf [:, s // self .window : (t + self .t_pad2 ) // self .window ],
263257 times ,
264258 index ,
265259 big_npy ,
266260 index_rate ,
267- )[self .t_pad_tgt : - self .t_pad_tgt ]
261+ )[self .t_pad_tgt : - self .t_pad_tgt ]
268262 )
269263 else :
270264 audio_opt .append (
271265 self .vc (
272266 model ,
273267 net_g ,
274268 sid ,
275- audio_pad [s : t + self .t_pad2 + self .window ],
269+ audio_pad [s : t + self .t_pad2 + self .window ],
276270 None ,
277271 None ,
278272 times ,
279273 index ,
280274 big_npy ,
281275 index_rate ,
282- )[self .t_pad_tgt : - self .t_pad_tgt ]
276+ )[self .t_pad_tgt : - self .t_pad_tgt ]
283277 )
284278 s = t
285279 if if_f0 == 1 :
@@ -289,13 +283,13 @@ def pipeline(
289283 net_g ,
290284 sid ,
291285 audio_pad [t :],
292- pitch [:, t // self .window :] if t is not None else pitch ,
293- pitchf [:, t // self .window :] if t is not None else pitchf ,
286+ pitch [:, t // self .window :] if t is not None else pitch ,
287+ pitchf [:, t // self .window :] if t is not None else pitchf ,
294288 times ,
295289 index ,
296290 big_npy ,
297291 index_rate ,
298- )[self .t_pad_tgt : - self .t_pad_tgt ]
292+ )[self .t_pad_tgt : - self .t_pad_tgt ]
299293 )
300294 else :
301295 audio_opt .append (
@@ -310,7 +304,7 @@ def pipeline(
310304 index ,
311305 big_npy ,
312306 index_rate ,
313- )[self .t_pad_tgt : - self .t_pad_tgt ]
307+ )[self .t_pad_tgt : - self .t_pad_tgt ]
314308 )
315309 audio_opt = np .concatenate (audio_opt )
316310 del pitch , pitchf , sid
0 commit comments