RVC-Project
diff --git a/config.py b/config.py
Lines changed: 38 additions & 0 deletions
diff --git a/extract_f0_print.py b/extract_f0_print.py
Lines changed: 120 additions & 0 deletions
diff --git a/extract_feature_print.py b/extract_feature_print.py
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,38 @@
+############ Offline voice-conversion parameters
+inp_root=r"白鹭霜华长条"  # every audio file in this input directory is converted; keep non-audio files out
+opt_root=r"opt"  # output directory
+f0_up_key=0  # pitch shift, integer: male-to-female 12, female-to-male -12
+person=r"weights\洛天依v3.pt"  # currently only the 洛天依v3 weights are available
+############ Hardware parameters
+device = "cuda:0"  # "cuda:x" or "cpu"; x is the card index; only NVIDIA cards are accelerated
+is_half=True  # just use True on 9/10/20/30/40-series cards: no quality impact, speed-up on 20-series and newer
+n_cpu=0  # 0 (default) uses every thread; set a number to limit CPU usage
+############ Do not edit below this line
+import torch
+from multiprocessing import cpu_count
+
+if not torch.cuda.is_available():
+    print("没有发现支持的N卡，使用CPU进行推理")
+    device = "cpu"
+if device == "cpu":
+    # Half precision is only meant for GPU inference. Clear it whenever the
+    # CPU is used, including when the user selected "cpu" on a CUDA machine.
+    is_half = False
+else:
+    gpu_name = torch.cuda.get_device_name(int(device.split(":")[-1]))
+    # NOTE(review): this is a plain substring test, so any device name that
+    # happens to contain "16" or "MX" is forced to single precision as well.
+    if "16" in gpu_name or "MX" in gpu_name:
+        print("16系显卡/MX系显卡强制单精度")
+        is_half = False
+if n_cpu == 0:
+    n_cpu = cpu_count()
+if is_half:
+    # settings for 6 GB of VRAM
+    x_pad       =   3
+    x_query     =   10
+    x_center    =   60
+    x_max       =   65
+else:
+    # settings for 5 GB of VRAM (x_query/x_center/x_max alternative kept for reference)
+    x_pad       =   1
+    # x_query     =   6
+    # x_center    =   30
+    # x_max       =   32
+    # settings for 6 GB of VRAM
+    x_query     =   6
+    x_center    =   38
+    x_max       =   41
@@ -0,0 +1,120 @@
+import os,traceback,sys,parselmouth
+import librosa
+import pyworld
+from scipy.io import wavfile
+import numpy as np,logging
+logging.getLogger('numba').setLevel(logging.WARNING)
+from multiprocessing import Process
+
+# Experiment directory, taken from the first command-line argument.
+exp_dir = sys.argv[1]
+# Log file inside exp_dir, opened in append mode ("a+"); it stays open for the life of the script.
+f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
+def printt(strr):
+    """Echo ``strr`` to stdout and append it, newline-terminated, to the log file ``f``."""
+    print(strr)
+    log_line = "%s\n" % strr
+    f.write(log_line)
+    f.flush()  # push the line out to the file right away
+
+# Second command-line argument: how many worker processes the __main__ section starts.
+n_p = int(sys.argv[2])
+# Third command-line argument: F0 method name handed to compute_f0 ("pm", "harvest" or "dio").
+f0method = sys.argv[3]
+
+class FeatureInput(object):
+    """Extract F0 (pitch) tracks from audio files and quantise them to coarse bins.
+
+    ``samplerate`` is the rate audio is loaded at and ``hop_size`` the number of
+    samples per F0 frame. The quantisation range is fixed at 50-1100 Hz,
+    mapped on a mel scale onto the integer bins 1..255.
+    """
+
+    def __init__(self, samplerate=16000, hop_size=160):
+        self.fs = samplerate
+        self.hop = hop_size
+
+        self.f0_bin = 256
+        self.f0_max = 1100.0
+        self.f0_min = 50.0
+        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+
+    def compute_f0(self, path,f0_method):
+        """Load ``path`` at ``self.fs`` and return its F0 track as a NumPy array.
+
+        ``f0_method`` selects the estimator: "pm" (parselmouth), "harvest" or
+        "dio" (both pyworld, refined with stonemask).
+
+        Raises:
+            ValueError: if ``f0_method`` is not one of the three names above.
+        """
+        # `sr` is passed by keyword: recent librosa releases no longer accept it positionally.
+        x, sr = librosa.load(path, sr=self.fs)
+        p_len = x.shape[0] // self.hop
+        assert sr == self.fs
+        if f0_method == "pm":
+            f0 = parselmouth.Sound(x, sr).to_pitch_ac(
+                time_step=self.hop / sr, voicing_threshold=0.6,
+                pitch_floor=self.f0_min, pitch_ceiling=self.f0_max).selected_array['frequency']
+            # Zero-pad both ends so the track has p_len frames.
+            pad_size = (p_len - len(f0) + 1) // 2
+            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
+        elif f0_method in ("harvest", "dio"):
+            estimator = pyworld.harvest if f0_method == "harvest" else pyworld.dio
+            x_double = x.astype(np.double)  # converted once, reused by both pyworld calls
+            f0, t = estimator(
+                x_double,
+                fs=sr,
+                f0_ceil=self.f0_max,
+                frame_period=1000 * self.hop / sr,
+            )
+            f0 = pyworld.stonemask(x_double, f0, t, self.fs)
+        else:
+            # Fail with a clear message instead of an UnboundLocalError on `f0`.
+            raise ValueError("unknown f0 method: %s" % f0_method)
+        return f0
+
+    def coarse_f0(self, f0):
+        """Quantise an F0 array (Hz) to integer bins in [1, f0_bin - 1].
+
+        Frames with F0 of 0 end up in bin 1, as does anything at or below
+        ``f0_min``; values at or above ``f0_max`` are clipped to the top bin.
+        """
+        f0_mel = 1127 * np.log(1 + f0 / 700)
+        voiced = f0_mel > 0
+        f0_mel[voiced] = (f0_mel[voiced] - self.f0_mel_min) * (
+            self.f0_bin - 2
+        ) / (self.f0_mel_max - self.f0_mel_min) + 1
+
+        # clamp to the valid bin range
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1
+        # The `np.int` alias was removed from NumPy; builtin int is the same type.
+        f0_coarse = np.rint(f0_mel).astype(int)
+        assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+            f0_coarse.max(),
+            f0_coarse.min(),
+        )
+        return f0_coarse
+
+    def go(self,paths,f0_method):
+        """Process every ``(inp_path, opt_path1, opt_path2)`` triple in ``paths``.
+
+        The raw F0 track is saved to ``opt_path2`` and the coarse bins to
+        ``opt_path1`` with ``np.save``. Items whose two ``.npy`` outputs already
+        exist are skipped. A failure on one item is logged through ``printt``
+        and the loop carries on with the next item.
+        """
+        if len(paths) == 0:
+            printt("no-f0-todo")
+            return
+        printt("todo-f0-%s"%len(paths))
+        n = max(len(paths)//5,1)  # progress-log stride: aims for about five lines per process
+        for idx,(inp_path,opt_path1,opt_path2) in enumerate(paths):
+            try:
+                if idx % n == 0:
+                    printt("f0ing,now-%s,all-%s,-%s"%(idx,len(paths),inp_path))
+                if os.path.exists(opt_path1+".npy") and os.path.exists(opt_path2+".npy"):
+                    continue
+                featur_pit = self.compute_f0(inp_path,f0_method)
+                np.save(opt_path2,featur_pit,allow_pickle=False,)#nsf
+                coarse_pit = self.coarse_f0(featur_pit)
+                np.save(opt_path1,coarse_pit,allow_pickle=False,)#ori
+            except Exception:
+                # Keep going on per-file errors, but let KeyboardInterrupt/SystemExit through.
+                printt("f0fail-%s-%s-%s" % (idx, inp_path,traceback.format_exc()))
+
+if __name__=='__main__':
+    # Entry point: list the wavs under <exp_dir>/1_16k_wavs, split them
+    # round-robin across n_p worker processes and wait for all of them.
+    printt(sys.argv)
+    featureInput = FeatureInput()
+    paths=[]
+    inp_root= "%s/1_16k_wavs"%(exp_dir)
+    opt_root1="%s/2a_f0"%(exp_dir)
+    opt_root2="%s/2b-f0nsf"%(exp_dir)
+
+    os.makedirs(opt_root1,exist_ok=True)
+    os.makedirs(opt_root2,exist_ok=True)
+    for name in sorted(os.listdir(inp_root)):
+        # Test the file name only: matching against the full path would skip
+        # every file whenever exp_dir itself contains "spec".
+        if "spec" in name:
+            continue
+        inp_path="%s/%s"%(inp_root,name)
+        opt_path1="%s/%s"%(opt_root1,name)
+        opt_path2="%s/%s"%(opt_root2,name)
+        paths.append([inp_path,opt_path1,opt_path2])
+
+    ps=[]
+    for i in range(n_p):
+        # worker i handles items i, i+n_p, i+2*n_p, ...
+        p=Process(target=featureInput.go,args=(paths[i::n_p],f0method,))
+        p.start()
+        ps.append(p)
+    for p in ps:
+        p.join()
@@ -0,0 +1,84 @@
+import os,sys,traceback
+# Command-line arguments:
+#   1: n_part  - stride used when slicing the sorted file list
+#   2: i_part  - starting offset of this process within that slicing
+#   3: i_gpu   - value written to CUDA_VISIBLE_DEVICES
+#   4: exp_dir - experiment directory holding the input wavs, outputs and log
+n_part=int(sys.argv[1])
+i_part=int(sys.argv[2])
+i_gpu=sys.argv[3]
+exp_dir=sys.argv[4]
+# Set before torch is imported below; restricts which GPUs CUDA exposes to this process.
+os.environ["CUDA_VISIBLE_DEVICES"]=str(i_gpu)
+
+import torch
+import torch.nn.functional as F
+import soundfile as sf
+import numpy as np
+import joblib
+from fairseq import checkpoint_utils
+import pdb
+# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Log file inside exp_dir, opened in append mode ("a+"); it stays open for the life of the script.
+f = open("%s/extract_f0_feature.log"%exp_dir, "a+")
+def printt(strr):
+    """Echo ``strr`` to stdout and append it, newline-terminated, to the log file ``f``."""
+    print(strr)
+    log_line = "%s\n" % strr
+    f.write(log_line)
+    f.flush()  # push the line out to the file right away
+# Record the invocation in the log.
+printt(sys.argv)
+# model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/speech/pretrain/ContentVec_legacy500.pt"
+# Checkpoint path, relative to the current working directory.
+model_path = "hubert_base.pt"
+
+printt(exp_dir)
+# Input wavs are read from <exp_dir>/1_16k_wavs; features are written to <exp_dir>/3_feature256.
+wavPath = "%s/1_16k_wavs"%exp_dir
+outPath = "%s/3_feature256"%exp_dir
+os.makedirs(outPath,exist_ok=True)
+# wave must be 16k, hop_size=320
+def readwave(wav_path, normalize=False):
+    """Read a 16 kHz wav file and return it as a float tensor of shape (1, n_samples).
+
+    Two-dimensional (multi-channel) audio is averaged over its last axis. When
+    ``normalize`` is true the waveform is layer-normalised over its full
+    length. Asserts that the file's sample rate is 16000.
+    """
+    samples, rate = sf.read(wav_path)
+    assert rate == 16000
+    waveform = torch.from_numpy(samples).float()
+    is_multichannel = waveform.dim() == 2
+    if is_multichannel:
+        waveform = waveform.mean(-1)  # down-mix to mono
+    assert waveform.dim() == 1, waveform.dim()
+    if normalize:
+        with torch.no_grad():
+            waveform = F.layer_norm(waveform, waveform.shape)
+    return waveform.view(1, -1)
+# HuBERT model
+printt("load model(s) from {}".format(model_path))
+models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+    [model_path],
+    suffix="",
+)
+model = models[0]
+model = model.to(device)
+# Run in half precision only on CUDA. `device` falls back to the CPU when no
+# GPU is available, and float16 inference is not reliably supported there.
+use_half = device.type == "cuda"
+if use_half:
+    model = model.half()
+model.eval()
+
+# This process handles every n_part-th file starting at offset i_part.
+todo = sorted(os.listdir(wavPath))[i_part::n_part]
+n = max(1, len(todo) // 10)  # progress-log stride: aims for about ten lines
+if len(todo) == 0:
+    printt("no-feature-todo")
+else:
+    printt("all-feature-%s"%len(todo))
+    for idx, file in enumerate(todo):
+        try:
+            if not file.endswith(".wav"):
+                continue
+            wav_path = "%s/%s"%(wavPath,file)
+            # Swap only the extension; str.replace("wav", "npy") would also
+            # rewrite any "wav" occurring inside the base name.
+            out_path = "%s/%s"%(outPath,os.path.splitext(file)[0] + ".npy")
+
+            # Skip files whose features were already extracted.
+            if os.path.exists(out_path):
+                continue
+
+            feats = readwave(wav_path, normalize=saved_cfg.task.normalize)
+            padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+            if use_half:
+                feats = feats.half()  # match the model's precision
+            inputs = {
+                "source": feats.to(device),
+                "padding_mask": padding_mask.to(device),
+                "output_layer": 9,  # layer 9
+            }
+            with torch.no_grad():
+                logits = model.extract_features(**inputs)
+                feats = model.final_proj(logits[0])
+
+            feats = feats.squeeze(0).float().cpu().numpy()
+            # feats = np.repeat(feats, 2,0) # 20ms -> 10ms
+            np.save(out_path, feats, allow_pickle=False)
+            if idx % n == 0:
+                # "now" is the current index and "all" the total, as in the F0 extraction log.
+                printt("now-%s,all-%s,%s,%s"%(idx,len(todo),file,feats.shape))
+        except Exception:
+            # Log the failure and continue; KeyboardInterrupt/SystemExit still propagate.
+            printt(traceback.format_exc())
+    printt("all-feature-done")