diff --git a/infer/lib/audio.py b/infer/lib/audio.py
index 3bc0bfb7..f55e47a6 100644
--- a/infer/lib/audio.py
+++ b/infer/lib/audio.py
@@ -1,11 +1,16 @@
 from io import BufferedWriter, BytesIO
 from pathlib import Path
-from typing import Dict, Tuple
+from typing import Dict, Tuple, Optional, Union, List
 import os
+import math
+import wave
 
 import numpy as np
+from numba import jit
 import av
 from av.audio.resampler import AudioResampler
+from av.audio.frame import AudioFrame
+import scipy.io.wavfile as wavfile
 
 video_format_dict: Dict[str, str] = {
     "m4a": "mp4",
@@ -17,6 +22,29 @@
 }
 
 
+@jit(nopython=True)
+def float_to_int16(audio: np.ndarray) -> np.ndarray:
+    am = int(math.ceil(float(np.abs(audio).max())) * 32768)
+    am = 32767 * 32768 // am
+    return np.multiply(audio, am).astype(np.int16)
+
+def float_np_array_to_wav_buf(wav: np.ndarray, sr: int, f32=False) -> BytesIO:
+    buf = BytesIO()
+    if f32:
+        wavfile.write(buf, sr, wav.astype(np.float32))
+    else:
+        with wave.open(buf, "wb") as wf:
+            wf.setnchannels(2 if len(wav.shape) > 1 else 1)
+            wf.setsampwidth(2)  # Sample width in bytes
+            wf.setframerate(sr)  # Sample rate in Hz
+            wf.writeframes(float_to_int16(wav.T if len(wav.shape) > 1 else wav))
+    buf.seek(0, 0)
+    return buf
+
+def save_audio(path: str, audio: np.ndarray, sr: int, f32=False):
+    with open(path, "wb") as f:
+        f.write(float_np_array_to_wav_buf(audio, sr, f32).getbuffer())
+
 def wav2(i: BytesIO, o: BufferedWriter, format: str):
     inp = av.open(i, "r")
     format = video_format_dict.get(format, format)
@@ -36,43 +64,72 @@ def wav2(i: BytesIO, o: BufferedWriter, format: str):
     inp.close()
 
 
-def load_audio(file: str, sr: int) -> np.ndarray:
-    if not Path(file).exists():
+def load_audio(
+    file: Union[str, BytesIO, Path],
+    sr: Optional[int]=None,
+    format: Optional[str]=None,
+    mono=True
+) -> Union[np.ndarray, Tuple[np.ndarray, int]]:
+    if (isinstance(file, str) and not Path(file).exists()) or (isinstance(file, Path) and not file.exists()):
         raise FileNotFoundError(f"File not found: {file}")
 
+    rate = 0
+
+    container = av.open(file, format=format)
+    audio_stream = next(s for s in container.streams if s.type == "audio")
+    channels = 1 if audio_stream.layout == "mono" else 2
+    container.seek(0)
+    resampler = AudioResampler(format="fltp", layout=audio_stream.layout, rate=sr) if sr is not None else None
+
+    # Estimated maximum total number of samples to pre-allocate the array
+    # AV stores length in microseconds by default
+    estimated_total_samples = int(container.duration * sr // 1_000_000) if sr is not None else 48000
+    decoded_audio = np.zeros(estimated_total_samples + 1 if channels == 1 else (channels, estimated_total_samples + 1), dtype=np.float32)
+
+    offset = 0
+
+    def process_packet(packet: List[AudioFrame]):
+        frames_data = []
+        rate = 0
+        for frame in packet:
+            frame.pts = None  # Clear presentation timestamp to avoid resampling issues
+            resampled_frames = resampler.resample(frame) if resampler is not None else [frame]
+            for resampled_frame in resampled_frames:
+                frame_data = resampled_frame.to_ndarray()
+                rate = resampled_frame.rate
+                frames_data.append(frame_data)
+        return (rate, frames_data)
 
-    try:
-        container = av.open(file)
-        resampler = AudioResampler(format="fltp", layout="mono", rate=sr)
+    def frame_iter(container):
+        for p in container.demux(container.streams.audio[0]):
+            yield p.decode()
 
-        # Estimated maximum total number of samples to pre-allocate the array
-        # AV stores length in microseconds by default
-        estimated_total_samples = int(container.duration * sr // 1_000_000)
-        decoded_audio = np.zeros(estimated_total_samples + 1, dtype=np.float32)
+    for r, frames_data in map(process_packet, frame_iter(container)):
+        if not rate: rate = r
+        for frame_data in frames_data:
+            end_index = offset + len(frame_data[0])
 
-        offset = 0
-        for frame in container.decode(audio=0):
-            frame.pts = None  # Clear presentation timestamp to avoid resampling issues
-            resampled_frames = resampler.resample(frame)
-            for resampled_frame in resampled_frames:
-                frame_data = resampled_frame.to_ndarray()[0]
-                end_index = offset + len(frame_data)
+            # Check if decoded_audio has enough space, and resize if necessary
+            if end_index > decoded_audio.shape[1]:
+                decoded_audio = np.resize(decoded_audio, (decoded_audio.shape[0], end_index*4))
 
-                # Check if decoded_audio has enough space, and resize if necessary
-                if end_index > decoded_audio.shape[0]:
-                    decoded_audio = np.resize(decoded_audio, end_index + 1)
+            np.copyto(decoded_audio[..., offset:end_index], frame_data)
+            offset += len(frame_data[0])
 
-                decoded_audio[offset:end_index] = frame_data
-                offset += len(frame_data)
+    # Truncate the array to the actual size
+    decoded_audio = decoded_audio[..., :offset]
 
-        # Truncate the array to the actual size
-        decoded_audio = decoded_audio[:offset]
-    except Exception as e:
-        raise RuntimeError(f"Failed to load audio: {e}")
+    if mono and decoded_audio.shape[0] > 1:
+        decoded_audio = decoded_audio.mean(0)
 
-    return decoded_audio
+    if sr is not None:
+        return decoded_audio
+    return decoded_audio, rate
 
 
-def downsample_audio(input_path: str, output_path: str, format: str) -> None:
+def downsample_audio(input_path: str, output_path: str, format: str, br=128_000) -> None:
+    """
+    default to 128kb/s (equivalent to -q:a 2)
+    """
     if not os.path.exists(input_path):
         return
 
@@ -83,7 +140,7 @@ def downsample_audio(input_path: str, output_path: str, format: str) -> None:
     input_stream = input_container.streams.audio[0]
     output_stream = output_container.add_stream(format)
 
-    output_stream.bit_rate = 128_000  # 128kb/s (equivalent to -q:a 2)
+    output_stream.bit_rate = br
 
     # Copy packets from the input file to the output file
     for packet in input_container.demux(input_stream):
@@ -141,7 +198,7 @@ def resample_audio(
             print(f"Failed to remove the original file: {e}")
 
 
-def get_audio_properties(input_path: str) -> Tuple:
+def get_audio_properties(input_path: str) -> Tuple[int, int]:
     container = av.open(input_path)
     audio_stream = next(s for s in container.streams if s.type == "audio")
     channels = 1 if audio_stream.layout == "mono" else 2
diff --git a/infer/lib/slicer2.py b/infer/lib/slicer2.py
index 7d9d16db..ba751cd6 100644
--- a/infer/lib/slicer2.py
+++ b/infer/lib/slicer2.py
@@ -183,8 +183,7 @@ def main():
     import os.path
     from argparse import ArgumentParser
 
-    import librosa
-    import soundfile
+    from .audio import load_audio, save_audio
 
     parser = ArgumentParser()
     parser.add_argument("audio", type=str, help="The audio to be sliced")
@@ -230,7 +229,7 @@
     out = args.out
     if out is None:
         out = os.path.dirname(os.path.abspath(args.audio))
-    audio, sr = librosa.load(args.audio, sr=None, mono=False)
+    audio, sr = load_audio(args.audio, mono=False)
     slicer = Slicer(
         sr=sr,
         threshold=args.db_thresh,
@@ -245,15 +244,11 @@
     for i, chunk in enumerate(chunks):
         if len(chunk.shape) > 1:
             chunk = chunk.T
-        soundfile.write(
-            os.path.join(
-                out,
-                f"%s_%d.wav"
-                % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
-            ),
-            chunk,
-            sr,
-        )
+        save_audio(os.path.join(
+            out,
+            f"%s_%d.wav"
+            % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
+        ), chunk, sr)
 
 
 if __name__ == "__main__":
__name__ == "__main__": diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py index 5d8b5559..957576f7 100644 --- a/infer/lib/train/utils.py +++ b/infer/lib/train/utils.py @@ -16,62 +16,12 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) logger = logging -""" -def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - - ################## - def go(model, bkey): - saved_state_dict = checkpoint_dict[bkey] - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict = {} - for k, v in state_dict.items(): # 模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if saved_state_dict[k].shape != state_dict[k].shape: - logger.warning( - "shape-%s-mismatch. need: %s, get: %s", - k, - state_dict[k].shape, - saved_state_dict[k].shape, - ) # - raise KeyError - except: - # logger.info(traceback.format_exc()) - logger.info("%s is not in the checkpoint", k) # pretrain缺失的 - new_state_dict[k] = v # 模型自带的随机值 - if hasattr(model, "module"): - model.module.load_state_dict(new_state_dict, strict=False) - else: - model.load_state_dict(new_state_dict, strict=False) - return model - - go(combd, "combd") - model = go(sbd, "sbd") - ############# - logger.info("Loaded model weights") - - iteration = checkpoint_dict["iteration"] - learning_rate = checkpoint_dict["learning_rate"] - if ( - optimizer is not None and load_opt == 1 - ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch - # try: - optimizer.load_state_dict(checkpoint_dict["optimizer"]) - # except: - # traceback.print_exc() - logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration -""" - def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): assert os.path.isfile(checkpoint_path) - saved_state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True) + + saved_state_dict = checkpoint_dict["model"] if hasattr(model, "module"): state_dict = model.module.state_dict() else: @@ -132,34 +82,6 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) ) -""" -def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path): - logger.info( - "Saving model and optimizer state at epoch {} to {}".format( - iteration, checkpoint_path - ) - ) - if hasattr(combd, "module"): - state_dict_combd = combd.module.state_dict() - else: - state_dict_combd = combd.state_dict() - if hasattr(sbd, "module"): - state_dict_sbd = sbd.module.state_dict() - else: - state_dict_sbd = sbd.state_dict() - torch.save( - { - "combd": state_dict_combd, - "sbd": state_dict_sbd, - "iteration": iteration, - "optimizer": optimizer.state_dict(), - "learning_rate": learning_rate, - }, - checkpoint_path, - ) -""" - - def summarize( writer, global_step, @@ -366,53 +288,6 @@ def get_hparams(init=True): return hparams -""" -def get_hparams_from_dir(model_dir): - config_save_path = os.path.join(model_dir, "config.json") - with open(config_save_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - hparams.model_dir = model_dir - return hparams - - -def get_hparams_from_file(config_path): - with open(config_path, "r") as f: - data = f.read() - config = json.loads(data) - - 
hparams = HParams(**config) - return hparams - - -def check_git_hash(model_dir): - source_dir = os.path.dirname(os.path.realpath(__file__)) - if not os.path.exists(os.path.join(source_dir, ".git")): - logger.warning( - "{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir - ) - ) - return - - cur_hash = subprocess.getoutput("git rev-parse HEAD") - - path = os.path.join(model_dir, "githash") - if os.path.exists(path): - saved_hash = open(path).read() - if saved_hash != cur_hash: - logger.warning( - "git hash values are different. {}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8] - ) - ) - else: - open(path, "w").write(cur_hash) -""" - - def get_logger(model_dir, filename="train.log"): global logger logger = logging.getLogger(os.path.basename(model_dir)) diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py index 96a69dee..3b599741 100644 --- a/infer/modules/train/extract_feature_print.py +++ b/infer/modules/train/extract_feature_print.py @@ -2,6 +2,11 @@ import sys import traceback +now_dir = os.getcwd() +sys.path.append(now_dir) + +from infer.lib.audio import load_audio + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" @@ -20,7 +25,6 @@ is_half = sys.argv[7].lower() == "true" import fairseq import numpy as np -import soundfile as sf import torch import torch.nn.functional as F @@ -64,11 +68,9 @@ def printt(strr): # wave must be 16k, hop_size=320 def readwave(wav_path, normalize=False): - wav, sr = sf.read(wav_path) + wav, sr = load_audio(wav_path) assert sr == 16000 feats = torch.from_numpy(wav).float() - if feats.dim() == 2: # double channels - feats = feats.mean(-1) assert feats.dim() == 1, feats.dim() if normalize: with torch.no_grad(): diff --git a/infer/modules/train/preprocess.py b/infer/modules/train/preprocess.py index 138bb4c6..7fa374ec 100644 --- a/infer/modules/train/preprocess.py +++ b/infer/modules/train/preprocess.py @@ -16,11 +16,9 @@ import os import traceback -import librosa import numpy as np -from scipy.io import wavfile -from infer.lib.audio import load_audio +from infer.lib.audio import load_audio, float_np_array_to_wav_buf, save_audio from infer.lib.slicer2 import Slicer f = open("%s/preprocess.log" % exp_dir, "a+") @@ -64,19 +62,15 @@ def norm_write(self, tmp_audio, idx0, idx1): tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + ( 1 - self.alpha ) * tmp_audio - wavfile.write( - "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), - self.sr, - tmp_audio.astype(np.float32), - ) - tmp_audio = librosa.resample( - tmp_audio, orig_sr=self.sr, target_sr=16000 - ) # , res_type="soxr_vhq" - wavfile.write( - "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), - 16000, - tmp_audio.astype(np.float32), - ) + save_audio("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), tmp_audio, self.sr, f32=True) + with open("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), "wb") as f: + f.write(float_np_array_to_wav_buf( + load_audio( + float_np_array_to_wav_buf(tmp_audio, self.sr, f32=True), + sr=16000, + format="wav", + ) + , 16000, True).getbuffer()) def pipeline(self, path, idx0): try: diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py index 31b41f85..cbd50198 100644 --- a/infer/modules/uvr5/mdxnet.py +++ b/infer/modules/uvr5/mdxnet.py @@ -5,12 +5,10 @@ import librosa import numpy as np -import soundfile as sf import torch from tqdm import tqdm -import av -from infer.lib.audio import downsample_audio 
+from infer.lib.audio import downsample_audio, save_audio
 
 cpu = torch.device("cpu")
 
@@ -210,15 +208,13 @@ def prediction(self, m, vocal_root, others_root, format):
         sources = self.demix(mix.T)
         opt = sources[0].T
         if format in ["wav", "flac"]:
-            sf.write(
-                "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate
-            )
-            sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)
+            save_audio("%s/vocal_%s.%s" % (vocal_root, basename, format), mix - opt, rate)
+            save_audio("%s/instrument_%s.%s" % (others_root, basename, format), opt, rate)
         else:
-            path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename)
-            path_other = "%s/%s_others.wav" % (others_root, basename)
-            sf.write(path_vocal, mix - opt, rate)
-            sf.write(path_other, opt, rate)
+            path_vocal = "%s/vocal_%s.wav" % (vocal_root, basename)
+            path_other = "%s/instrument_%s.wav" % (others_root, basename)
+            save_audio(path_vocal, mix - opt, rate)
+            save_audio(path_other, opt, rate)
             opt_path_vocal = path_vocal[:-4] + ".%s" % format
             opt_path_other = path_other[:-4] + ".%s" % format
             downsample_audio(path_vocal, opt_path_vocal, format)
diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py
index 8c6d1557..15ea1746 100644
--- a/infer/modules/uvr5/modules.py
+++ b/infer/modules/uvr5/modules.py
@@ -9,7 +9,7 @@
 
 from configs import Config
 from infer.modules.uvr5.mdxnet import MDXNetDereverb
-from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho
+from infer.modules.uvr5.vr import AudioPre
 
 config = Config()
@@ -27,8 +27,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
         if model_name == "onnx_dereverb_By_FoxJoy":
             pre_fun = MDXNetDereverb(15, config.device)
         else:
-            func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
-            pre_fun = func(
+            pre_fun = AudioPre(
                 agg=int(agg),
                 model_path=os.path.join(
                     os.getenv("weight_uvr5_root"), model_name + ".pth"
                 ),
@@ -72,18 +71,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
                 infos.append("%s->Success" % (os.path.basename(inp_path)))
                 yield "\n".join(infos)
             except:
-                try:
-                    if done == 0:
-                        pre_fun._path_audio_(
-                            inp_path, save_root_ins, save_root_vocal, format0
-                        )
-                    infos.append("%s->Success" % (os.path.basename(inp_path)))
-                    yield "\n".join(infos)
-                except:
-                    infos.append(
-                        "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
-                    )
-                    yield "\n".join(infos)
+                infos.append(
+                    "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
+                )
+                yield "\n".join(infos)
     except:
         infos.append(traceback.format_exc())
         yield "\n".join(infos)
diff --git a/infer/modules/uvr5/vr.py b/infer/modules/uvr5/vr.py
index e4c0f7d0..9156933d 100644
--- a/infer/modules/uvr5/vr.py
+++ b/infer/modules/uvr5/vr.py
@@ -5,8 +5,7 @@
 
 import librosa
 import numpy as np
-import soundfile as sf
-from infer.lib.audio import downsample_audio
+from infer.lib.audio import downsample_audio, save_audio
 import torch
 
 from infer.lib.uvr5_pack.lib_v5 import nets_123821KB as Nets
@@ -20,6 +19,8 @@ class AudioPre:
     def __init__(self, agg, model_path, device, is_half, tta=False):
         self.model_path = model_path
         self.device = device
+        self.is_de_echo = "DeEcho" in model_path
+        self.is_reverse = self.is_de_echo or "HP3" in model_path
         self.data = {
             # Processing Options
             "postprocess": False,
@@ -29,8 +30,13 @@ def __init__(self, agg, model_path, device, is_half, tta=False):
             "agg": agg,
             "high_end_process": "mirroring",
         }
-        mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
-        model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
+        if self.is_de_echo:
+            mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
+            nout = 64 if "DeReverb" in model_path else 48
+            model = CascadedNet(mp.param["bins"] * 2, nout)
+        else:
+            mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
+            model = Nets.CascadedASPPNet(mp.param["bins"] * 2)
         cpk = torch.load(model_path, map_location="cpu")
         model.load_state_dict(cpk)
         model.eval()
@@ -123,30 +129,28 @@ def _path_audio_(
             else:
                 wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
             logger.info("%s instruments done" % name)
-            head = "instrument_"
+            if self.is_reverse:
+                head = "vocal_"
+            else:
+                head = "instrument_"
             if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
+                save_audio(os.path.join(
                         ins_root,
                         head + "{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )  #
+                    ), wav_instrument, self.mp.param["sr"])
             else:
                 path = os.path.join(
                     ins_root, head + "{}_{}.wav".format(name, self.data["agg"])
                 )
-                sf.write(
-                    path,
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
+                save_audio(path, wav_instrument, self.mp.param["sr"])
                 if os.path.exists(path):
                     opt_format_path = path[:-4] + ".%s" % format
                     downsample_audio(path, opt_format_path, format)
         if vocal_root is not None:
-            head = "vocal_"
+            if self.is_reverse:
+                head = "instrument_"
+            else:
+                head = "vocal_"
             if self.data["high_end_process"].startswith("mirroring"):
                 input_high_end_ = spec_utils.mirroring(
                     self.data["high_end_process"], v_spec_m, input_high_end, self.mp
@@ -158,185 +162,15 @@ def _path_audio_(
                 wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
             logger.info("%s vocals done" % name)
             if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
+                save_audio(os.path.join(
                         vocal_root,
                         head + "{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
+                    ), wav_vocals, self.mp.param["sr"])
             else:
                 path = os.path.join(
                     vocal_root, head + "{}_{}.wav".format(name, self.data["agg"])
                 )
-                sf.write(
-                    path,
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                opt_format_path = path[:-4] + ".%s" % format
-                downsample_audio(path, opt_format_path, format)
-
-
-class AudioPreDeEcho:
-    def __init__(self, agg, model_path, device, is_half, tta=False):
-        self.model_path = model_path
-        self.device = device
-        self.data = {
-            # Processing Options
-            "postprocess": False,
-            "tta": tta,
-            # Constants
-            "window_size": 512,
-            "agg": agg,
-            "high_end_process": "mirroring",
-        }
-        mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json")
-        nout = 64 if "DeReverb" in model_path else 48
-        model = CascadedNet(mp.param["bins"] * 2, nout)
-        cpk = torch.load(model_path, map_location="cpu")
-        model.load_state_dict(cpk)
-        model.eval()
-        if is_half:
-            model = model.half().to(device)
-        else:
-            model = model.to(device)
-
-        self.mp = mp
-        self.model = model
-
-    def _path_audio_(
-        self, music_file, vocal_root=None, ins_root=None, format="flac"
-    ):  # 3个VR模型vocal和ins是反的
-        if ins_root is None and vocal_root is None:
-            return "No save root."
-        name = os.path.basename(music_file)
-        if ins_root is not None:
-            os.makedirs(ins_root, exist_ok=True)
-        if vocal_root is not None:
-            os.makedirs(vocal_root, exist_ok=True)
-        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
-        bands_n = len(self.mp.param["band"])
-        # print(bands_n)
-        for d in range(bands_n, 0, -1):
-            bp = self.mp.param["band"][d]
-            if d == bands_n:  # high-end band
-                (
-                    X_wave[d],
-                    _,
-                ) = librosa.load(  # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑
-                    music_file,
-                    sr=bp["sr"],
-                    mono=False,
-                    dtype=np.float32,
-                    res_type=bp["res_type"],
-                )
-                if X_wave[d].ndim == 1:
-                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
-            else:  # lower bands
-                X_wave[d] = librosa.resample(
-                    X_wave[d + 1],
-                    orig_sr=self.mp.param["band"][d + 1]["sr"],
-                    target_sr=bp["sr"],
-                    res_type=bp["res_type"],
-                )
-            # Stft of wave source
-            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
-                X_wave[d],
-                bp["hl"],
-                bp["n_fft"],
-                self.mp.param["mid_side"],
-                self.mp.param["mid_side_b2"],
-                self.mp.param["reverse"],
-            )
-            # pdb.set_trace()
-            if d == bands_n and self.data["high_end_process"] != "none":
-                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
-                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
-                )
-                input_high_end = X_spec_s[d][
-                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
-                ]
-
-        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
-        aggresive_set = float(self.data["agg"] / 100)
-        aggressiveness = {
-            "value": aggresive_set,
-            "split_bin": self.mp.param["band"][1]["crop_stop"],
-        }
-        with torch.no_grad():
-            pred, X_mag, X_phase = inference(
-                X_spec_m, self.device, self.model, aggressiveness, self.data
-            )
-        # Postprocess
-        if self.data["postprocess"]:
-            pred_inv = np.clip(X_mag - pred, 0, np.inf)
-            pred = spec_utils.mask_silence(pred, pred_inv)
-        y_spec_m = pred * X_phase
-        v_spec_m = X_spec_m - y_spec_m
-
-        if ins_root is not None:
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
-                )
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
-                    y_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
-            logger.info("%s instruments done" % name)
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        ins_root,
-                        "vocal_{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )  #
-            else:
-                path = os.path.join(
-                    ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
-                )
-                sf.write(
-                    path,
-                    (np.array(wav_instrument) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-                if os.path.exists(path):
-                    opt_format_path = path[:-4] + ".%s" % format
-                    downsample_audio(path, opt_format_path, format)
-        if vocal_root is not None:
-            if self.data["high_end_process"].startswith("mirroring"):
-                input_high_end_ = spec_utils.mirroring(
-                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
-                )
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
-                    v_spec_m, self.mp, input_high_end_h, input_high_end_
-                )
-            else:
-                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
-            logger.info("%s vocals done" % name)
-            if format in ["wav", "flac"]:
-                sf.write(
-                    os.path.join(
-                        vocal_root,
-                        "instrument_{}_{}.{}".format(name, self.data["agg"], format),
-                    ),
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
-            else:
-                path = os.path.join(
-                    vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
-                )
-                sf.write(
-                    path,
-                    (np.array(wav_vocals) * 32768).astype("int16"),
-                    self.mp.param["sr"],
-                )
+                save_audio(path, wav_vocals, self.mp.param["sr"])
                 if os.path.exists(path):
                     opt_format_path = path[:-4] + ".%s" % format
                     downsample_audio(path, opt_format_path, format)
diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py
index 49f21dc8..3e526a04 100644
--- a/infer/modules/vc/modules.py
+++ b/infer/modules/vc/modules.py
@@ -5,11 +5,10 @@
 logger = logging.getLogger(__name__)
 
 import numpy as np
-import soundfile as sf
 import torch
 from io import BytesIO
 
-from infer.lib.audio import load_audio, wav2
+from infer.lib.audio import load_audio, wav2, save_audio, float_np_array_to_wav_buf
 from rvc.synthesizer import get_synthesizer, load_synthesizer
 from .info import show_model_info
 from .pipeline import Pipeline
@@ -253,23 +252,16 @@ def vc_multi(
                 try:
                     tgt_sr, audio_opt = opt
                     if format1 in ["wav", "flac"]:
-                        sf.write(
-                            "%s/%s.%s"
-                            % (opt_root, os.path.basename(path), format1),
-                            audio_opt,
-                            tgt_sr,
-                        )
+                        save_audio("%s/%s.%s"
+                            % (opt_root, os.path.basename(path), format1), audio_opt, tgt_sr)
                     else:
                         path = "%s/%s.%s" % (
                             opt_root,
                             os.path.basename(path),
                             format1,
                         )
-                        with BytesIO() as wavf:
-                            sf.write(wavf, audio_opt, tgt_sr, format="wav")
-                            wavf.seek(0, 0)
-                            with open(path, "wb") as outf:
-                                wav2(wavf, outf, format1)
+                        with open(path, "wb") as outf:
+                            wav2(float_np_array_to_wav_buf(audio_opt, tgt_sr), outf, format1)
                 except:
                     info += traceback.format_exc()
                 infos.append("%s->%s" % (os.path.basename(path), info))
diff --git a/requirements/amd.txt b/requirements/amd.txt
index 81088f77..bd106f17 100644
--- a/requirements/amd.txt
+++ b/requirements/amd.txt
@@ -10,7 +10,6 @@ faiss-cpu==1.7.3
 gradio
 Cython
 pydub>=0.25.1
-soundfile>=0.12.1
 tensorboardX
 Jinja2>=3.1.2
 json5
diff --git a/requirements/dml.txt b/requirements/dml.txt
index 9b8595fa..fe76a8d6 100644
--- a/requirements/dml.txt
+++ b/requirements/dml.txt
@@ -9,7 +9,6 @@ faiss-cpu==1.7.3
 gradio
 Cython
 pydub>=0.25.1
-soundfile>=0.12.1
 tensorboardX
 Jinja2>=3.1.2
 json5
diff --git a/requirements/gui-dml.txt b/requirements/gui-dml.txt
index 6f7e11ea..db0dad05 100644
--- a/requirements/gui-dml.txt
+++ b/requirements/gui-dml.txt
@@ -18,7 +18,6 @@ PyYAML
 resampy
 scikit_learn
 scipy
-SoundFile
 tensorboard
 tqdm
 wave
diff --git a/requirements/gui.txt b/requirements/gui.txt
index 55e7bd96..b6af49dd 100644
--- a/requirements/gui.txt
+++ b/requirements/gui.txt
@@ -18,7 +18,6 @@ PyYAML
 resampy
 scikit_learn
 scipy
-SoundFile
 tensorboard
 tqdm
 wave
diff --git a/requirements/ipex.txt b/requirements/ipex.txt
index 5a55fb84..e7b0d594 100644
--- a/requirements/ipex.txt
+++ b/requirements/ipex.txt
@@ -14,7 +14,6 @@ faiss-cpu==1.7.3
 gradio
 Cython
 pydub>=0.25.1
-soundfile>=0.12.1
 tensorboardX
 Jinja2>=3.1.2
 json5
diff --git a/requirements/main.txt b/requirements/main.txt
index 3f83548b..9f23f694 100644
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -9,7 +9,6 @@ faiss-cpu
 gradio
 Cython
 pydub>=0.25.1
-soundfile>=0.12.1
 tensorboardX
 Jinja2>=3.1.2
 json5
diff --git a/requirements/py311.txt b/requirements/py311.txt
index 6547a802..fef9c3c7 100644
--- a/requirements/py311.txt
+++ b/requirements/py311.txt
@@ -9,7 +9,6 @@ faiss-cpu
 gradio
 Cython
 pydub>=0.25.1
-soundfile>=0.12.1
 tensorboardX
 Jinja2>=3.1.2
 json5
diff --git a/tools/cmd/infer-pm-index256.py b/tools/cmd/infer-pm-index256.py
index a9905548..3993b847 100644
--- a/tools/cmd/infer-pm-index256.py
+++ b/tools/cmd/infer-pm-index256.py
@@ -18,7 +18,6 @@
 # import pyworld
 import librosa
 import numpy as np
-import soundfile as sf
 import torch.nn.functional as F
 from fairseq import checkpoint_utils
 
@@ -33,6 +32,7 @@
 # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
 # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
 
+from infer.lib.audio import load_audio
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt"  #
@@ -132,7 +132,7 @@ def get_f0(x, p_len, f0_up_key=0):
 ):  ##
     wav_path = "todo-songs/%s" % name  #
     f0_up_key = -2  #
-    audio, sampling_rate = sf.read(wav_path)
+    audio, sampling_rate = load_audio(wav_path)
     if len(audio.shape) > 1:
         audio = librosa.to_mono(audio.transpose(1, 0))
     if sampling_rate != 16000:
diff --git a/tools/cmd/onnx/infer.py b/tools/cmd/onnx/infer.py
index 41364730..d9c43117 100644
--- a/tools/cmd/onnx/infer.py
+++ b/tools/cmd/onnx/infer.py
@@ -1,8 +1,9 @@
-import soundfile
 import librosa
 
 from rvc.onnx import RVC
 
+from infer.lib.audio import save_audio
+
 hop_size = 512
 sampling_rate = 40000  # 采样率
 f0_up_key = 0  # 升降调
@@ -19,4 +20,4 @@
 
 audio = model.infer(wav, sr, sampling_rate, sid, f0_method, f0_up_key)
 
-soundfile.write(out_path, audio, sampling_rate)
+save_audio(out_path, audio, sampling_rate)
\ No newline at end of file
diff --git a/web.py b/web.py
index 31e506c3..6d3531ce 100644
--- a/web.py
+++ b/web.py
@@ -141,8 +141,8 @@ def forward_dml(ctx, x, scale):
 index_root = os.getenv("index_root")
 outside_index_root = os.getenv("outside_index_root")
 
-names = []
-index_paths = []
+names = [""]
+index_paths = [""]
 
 
 def lookup_names(weight_root):
     global names
@@ -168,9 +168,9 @@ def lookup_indices(index_root):
 
 def change_choices():
     global index_paths, names
-    names = []
+    names = [""]
     lookup_names(weight_root)
-    index_paths = []
+    index_paths = [""]
     lookup_indices(index_root)
     lookup_indices(outside_index_root)
     return {"choices": sorted(names), "__type__": "update"}, {
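Usage sketch (illustrative only, not part of the patch): the helpers added in infer/lib/audio.py replace the former soundfile/librosa calls throughout the tree. A minimal example of the new API, assuming a placeholder file "input.wav"; the call signatures follow the code added above, while the file names are hypothetical.

    from infer.lib.audio import load_audio, save_audio, float_np_array_to_wav_buf

    # sr omitted -> decode at the native rate and return (audio, rate); mono=False keeps channels
    audio, sr = load_audio("input.wav", mono=False)  # "input.wav" is a placeholder path

    # sr given -> return only the ndarray, resampled and mixed down to mono by default
    audio_16k = load_audio("input.wav", sr=16000)

    # int16 WAV (default) and float32 WAV (f32=True), mirroring what preprocess.py does above
    save_audio("out_16k.wav", audio_16k, 16000)
    save_audio("out_16k_f32.wav", audio_16k, 16000, f32=True)

    # In-memory WAV buffer, e.g. fed to wav2() for transcoding as in vc/modules.py
    buf = float_np_array_to_wav_buf(audio_16k, 16000)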