fumiama
diff --git a/infer/lib/audio.py b/infer/lib/audio.py
Lines changed: 41 additions & 9 deletions
diff --git a/infer/lib/slicer2.py b/infer/lib/slicer2.py
Lines changed: 7 additions & 12 deletions
diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py
Lines changed: 6 additions & 2 deletions
diff --git a/infer/modules/train/preprocess.py b/infer/modules/train/preprocess.py
Lines changed: 10 additions & 16 deletions
diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py
Lines changed: 7 additions & 11 deletions
diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py
Lines changed: 6 additions & 15 deletions
@@ -1,9 +1,12 @@
 from io import BufferedWriter, BytesIO
 from pathlib import Path
-from typing import Dict, Tuple
+from typing import Dict, Tuple, Optional, Union
 import os
+import math
+import wave
 
 import numpy as np
+from numba import jit
 import av
 from av.audio.resampler import AudioResampler
 
@@ -17,6 +20,26 @@
 }
 
 
+@jit(nopython=True)
+def float_to_int16(audio: np.ndarray) -> np.ndarray:
+    """
+    Scale floating-point samples into the int16 range.
+
+    The gain is 32767 divided (integer division) by the peak magnitude
+    rounded up to a whole number, so input peaking at or below 1.0 is
+    multiplied by 32767 and louder input is attenuated to fit.
+    Empty and all-zero input use a peak of 1 instead of failing.
+    """
+    peak = 1  # also used for empty input, where max() would raise
+    if audio.size > 0:
+        # Clamp to 1 so silent input cannot produce a zero divisor.
+        peak = max(peak, int(math.ceil(float(np.abs(audio).max()))))
+    # Same value as 32767 * 32768 // (peak * 32768).
+    gain = 32767 // peak
+    return np.multiply(audio, gain).astype(np.int16)
+
+def float_np_array_to_wav_buf(wav: np.ndarray, sr: int) -> BytesIO:
+    """
+    Encode float samples as a 16-bit PCM WAV held in memory.
+
+    The samples are converted with float_to_int16 and written behind a
+    header declaring one channel, 2-byte samples and sample rate `sr`.
+    The returned buffer is rewound to its start.
+    """
+    # NOTE(review): the header always declares mono; confirm that callers
+    # never pass multi-channel arrays.
+    stream = BytesIO()
+    with wave.open(stream, "wb") as writer:
+        # (nchannels, sampwidth, framerate, nframes, comptype, compname)
+        writer.setparams((1, 2, sr, 0, "NONE", "not compressed"))
+        writer.writeframes(float_to_int16(wav))
+    stream.seek(0)
+    return stream
+
+def save_audio(path: str, audio: np.ndarray, sr: int):
+    """
+    Write `audio` to `path` as a WAV file with sample rate `sr`.
+
+    The samples are encoded through float_np_array_to_wav_buf before the
+    destination is opened, so a failure while encoding does not leave a
+    newly created or truncated file behind.
+    """
+    wav_buf = float_np_array_to_wav_buf(audio, sr)
+    with open(path, "wb") as f:
+        f.write(wav_buf.getbuffer())
+
 def wav2(i: BytesIO, o: BufferedWriter, format: str):
     inp = av.open(i, "r")
     format = video_format_dict.get(format, format)
@@ -36,24 +59,28 @@ def wav2(i: BytesIO, o: BufferedWriter, format: str):
     inp.close()
 
 
-def load_audio(file: str, sr: int) -> np.ndarray:
-    if not Path(file).exists():
+def load_audio(file: Union[str, BytesIO, Path], sr: Optional[int]=None, format: Optional[str]=None) -> Union[np.ndarray, Tuple[np.ndarray, int]]:
+    """
+    load audio to mono channel
+    """
+    if (isinstance(file, str) and not Path(file).exists()) or (isinstance(file, Path) and not file.exists()):
         raise FileNotFoundError(f"File not found: {file}")
-
+    rate = 0
     try:
-        container = av.open(file)
+        container = av.open(file, format=format)
         resampler = AudioResampler(format="fltp", layout="mono", rate=sr)
 
         # Estimated maximum total number of samples to pre-allocate the array
         # AV stores length in microseconds by default
-        estimated_total_samples = int(container.duration * sr // 1_000_000)
+        estimated_total_samples = int(container.duration * sr // 1_000_000) if sr is not None else 48000
         decoded_audio = np.zeros(estimated_total_samples + 1, dtype=np.float32)
 
         offset = 0
         for frame in container.decode(audio=0):
             frame.pts = None  # Clear presentation timestamp to avoid resampling issues
             resampled_frames = resampler.resample(frame)
             for resampled_frame in resampled_frames:
+                rate = resampled_frame.rate
                 frame_data = resampled_frame.to_ndarray()[0]
                 end_index = offset + len(frame_data)
 
@@ -69,10 +96,15 @@ def load_audio(file: str, sr: int) -> np.ndarray:
     except Exception as e:
         raise RuntimeError(f"Failed to load audio: {e}")
 
-    return decoded_audio
+    if sr is not None:
+        return decoded_audio
+    return decoded_audio, rate
 
 
-def downsample_audio(input_path: str, output_path: str, format: str) -> None:
+def downsample_audio(input_path: str, output_path: str, format: str, br=128_000) -> None:
+    """
+    default to 128kb/s (equivalent to -q:a 2)
+    """
     if not os.path.exists(input_path):
         return
 
@@ -83,7 +115,7 @@ def downsample_audio(input_path: str, output_path: str, format: str) -> None:
     input_stream = input_container.streams.audio[0]
     output_stream = output_container.add_stream(format)
 
-    output_stream.bit_rate = 128_000  # 128kb/s (equivalent to -q:a 2)
+    output_stream.bit_rate = br
 
     # Copy packets from the input file to the output file
     for packet in input_container.demux(input_stream):
 
@@ -183,8 +183,7 @@ def main():
     import os.path
     from argparse import ArgumentParser
 
-    import librosa
-    import soundfile
+    from .audio import load_audio, save_audio
 
     parser = ArgumentParser()
     parser.add_argument("audio", type=str, help="The audio to be sliced")
@@ -230,7 +229,7 @@ def main():
     out = args.out
     if out is None:
         out = os.path.dirname(os.path.abspath(args.audio))
-    audio, sr = librosa.load(args.audio, sr=None, mono=False)
+    audio, sr = load_audio(args.audio)
     slicer = Slicer(
         sr=sr,
         threshold=args.db_thresh,
@@ -245,15 +244,11 @@ def main():
     for i, chunk in enumerate(chunks):
         if len(chunk.shape) > 1:
             chunk = chunk.T
-        soundfile.write(
-            os.path.join(
-                out,
-                f"%s_%d.wav"
-                % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
-            ),
-            chunk,
-            sr,
-        )
+        save_audio(os.path.join(
+            out,
+            f"%s_%d.wav"
+            % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
+        ), chunk, sr)
 
 
 if __name__ == "__main__":
 
@@ -2,6 +2,11 @@
 import sys
 import traceback
 
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from infer.lib.audio import load_audio
+
 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
 
@@ -20,7 +25,6 @@
     is_half = sys.argv[7].lower() == "true"
 import fairseq
 import numpy as np
-import soundfile as sf
 import torch
 import torch.nn.functional as F
 
@@ -64,7 +68,7 @@ def printt(strr):
 
 # wave must be 16k, hop_size=320
 def readwave(wav_path, normalize=False):
-    wav, sr = sf.read(wav_path)
+    wav, sr = load_audio(wav_path)
     assert sr == 16000
     feats = torch.from_numpy(wav).float()
     if feats.dim() == 2:  # double channels
 
@@ -16,11 +16,9 @@
 import os
 import traceback
 
-import librosa
 import numpy as np
-from scipy.io import wavfile
 
-from infer.lib.audio import load_audio
+from infer.lib.audio import load_audio, float_np_array_to_wav_buf, save_audio
 from infer.lib.slicer2 import Slicer
 
 f = open("%s/preprocess.log" % exp_dir, "a+")
@@ -64,19 +62,15 @@ def norm_write(self, tmp_audio, idx0, idx1):
         tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
             1 - self.alpha
         ) * tmp_audio
-        wavfile.write(
-            "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
-            self.sr,
-            tmp_audio.astype(np.float32),
-        )
-        tmp_audio = librosa.resample(
-            tmp_audio, orig_sr=self.sr, target_sr=16000
-        )  # , res_type="soxr_vhq"
-        wavfile.write(
-            "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
-            16000,
-            tmp_audio.astype(np.float32),
-        )
+        save_audio("%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), tmp_audio, self.sr)  
+        with open("%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), "wb") as f:
+            f.write(float_np_array_to_wav_buf(
+                load_audio(
+                    float_np_array_to_wav_buf(tmp_audio, self.sr),
+                    sr=16000,
+                    format="wav"
+                )
+            , 16000).getbuffer())
 
     def pipeline(self, path, idx0):
         try:
 
@@ -5,12 +5,10 @@
 
 import librosa
 import numpy as np
-import soundfile as sf
 import torch
 from tqdm import tqdm
-import av
 
-from infer.lib.audio import downsample_audio
+from infer.lib.audio import downsample_audio, save_audio
 
 cpu = torch.device("cpu")
 
@@ -210,15 +208,13 @@ def prediction(self, m, vocal_root, others_root, format):
         sources = self.demix(mix.T)
         opt = sources[0].T
         if format in ["wav", "flac"]:
-            sf.write(
-                "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate
-            )
-            sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate)
+            save_audio("%s/vocal_%s.%s" % (vocal_root, basename, format), mix - opt, rate)
+            save_audio("%s/instrument_%s.%s" % (others_root, basename, format), opt, rate)
         else:
-            path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename)
-            path_other = "%s/%s_others.wav" % (others_root, basename)
-            sf.write(path_vocal, mix - opt, rate)
-            sf.write(path_other, opt, rate)
+            path_vocal = "%s/vocal_%s.wav" % (vocal_root, basename)
+            path_other = "%s/instrument_%s.wav" % (others_root, basename)
+            # The vocal file gets the residual (mix - opt), as in the wav/flac
+            # branch above; writing `opt` to both paths would duplicate the output.
+            save_audio(path_vocal, mix - opt, rate)
+            save_audio(path_other, opt, rate)
             opt_path_vocal = path_vocal[:-4] + ".%s" % format
             opt_path_other = path_other[:-4] + ".%s" % format
             downsample_audio(path_vocal, opt_path_vocal, format)
 
@@ -9,7 +9,7 @@
 
 from configs import Config
 from infer.modules.uvr5.mdxnet import MDXNetDereverb
-from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho
+from infer.modules.uvr5.vr import AudioPre
 
 config = Config()
 
@@ -27,8 +27,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
         if model_name == "onnx_dereverb_By_FoxJoy":
             pre_fun = MDXNetDereverb(15, config.device)
         else:
-            func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho
-            pre_fun = func(
+            pre_fun = AudioPre(
                 agg=int(agg),
                 model_path=os.path.join(
                     os.getenv("weight_uvr5_root"), model_name + ".pth"
@@ -72,18 +71,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
                 infos.append("%s->Success" % (os.path.basename(inp_path)))
                 yield "\n".join(infos)
             except:
-                try:
-                    if done == 0:
-                        pre_fun._path_audio_(
-                            inp_path, save_root_ins, save_root_vocal, format0
-                        )
-                    infos.append("%s->Success" % (os.path.basename(inp_path)))
-                    yield "\n".join(infos)
-                except:
-                    infos.append(
-                        "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
-                    )
-                    yield "\n".join(infos)
+                infos.append(
+                    "%s->%s" % (os.path.basename(inp_path), traceback.format_exc())
+                )
+                yield "\n".join(infos)
     except:
         infos.append(traceback.format_exc())
         yield "\n".join(infos)