fumiama
diff --git a/infer/lib/audio.py b/infer/lib/audio.py
Lines changed: 11 additions & 45 deletions
diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py
Lines changed: 2 additions & 1 deletion
diff --git a/infer/lib/uvr5_pack/lib_v5/dataset.py b/infer/lib/uvr5_pack/lib_v5/dataset.py
Lines changed: 0 additions & 183 deletions
diff --git a/infer/lib/uvr5_pack/lib_v5/layers.py b/infer/lib/uvr5_pack/lib_v5/layers.py
Lines changed: 8 additions & 3 deletions
diff --git a/infer/lib/uvr5_pack/lib_v5/nets.py b/infer/lib/uvr5_pack/lib_v5/nets.py
Lines changed: 3 additions & 20 deletions
@@ -43,9 +43,14 @@ def float_np_array_to_wav_buf(wav: np.ndarray, sr: int, f32=False) -> BytesIO:
     return buf
 
 
-def save_audio(path: str, audio: np.ndarray, sr: int, f32=False):
+def save_audio(path: str, audio: np.ndarray, sr: int, f32=False, format="wav"):
+    buf = float_np_array_to_wav_buf(audio, sr, f32)
+    if format != "wav":
+        transbuf = BytesIO()
+        wav2(buf, transbuf, format)
+        buf = transbuf
     with open(path, "wb") as f:
-        f.write(float_np_array_to_wav_buf(audio, sr, f32).getbuffer())
+        f.write(buf.getbuffer())
 
 
 def wav2(i: BytesIO, o: BufferedWriter, format: str):
@@ -109,7 +114,7 @@ def process_packet(packet: List[AudioFrame]):
         frames_data = []
         rate = 0
         for frame in packet:
-            frame.pts = None  # 清除时间戳，避免重新采样问题
+            # frame.pts = None  # 清除时间戳，避免重新采样问题
             resampled_frames = (
                 resampler.resample(frame) if resampler is not None else [frame]
             )
@@ -137,6 +142,8 @@ def frame_iter(container):
 
             np.copyto(decoded_audio[..., offset:end_index], frame_data)
             offset += len(frame_data[0])
+    
+    container.close()
 
     # Truncate the array to the actual size
     decoded_audio = decoded_audio[..., :offset]
@@ -149,43 +156,6 @@ def frame_iter(container):
     return decoded_audio, rate
 
 
-def downsample_audio(
-    input_path: str, output_path: str, format: str, br=128_000
-) -> None:
-    """
-    default to 128kb/s (equivalent to -q:a 2)
-    """
-    if not os.path.exists(input_path):
-        return
-
-    input_container = av.open(input_path)
-    output_container = av.open(output_path, "w")
-
-    # Create a stream in the output container
-    input_stream = input_container.streams.audio[0]
-    output_stream = output_container.add_stream(format)
-
-    output_stream.bit_rate = br
-
-    # Copy packets from the input file to the output file
-    for packet in input_container.demux(input_stream):
-        for frame in packet.decode():
-            for out_packet in output_stream.encode(frame):
-                output_container.mux(out_packet)
-
-    for packet in output_stream.encode():
-        output_container.mux(packet)
-
-    # Close the containers
-    input_container.close()
-    output_container.close()
-
-    try:  # Remove the original file
-        os.remove(input_path)
-    except Exception as e:
-        print(f"Failed to remove the original file: {e}")
-
-
 def resample_audio(
     input_path: str, output_path: str, codec: str, format: str, sr: int, layout: str
 ) -> None:
@@ -204,7 +174,7 @@ def resample_audio(
     # Copy packets from the input file to the output file
     for packet in input_container.demux(input_stream):
         for frame in packet.decode():
-            frame.pts = None  # Clear presentation timestamp to avoid resampling issues
+            # frame.pts = None  # Clear presentation timestamp to avoid resampling issues
             out_frames = resampler.resample(frame)
             for out_frame in out_frames:
                 for out_packet in output_stream.encode(out_frame):
@@ -217,10 +187,6 @@ def resample_audio(
     input_container.close()
     output_container.close()
 
-    try:  # Remove the original file
-        os.remove(input_path)
-    except Exception as e:
-        print(f"Failed to remove the original file: {e}")
 
 
 def get_audio_properties(input_path: str) -> Tuple[int, int]:
 
@@ -5,6 +5,7 @@
 import os
 import sys
 from copy import deepcopy
+import math
 
 import codecs
 import numpy as np
@@ -103,7 +104,7 @@ def summarize(
 
 def latest_checkpoint_path(dir_path, regex="G_*.pth"):
     f_list = glob.glob(os.path.join(dir_path, regex))
-    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
+    f_list.sort(key=lambda f: 999999999999 if isinstance(f, str) and f == "latest" else int("0"+"".join(filter(str.isdigit, f))))
     x = f_list[-1]
     logger.debug(x)
     return x
 
@@ -22,7 +22,8 @@ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReL
             activ(),
         )
 
-    def __call__(self, x):
+    @torch.inference_mode()
+    def forward(self, x):
         return self.conv(x)
 
 
@@ -32,7 +33,8 @@ def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
         self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
         self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
 
-    def __call__(self, x):
+    @torch.inference_mode()
+    def forward(self, x):
         h = self.conv1(x)
         h = self.conv2(h)
 
@@ -48,7 +50,8 @@ def __init__(
         # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
         self.dropout = nn.Dropout2d(0.1) if dropout else None
 
-    def __call__(self, x, skip=None):
+    @torch.inference_mode()
+    def forward(self, x, skip=None):
         x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
 
         if skip is not None:
@@ -84,6 +87,7 @@ def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False
         self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
         self.dropout = nn.Dropout2d(0.1) if dropout else None
 
+    @torch.inference_mode()
     def forward(self, x):
         _, _, h, w = x.size()
         feat1 = F.interpolate(
@@ -113,6 +117,7 @@ def __init__(self, nin_conv, nin_lstm, nout_lstm):
             nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
         )
 
+    @torch.inference_mode()
     def forward(self, x):
         N, _, nbins, nframes = x.size()
         h = self.conv(x)[:, 0]  # N, nbins, nframes
 
@@ -24,7 +24,8 @@ def __init__(
         self.lstm_dec2 = layers.LSTMModule(nout * 2, nin_lstm, nout_lstm)
         self.dec1 = layers.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)
 
-    def __call__(self, x):
+    @torch.inference_mode()
+    def forward(self, x):
         e1 = self.enc1(x)
         e2 = self.enc2(e1)
         e3 = self.enc3(e2)
@@ -75,6 +76,7 @@ def __init__(self, n_fft, nout=32, nout_lstm=128):
         self.out = nn.Conv2d(nout, 2, 1, bias=False)
         self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)
 
+    @torch.inference_mode()
     def forward(self, x):
         x = x[:, :, : self.max_bin]
 
@@ -112,22 +114,3 @@ def forward(self, x):
             return mask, aux
         else:
             return mask
-
-    def predict_mask(self, x):
-        mask = self.forward(x)
-
-        if self.offset > 0:
-            mask = mask[:, :, :, self.offset : -self.offset]
-            assert mask.size()[3] > 0
-
-        return mask
-
-    def predict(self, x, aggressiveness=None):
-        mask = self.forward(x)
-        pred_mag = x * mask
-
-        if self.offset > 0:
-            pred_mag = pred_mag[:, :, :, self.offset : -self.offset]
-            assert pred_mag.size()[3] > 0
-
-        return pred_mag