fix output

rakuri255 · rakuri255 · commit b900022abe12 · 2026-02-15T02:14:02.000+01:00
diff --git a/.github/workflows/build-container-image.yml b/.github/workflows/build-container-image.yml
@@ -19,7 +19,7 @@ jobs:
       id-token: write
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6
       - name: Inject enhanced GitHub environment variables
         uses: rlespinasse/github-slug-action@v5 # https://github.com/rlespinasse/github-slug-action
       - name: lowercase IMAGE_REGISTRY
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -27,10 +27,10 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
 
     - name: Set up Python 3.12
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v6
       with:
         python-version: ${{ matrix.python-version }}
 
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
@@ -14,10 +14,10 @@ jobs:
         python-version: ["3.10"]
     steps:
     - name: Checkout code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
 
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: ${{ matrix.python-version }}
 
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -11,10 +11,10 @@ jobs:
 
     steps:
     - name: Checkout code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
 
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v6
       with:
         python-version: ${{ matrix.python-version }}
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,10 +30,6 @@ dependencies = [
     "pyhyphen",
     "tqdm",
     "yt-dlp",
-    "isort",
-    "black",
-    "pylint",
-    "pytest",
     "music21",
     "dataclasses",
     "dataclasses-json",
@@ -46,10 +42,10 @@ dependencies = [
 windows = []
 linux = []
 macos = []
-dev = ["pytest"]
+dev = ["pytest", "isort", "black", "pylint"]
 
 [dependency-groups]
-dev = ["pytest"]
+dev = ["pytest", "isort", "black", "pylint"]
 
 [tool.hatch.build.targets.wheel]
 packages = ["src"]
diff --git a/src/UltraSinger.py b/src/UltraSinger.py
@@ -20,7 +20,7 @@
 from modules.Audio.key_detector import detect_key_from_audio, get_allowed_notes_for_key
 from modules.Audio.silence_processing import remove_silence_from_transcription_data, mute_no_singing_parts
 from modules.Audio.separation import DemucsModel
-from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3
+from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_audio_format
 from modules.Audio.youtube import (
     download_from_youtube,
 )
@@ -518,14 +518,14 @@ def CreateUltraStarTxt(process_data: ProcessData):
     if settings.create_karaoke and version.parse(settings.format_version.value) < version.parse(
             FormatVersion.V1_1_0.value):
         karaoke_output_path = os.path.join(settings.output_folder_path, process_data.basename + " [Karaoke]." + process_data.media_info.audio_extension)
-        convert_wav_to_mp3(process_data.process_data_paths.instrumental_audio_file_path, karaoke_output_path)
+        convert_audio_format(process_data.process_data_paths.instrumental_audio_file_path, karaoke_output_path)
 
     if version.parse(settings.format_version.value) >= version.parse(FormatVersion.V1_1_0.value):
         instrumental_output_path = os.path.join(settings.output_folder_path,
                                                 process_data.basename + " [Instrumental]." + process_data.media_info.audio_extension)
-        convert_wav_to_mp3(process_data.process_data_paths.instrumental_audio_file_path, instrumental_output_path)
+        convert_audio_format(process_data.process_data_paths.instrumental_audio_file_path, instrumental_output_path)
         vocals_output_path = os.path.join(settings.output_folder_path, process_data.basename + " [Vocals]." + process_data.media_info.audio_extension)
-        convert_wav_to_mp3(process_data.process_data_paths.vocals_audio_file_path, vocals_output_path)
+        convert_audio_format(process_data.process_data_paths.vocals_audio_file_path, vocals_output_path)
 
     # Create Ultrastar txt
     if not settings.ignore_audio:
diff --git a/src/modules/Audio/convert_audio.py b/src/modules/Audio/convert_audio.py
@@ -1,6 +1,7 @@
 """Convert audio to other formats"""
 
-from pydub import AudioSegment
+import subprocess
+import os
 import librosa
 import soundfile as sf
 
@@ -14,9 +15,27 @@ def convert_audio_to_mono_wav(input_file_path: str, output_file_path: str) -> No
     sf.write(output_file_path, y, sr)
 
 
-def convert_wav_to_mp3(input_file_path: str, output_file_path: str) -> None:
-    """Convert wav to mp3"""
-    print(f"{ULTRASINGER_HEAD} Converting wav to mp3. -> {output_file_path}")
+def convert_audio_format(input_file_path: str, output_file_path: str) -> None:
+    """Convert audio to the format specified by the output file extension using ffmpeg"""
+    output_format = os.path.splitext(output_file_path)[1].lstrip('.')
 
-    sound = AudioSegment.from_wav(input_file_path)
-    sound.export(output_file_path, format="mp3")
+    print(f"{ULTRASINGER_HEAD} Converting audio to {output_format}. -> {output_file_path}")
+    # todo: does it make sense to re-encode here? It's only used for Instrumental and Vocal
+    # Use ffmpeg for audio conversion
+    # -i: input file
+    # -y: overwrite output file without asking
+    # -loglevel error: only show errors
+    # -q:a 0: highest VBR quality for mp3 (libmp3lame); the scale is codec-specific (libvorbis/ogg: higher is better)
+    # -codec:a copy would be fastest but only works if formats match
+    cmd = [
+        "ffmpeg",
+        "-i", input_file_path,
+        "-y",
+        "-loglevel", "error",
+        "-q:a", "0",
+        output_file_path
+    ]
+
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"FFmpeg audio conversion failed: {result.stderr}")
diff --git a/src/modules/Pitcher/pitcher.py b/src/modules/Pitcher/pitcher.py
@@ -14,9 +14,8 @@ def _get_detector():
     global _swift_f0_detector
     if _swift_f0_detector is None:
         # Initialize for general music/speech (wide frequency range) fmin=46.875, fmax=2093.75
-        # fixme: is this correct?
         # For speech only: fmin=65, fmax=400
-        _swift_f0_detector = SwiftF0(fmin=65, fmax=400, confidence_threshold=0.9)
+        _swift_f0_detector = SwiftF0(fmin=46.875, fmax=2093.75, confidence_threshold=0.9)
     return _swift_f0_detector
 
 
@@ -34,9 +33,21 @@ def get_pitch_with_file(
     if len(audio.shape) > 1:
         audio = np.mean(audio, axis=1)
 
-    # Normalize audio to float if needed
-    if audio.dtype != np.float32 and audio.dtype != np.float64:
-        audio = audio.astype(np.float32) / (2**15)
+    # Normalize audio to float32 based on dtype
+    if audio.dtype == np.uint8:
+        # uint8: range [0, 255] -> subtract 128 and divide by 128
+        audio = (audio.astype(np.float32) - 128.0) / 128.0
+    elif audio.dtype in [np.int16, np.int32, np.int64]:
+        # Signed integers: use iinfo to get max value and normalize
+        dtype_info = np.iinfo(audio.dtype)
+        max_val = max(abs(dtype_info.min), abs(dtype_info.max))
+        audio = audio.astype(np.float32) / float(max_val)
+    elif audio.dtype == np.float64:
+        # float64: cast to float32
+        audio = audio.astype(np.float32)
+    elif audio.dtype != np.float32:
+        # Fallback for other types: assume int16 range
+        audio = audio.astype(np.float32) / 32768.0
 
     return get_pitch_with_swift_f0(audio, sample_rate)
 
diff --git a/src/modules/Speech_Recognition/Whisper.py b/src/modules/Speech_Recognition/Whisper.py
@@ -117,10 +117,6 @@ def transcribe_with_whisper(
             model.value, language=language, device=device, compute_type=compute_type
         )
 
-        # Restore original torch.load after models are loaded
-        # This ensures other modules (like pitch detection) are not affected by the monkey-patch
-        torch.load = _original_torch_load
-
         audio = whisperx.load_audio(audio_path)
 
         print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}")
@@ -164,6 +160,10 @@ def transcribe_with_whisper(
 
         transcribed_data = convert_to_transcribed_data(result_aligned)
 
+        # Restore original torch.load after models are loaded
+        # This ensures other modules (like pitch detection) are not affected by the monkey-patch
+        torch.load = _original_torch_load
+
         return TranscriptionResult(transcribed_data, detected_language)
     except ValueError as value_error:
         # Restore original torch.load in case of error
diff --git a/src/modules/Ultrastar/coverter/ultrastar_txt_converter.py b/src/modules/Ultrastar/coverter/ultrastar_txt_converter.py
@@ -91,7 +91,8 @@ def create_ultrastar_txt_from_automation(
     if media_info.cover_url is not None:
         ultrastar_txt.coverUrl = media_info.cover_url
     if media_info.music_key is not None:
-        ultrastar_txt.tags = media_info.music_key
+        # todo: should the tags be added as a list here?
+        ultrastar_txt.tags = f"key: {media_info.music_key}"
 
     ultrastar_file_output_path = os.path.join(song_folder_output_path, basename + ".txt")
     create_ultrastar_txt(
diff --git a/src/modules/Ultrastar/ultrastar_parser.py b/src/modules/Ultrastar/ultrastar_parser.py
@@ -47,8 +47,6 @@ def parse(input_file: str) -> UltrastarTxtValue:
                 ultrastar_class.gap = line.split(":")[1].replace("\n", "")
             elif line.startswith(f"#{UltrastarTxtTag.BPM.value}"):
                 ultrastar_class.bpm = line.split(":")[1].replace("\n", "")
-            elif line.startswith(f"#{UltrastarTxtTag.VIDEO.value}"):
-                ultrastar_class.video = line.split(":")[1].replace("\n", "")
             elif line.startswith(f"#{UltrastarTxtTag.VIDEOGAP.value}"):
                 ultrastar_class.videoGap = line.split(":")[1].replace("\n", "")
             elif line.startswith(f"#{UltrastarTxtTag.COVER.value}"):
diff --git a/src/modules/Ultrastar/ultrastar_writer.py b/src/modules/Ultrastar/ultrastar_writer.py
@@ -76,8 +76,6 @@ def create_ultrastar_txt(
                 file.write(f"#{UltrastarTxtTag.VOCALS.value}:{ultrastar_class.vocals}\n")
             if ultrastar_class.instrumental is not None:
                 file.write(f"#{UltrastarTxtTag.INSTRUMENTAL.value}:{ultrastar_class.instrumental}\n")
-            if ultrastar_class.tags is not None:
-                file.write(f"#{UltrastarTxtTag.TAGS.value}:{ultrastar_class.tags}\n")
         if ultrastar_class.video is not None:
             file.write(f"#{UltrastarTxtTag.VIDEO.value}:{ultrastar_class.video}\n")
         if ultrastar_class.videoGap is not None:
@@ -87,6 +85,9 @@ def create_ultrastar_txt(
                 file.write(f"#{UltrastarTxtTag.VIDEOURL.value}:{ultrastar_class.videoUrl}\n")
         file.write(f"#{UltrastarTxtTag.BPM.value}:{round(ultrastar_bpm, 2)}\n")  # not the real BPM!
         file.write(f"#{UltrastarTxtTag.GAP.value}:{int(gap * 1000)}\n")
+        if version.parse(ultrastar_class.version) >= version.parse("1.1.0"):
+            if ultrastar_class.tags is not None:
+                file.write(f"#{UltrastarTxtTag.TAGS.value}:{ultrastar_class.tags}\n")
         file.write(f"#{UltrastarTxtTag.CREATOR.value}:{ultrastar_class.creator}\n")
         file.write(f"#{UltrastarTxtTag.COMMENT.value}:{ultrastar_class.comment}\n")
 
diff --git a/src/modules/ffmpeg_helper.py b/src/modules/ffmpeg_helper.py
@@ -127,7 +127,7 @@ def get_audio_codec_and_extension(video_file_path: str) -> str:
             "eac3": "eac3",
         }
 
-        extension = codec_to_extension.get(codec_name, ".wav")
+        extension = codec_to_extension.get(codec_name, "wav")
         return extension
 
     except Exception:
@@ -142,8 +142,9 @@ def separate_audio_video(video_with_audio_path: str, basename_without_ext: str,
     """
     from modules.console_colors import ULTRASINGER_HEAD
 
-    # Get original video file extension
-    _, video_ext = os.path.splitext(video_with_audio_path)
+    # Get original video file extension without the dot
+    _, video_ext_with_dot = os.path.splitext(video_with_audio_path)
+    video_ext = video_ext_with_dot.lstrip('.')
 
     # Detect audio codec and get appropriate extension
     audio_ext = get_audio_codec_and_extension(video_with_audio_path)
@@ -153,14 +154,14 @@ def separate_audio_video(video_with_audio_path: str, basename_without_ext: str,
     extract_audio(video_with_audio_path, audio_file_path)
 
     print(f"{ULTRASINGER_HEAD} Creating video without audio")
-    video_only_path = os.path.join(output_folder, f"{basename_without_ext}_video{video_ext}")
+    video_only_path = os.path.join(output_folder, f"{basename_without_ext}_video.{video_ext}")
     remove_audio_from_video(video_with_audio_path, video_only_path)
 
     # Remove original video with audio
     os.remove(video_with_audio_path)
 
     # Rename video without audio to final name
-    final_video_path = os.path.join(output_folder, f"{basename_without_ext}{video_ext}")
+    final_video_path = os.path.join(output_folder, f"{basename_without_ext}.{video_ext}")
     os.rename(video_only_path, final_video_path)
 
     return audio_file_path, final_video_path, audio_ext, video_ext