Skip to content

Commit b900022

Browse files
committed
fix output
1 parent 1fd9bc6 commit b900022

File tree

13 files changed

+69
-42
lines changed

13 files changed

+69
-42
lines changed

.github/workflows/build-container-image.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
id-token: write
2020
steps:
2121
- name: Checkout
22-
uses: actions/checkout@v4
22+
uses: actions/checkout@v6
2323
- name: Inject enhanced GitHub environment variables
2424
uses: rlespinasse/github-slug-action@v5 # https://github.com/rlespinasse/github-slug-action
2525
- name: lowercase IMAGE_REGISTRY

.github/workflows/main.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,10 @@ jobs:
2727

2828
steps:
2929
- name: Checkout repository
30-
uses: actions/checkout@v3
30+
uses: actions/checkout@v6
3131

3232
- name: Set up Python 3.12
33-
uses: actions/setup-python@v4
33+
uses: actions/setup-python@v6
3434
with:
3535
python-version: ${{ matrix.python-version }}
3636

.github/workflows/pylint.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@ jobs:
1414
python-version: ["3.10"]
1515
steps:
1616
- name: Checkout code
17-
uses: actions/checkout@v3
17+
uses: actions/checkout@v6
1818

1919
- name: Set up Python ${{ matrix.python-version }}
20-
uses: actions/setup-python@v3
20+
uses: actions/setup-python@v6
2121
with:
2222
python-version: ${{ matrix.python-version }}
2323

.github/workflows/pytest.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ jobs:
1111

1212
steps:
1313
- name: Checkout code
14-
uses: actions/checkout@v3
14+
uses: actions/checkout@v6
1515

1616
- name: Set up Python ${{ matrix.python-version }}
17-
uses: actions/setup-python@v4
17+
uses: actions/setup-python@v6
1818
with:
1919
python-version: ${{ matrix.python-version }}
2020

pyproject.toml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,6 @@ dependencies = [
3030
"pyhyphen",
3131
"tqdm",
3232
"yt-dlp",
33-
"isort",
34-
"black",
35-
"pylint",
36-
"pytest",
3733
"music21",
3834
"dataclasses",
3935
"dataclasses-json",
@@ -46,10 +42,10 @@ dependencies = [
4642
windows = []
4743
linux = []
4844
macos = []
49-
dev = ["pytest"]
45+
dev = ["pytest", "isort", "black", "pylint"]
5046

5147
[dependency-groups]
52-
dev = ["pytest"]
48+
dev = ["pytest", "isort", "black", "pylint"]
5349

5450
[tool.hatch.build.targets.wheel]
5551
packages = ["src"]

src/UltraSinger.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from modules.Audio.key_detector import detect_key_from_audio, get_allowed_notes_for_key
2121
from modules.Audio.silence_processing import remove_silence_from_transcription_data, mute_no_singing_parts
2222
from modules.Audio.separation import DemucsModel
23-
from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_wav_to_mp3
23+
from modules.Audio.convert_audio import convert_audio_to_mono_wav, convert_audio_format
2424
from modules.Audio.youtube import (
2525
download_from_youtube,
2626
)
@@ -518,14 +518,14 @@ def CreateUltraStarTxt(process_data: ProcessData):
518518
if settings.create_karaoke and version.parse(settings.format_version.value) < version.parse(
519519
FormatVersion.V1_1_0.value):
520520
karaoke_output_path = os.path.join(settings.output_folder_path, process_data.basename + " [Karaoke]." + process_data.media_info.audio_extension)
521-
convert_wav_to_mp3(process_data.process_data_paths.instrumental_audio_file_path, karaoke_output_path)
521+
convert_audio_format(process_data.process_data_paths.instrumental_audio_file_path, karaoke_output_path)
522522

523523
if version.parse(settings.format_version.value) >= version.parse(FormatVersion.V1_1_0.value):
524524
instrumental_output_path = os.path.join(settings.output_folder_path,
525525
process_data.basename + " [Instrumental]." + process_data.media_info.audio_extension)
526-
convert_wav_to_mp3(process_data.process_data_paths.instrumental_audio_file_path, instrumental_output_path)
526+
convert_audio_format(process_data.process_data_paths.instrumental_audio_file_path, instrumental_output_path)
527527
vocals_output_path = os.path.join(settings.output_folder_path, process_data.basename + " [Vocals]." + process_data.media_info.audio_extension)
528-
convert_wav_to_mp3(process_data.process_data_paths.vocals_audio_file_path, vocals_output_path)
528+
convert_audio_format(process_data.process_data_paths.vocals_audio_file_path, vocals_output_path)
529529

530530
# Create Ultrastar txt
531531
if not settings.ignore_audio:

src/modules/Audio/convert_audio.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Convert audio to other formats"""
22

3-
from pydub import AudioSegment
3+
import subprocess
4+
import os
45
import librosa
56
import soundfile as sf
67

@@ -14,9 +15,27 @@ def convert_audio_to_mono_wav(input_file_path: str, output_file_path: str) -> No
1415
sf.write(output_file_path, y, sr)
1516

1617

17-
def convert_wav_to_mp3(input_file_path: str, output_file_path: str) -> None:
18-
"""Convert wav to mp3"""
19-
print(f"{ULTRASINGER_HEAD} Converting wav to mp3. -> {output_file_path}")
18+
def convert_audio_format(input_file_path: str, output_file_path: str) -> None:
19+
"""Convert audio to the format specified by the output file extension using ffmpeg"""
20+
output_format = os.path.splitext(output_file_path)[1].lstrip('.')
2021

21-
sound = AudioSegment.from_wav(input_file_path)
22-
sound.export(output_file_path, format="mp3")
22+
print(f"{ULTRASINGER_HEAD} Converting audio to {output_format}. -> {output_file_path}")
23+
# TODO: does it make sense to re-encode here? It's only used for Instrumental and Vocal
24+
# Use ffmpeg for audio conversion
25+
# -i: input file
26+
# -y: overwrite output file without asking
27+
# -loglevel error: only show errors
28+
# -q:a 0: best VBR quality for mp3 (libmp3lame); the -q:a scale is codec-specific (e.g. for ogg/libvorbis higher values mean better quality)
29+
# -codec:a copy would be fastest but only works if formats match
30+
cmd = [
31+
"ffmpeg",
32+
"-i", input_file_path,
33+
"-y",
34+
"-loglevel", "error",
35+
"-q:a", "0",
36+
output_file_path
37+
]
38+
39+
result = subprocess.run(cmd, capture_output=True, text=True)
40+
if result.returncode != 0:
41+
raise RuntimeError(f"FFmpeg audio conversion failed: {result.stderr}")

src/modules/Pitcher/pitcher.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,8 @@ def _get_detector():
1414
global _swift_f0_detector
1515
if _swift_f0_detector is None:
1616
# Initialize for general music/speech (wide frequency range) fmin=46.875, fmax=2093.75
17-
# fixme: is this correct?
1817
# For speech only: fmin=65, fmax=400
19-
_swift_f0_detector = SwiftF0(fmin=65, fmax=400, confidence_threshold=0.9)
18+
_swift_f0_detector = SwiftF0(fmin=46.875, fmax=2093.75, confidence_threshold=0.9)
2019
return _swift_f0_detector
2120

2221

@@ -34,9 +33,21 @@ def get_pitch_with_file(
3433
if len(audio.shape) > 1:
3534
audio = np.mean(audio, axis=1)
3635

37-
# Normalize audio to float if needed
38-
if audio.dtype != np.float32 and audio.dtype != np.float64:
39-
audio = audio.astype(np.float32) / (2**15)
36+
# Normalize audio to float32 based on dtype
37+
if audio.dtype == np.uint8:
38+
# uint8: range [0, 255] -> subtract 128 and divide by 128
39+
audio = (audio.astype(np.float32) - 128.0) / 128.0
40+
elif audio.dtype in [np.int16, np.int32, np.int64]:
41+
# Signed integers: use iinfo to get max value and normalize
42+
dtype_info = np.iinfo(audio.dtype)
43+
max_val = max(abs(dtype_info.min), abs(dtype_info.max))
44+
audio = audio.astype(np.float32) / float(max_val)
45+
elif audio.dtype == np.float64:
46+
# float64: cast to float32
47+
audio = audio.astype(np.float32)
48+
elif audio.dtype != np.float32:
49+
# Fallback for other types: assume int16 range
50+
audio = audio.astype(np.float32) / 32768.0
4051

4152
return get_pitch_with_swift_f0(audio, sample_rate)
4253

src/modules/Speech_Recognition/Whisper.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,6 @@ def transcribe_with_whisper(
117117
model.value, language=language, device=device, compute_type=compute_type
118118
)
119119

120-
# Restore original torch.load after models are loaded
121-
# This ensures other modules (like pitch detection) are not affected by the monkey-patch
122-
torch.load = _original_torch_load
123-
124120
audio = whisperx.load_audio(audio_path)
125121

126122
print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}")
@@ -164,6 +160,10 @@ def transcribe_with_whisper(
164160

165161
transcribed_data = convert_to_transcribed_data(result_aligned)
166162

163+
# Restore original torch.load only once all models are loaded and the transcription is done
164+
# This ensures other modules (like pitch detection) are not affected by the monkey-patch
165+
torch.load = _original_torch_load
166+
167167
return TranscriptionResult(transcribed_data, detected_language)
168168
except ValueError as value_error:
169169
# Restore original torch.load in case of error

src/modules/Ultrastar/coverter/ultrastar_txt_converter.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ def create_ultrastar_txt_from_automation(
9191
if media_info.cover_url is not None:
9292
ultrastar_txt.coverUrl = media_info.cover_url
9393
if media_info.music_key is not None:
94-
ultrastar_txt.tags = media_info.music_key
94+
# TODO: should the tag be added as a list entry here?
95+
ultrastar_txt.tags = f"key: {media_info.music_key}"
9596

9697
ultrastar_file_output_path = os.path.join(song_folder_output_path, basename + ".txt")
9798
create_ultrastar_txt(

0 commit comments

Comments
 (0)