Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
This application implements a pipeline that can be used to create audio datasets for the generation of stem continuations of music audio files. The code uses [Dask](https://www.dask.org/) in order to scale the dataset processing on a cluster of virtual machines in the cloud. The application is configured to run on AWS EC2 and to use S3 as storage. The audio files are encoded using Meta's [Encodec](https://github.com/facebookresearch/encodec) into a discrete, compressed, tokenized representation. Finally, the last step uploads the dataset to [ClearML](https://clear.ml) to be used for training and/or inference.

The dataset generation pipeline consists of several steps:
- **Stem**. Creates drums, bass, guitar and other stems starting from MP3 files using [Demucs](https://github.com/adefossez/demucs).
- **Uncompress**. The application expects to find the stem files for a single music file (in .wav format) in a compressed zip archive. Each stem should have a predefined name in order to be identified as a guitar, bass, drum, etc.
- **Convert to ogg**. Conversion of wav files to the Ogg Opus audio format.
- **Merge**. Several different assortments of stems are generated.
Expand Down
230 changes: 229 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ torch = "^2.5.1"
torchaudio = "^2.5.1"
torchvision = "^0.20.1"
accelerate = "^1.1.1"
demucs = "^4.0.1"

[tool.poetry.dev-dependencies]
flake8 = "^7.1.1"
Expand Down
4 changes: 4 additions & 0 deletions src/stem_continuation_dataset_generator/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
DASK_CLUSTER_NAME = 'stem-continuation-dataset-generator-cluster'


def get_whole_tracks_files_path():
    """Return the storage path that holds the whole (not yet stemmed) tracks.

    The path is the ``whole-tracks`` entry directly under ``STORAGE_BUCKET_NAME``.
    """
    whole_tracks_path = os.path.join(STORAGE_BUCKET_NAME, 'whole-tracks')
    return whole_tracks_path


def get_original_files_path():
    """Return the storage path that holds the original stem files.

    The path is the ``original`` entry directly under ``STORAGE_BUCKET_NAME``.
    """
    original_files_path = os.path.join(STORAGE_BUCKET_NAME, 'original')
    return original_files_path

Expand Down
5 changes: 4 additions & 1 deletion src/stem_continuation_dataset_generator/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from stem_continuation_dataset_generator.constants import DATASET_TAGS, get_augmented_files_path, get_distorted_files_path, get_encoded_files_path, get_merged_files_path, get_original_files_path, get_split_files_path
from stem_continuation_dataset_generator.constants import DATASET_TAGS, get_augmented_files_path, get_distorted_files_path, get_encoded_files_path, get_merged_files_path
from stem_continuation_dataset_generator.constants import get_original_files_path, get_split_files_path, get_whole_tracks_files_path
from stem_continuation_dataset_generator.steps.augment import augment_all
from stem_continuation_dataset_generator.steps.convert_to_ogg import convert_to_ogg
from stem_continuation_dataset_generator.steps.encode import encode_all
from stem_continuation_dataset_generator.steps.merge import assort_and_merge_all
from stem_continuation_dataset_generator.steps.split import split_all
from stem_continuation_dataset_generator.steps.stem import stem_all
from stem_continuation_dataset_generator.steps.uncompress import uncompress_files
from stem_continuation_dataset_generator.steps.upload import upload
from stem_continuation_dataset_generator.steps.distort import distort_all
Expand Down Expand Up @@ -33,6 +35,7 @@ def dataset_creation_pipeline(stem_name: str):

tags = DATASET_TAGS + [f'stem-{stem_name}']

stem_all(get_whole_tracks_files_path(), get_original_files_path())
assort_and_merge_all(get_original_files_path(), get_merged_files_path(stem_name), stem_name)
augment_all(get_merged_files_path(stem_name), get_augmented_files_path(stem_name))
distort_all(get_augmented_files_path(stem_name), get_distorted_files_path(stem_name))
Expand Down
19 changes: 7 additions & 12 deletions src/stem_continuation_dataset_generator/steps/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
import os
import random
from typing import FrozenSet, List, Optional, Tuple, cast, Set
import librosa
from pydub import AudioSegment
from dask.distributed import progress, Client
from s3fs.core import S3FileSystem

from stem_continuation_dataset_generator.cluster import get_client
from stem_continuation_dataset_generator.constants import DEFAULT_STEM_NAME, get_merged_files_path, get_original_files_path
from stem_continuation_dataset_generator.utils.constants import get_random_seed
from stem_continuation_dataset_generator.utils.utils import is_mostly_silent

STEM_NAMES = ['guitar', 'drum', 'bass', 'perc', 'fx', 'vocals', 'piano', 'synth', 'winds', 'strings']
BASIC_STEM_NAMES = ['guitar', 'drum', 'bass', 'perc', 'gtr', 'drm', 'piano']
Expand Down Expand Up @@ -133,22 +133,17 @@ def create_stems_assortments(other_stems: List[StemFile], current_stem_file: str
return [(current_stem_file, assortment) for assortment in assortments]


def is_mostly_silent(fs: S3FileSystem, file_path: str) -> bool:
    """Tell whether the remote audio file at ``file_path`` is mostly silent.

    The file is opened through ``fs``, decoded with ``librosa.load`` and split
    into non-silent intervals with ``librosa.effects.split(top_db=60)``. The
    file counts as mostly silent when the share of samples inside those
    intervals is below ``MIN_PERCENTAGE_OF_AUDIO_IN_NON_SILENT_FILES``.

    Args:
        fs: File system used to open ``file_path``.
        file_path: Path of the audio file to inspect.

    Returns:
        True when the non-silent share is below the threshold, or when the
        decoded audio contains no samples at all.
    """
    with fs.open(file_path, 'rb') as file:
        audio, sr = librosa.load(file)  # type: ignore
        no_of_samples = audio.shape[-1]
        # An empty decode has nothing audible; returning early also avoids a
        # division by zero in the ratio below.
        if no_of_samples == 0:
            return True
        splits = librosa.effects.split(audio, top_db=60)
        non_silent_samples = sum(end - start for (start, end) in splits)
        return non_silent_samples / no_of_samples < MIN_PERCENTAGE_OF_AUDIO_IN_NON_SILENT_FILES


def get_stem(file_path: str, silent: bool) -> StemFile:
    """Wrap a stem path and its silence flag into a ``StemFile`` record."""
    stem_record = StemFile(file_path=file_path, is_mostly_silent=silent)
    return stem_record


def is_remote_file_mostly_silent(fs: S3FileSystem, file_path: str):
    """Open ``file_path`` through ``fs`` and run the silence check on its content.

    The opened binary handle is passed to ``is_mostly_silent`` together with
    ``MIN_PERCENTAGE_OF_AUDIO_IN_NON_SILENT_FILES``; the handle is closed
    before the result is returned.
    """
    with fs.open(file_path, 'rb') as remote_file:
        # The cast only satisfies the type checker; the object is unchanged.
        audio_stream = cast(io.TextIOWrapper, remote_file)
        silent = is_mostly_silent(audio_stream, MIN_PERCENTAGE_OF_AUDIO_IN_NON_SILENT_FILES)
    return silent


def get_stems(fs: S3FileSystem, paths: List[str]) -> List[StemFile]:
    """Build a ``StemFile`` for every path, flagging the mostly silent ones.

    Args:
        fs: File system used to open each path for the silence check.
        paths: Paths of the stem files to inspect.

    Returns:
        One ``StemFile`` per entry of ``paths``, in the same order.
    """
    # Silence detection goes through is_remote_file_mostly_silent, which opens
    # the remote file and passes the handle on to the silence check.
    return [get_stem(path, is_remote_file_mostly_silent(fs, path)) for path in paths]


def assort(fs: S3FileSystem, directory: str, stem_name: str) -> List[List[Tuple[str, FrozenSet[str]]]]:
Expand Down
90 changes: 90 additions & 0 deletions src/stem_continuation_dataset_generator/steps/stem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import glob
import os
import shlex
import tempfile
from typing import List, Tuple, cast

from distributed import Client, progress
import demucs.separate
from s3fs.core import S3FileSystem

from stem_continuation_dataset_generator.cluster import get_client
from stem_continuation_dataset_generator.constants import get_original_files_path, get_whole_tracks_files_path
from stem_continuation_dataset_generator.steps.convert_to_ogg import convert_to_ogg
from stem_continuation_dataset_generator.utils.utils import is_mostly_silent


# Flag handed to get_client() by stem_all; presumably selects a local client
# over the remote cluster — confirm against cluster.get_client.
RUN_LOCALLY = False
# Threshold passed to is_mostly_silent: stems whose non-silent share of samples
# is below this value are not uploaded. Despite the name it is compared
# against a 0..1 ratio, not a 0..100 percentage.
PERCENTAGE_OF_NON_SILENT_AUDIO_FILE = 0.25
EXCLUDED_STEMS = ['piano', 'vocals'] # Piano and vocals stems produced by Demucs are low quality


def get_whole_track_files(fs: S3FileSystem, dir: str) -> List[str]:
    """List the ``.mp3`` files that ``fs.glob`` finds for ``<dir>/**/*.mp3``."""
    mp3_pattern = os.path.join(dir, '**/*.mp3')
    matches = fs.glob(mp3_pattern)
    # The cast only narrows the declared type of the glob result.
    return cast(List[str], matches)


def stem_file(output_directory: str, file_path: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Separate an audio file into individual tracks using the Demucs ``htdemucs_6s`` model.

    Demucs is invoked in-process with ``--clip-mode clamp`` and ``--out`` set to
    ``output_directory``. Afterwards every ``.wav`` file found recursively under
    ``output_directory`` is reported back.

    Args:
        output_directory (str): Directory handed to Demucs as its output location.
        file_path (str): The path to the audio file to be separated.

    Returns:
        tuple[str, list[tuple[str, str]]]: ``output_directory`` together with a list of
        ``(name, path)`` tuples, where ``name`` is the ``.wav`` file name without its
        extension (the instrument name) and ``path`` is the file's location.
    """
    # Hand the arguments over as a list instead of formatting them into a
    # command string and re-splitting it: paths containing quotes or
    # backslashes would otherwise be split or unescaped incorrectly.
    demucs.separate.main([
        '-n', 'htdemucs_6s',
        '--clip-mode', 'clamp',
        '--out', output_directory,
        file_path,
    ])
    wav_pattern = os.path.join(output_directory, '**', '*.wav')
    tracks = [
        (os.path.splitext(os.path.basename(wav_path))[0], wav_path)
        for wav_path in glob.glob(wav_pattern, recursive=True)
    ]
    return (output_directory, tracks)


def stem(params: Tuple[S3FileSystem, str, str, str, str]):
    """
    Stem one remote track and upload the usable stems as ogg files.

    The track is downloaded into a temporary directory, separated with
    ``stem_file``, and the results are passed through ``convert_to_ogg``. Every
    resulting ``.ogg`` file whose name (up to the first dot) is not in
    ``EXCLUDED_STEMS`` and that is not mostly silent is uploaded to
    ``<base_output_directory>/<artist>/<song name>/``.

    Args:
        params: Tuple of ``(fs, file_path, artist, source_directory,
            base_output_directory)``. ``source_directory`` is accepted for
            signature compatibility but not used here.
    """
    fs, file_path, artist, _source_directory, base_output_directory = params

    basename = os.path.basename(file_path)
    # Strip only the trailing extension; str.replace would also remove any
    # '.mp3' occurring in the middle of the song name.
    song_name = os.path.splitext(basename)[0]
    output_directory = os.path.join(base_output_directory, artist, song_name)

    with tempfile.TemporaryDirectory() as local_directory:
        local_path = os.path.join(local_directory, basename)
        fs.download(file_path, local_path)
        stem_file(local_directory, local_path)
        # Drop the downloaded track before conversion, presumably so that only
        # the separated stems remain in the directory — confirm against
        # convert_to_ogg.
        os.remove(local_path)
        convert_to_ogg(local_directory)
        for ogg_file in glob.glob(os.path.join(local_directory, '**/*.ogg'), recursive=True):
            ogg_name = os.path.basename(ogg_file)
            if ogg_name.split('.')[0] in EXCLUDED_STEMS:
                continue
            with open(ogg_file, 'rb') as file:
                silent = is_mostly_silent(file, PERCENTAGE_OF_NON_SILENT_AUDIO_FILE)
            if not silent:
                print(ogg_file)
                fs.upload(ogg_file, os.path.join(output_directory, ogg_name))


def stem_all(source_directory: str, output_directory: str):
    """
    Stem every mp3 found under ``source_directory`` and return ``output_directory``.

    Each file is paired with the name of its parent directory (used as the
    artist), and one ``stem`` task per file is submitted to the client returned
    by ``get_client(RUN_LOCALLY)`` with two retries.
    """
    fs = S3FileSystem()

    params_list: List[Tuple[S3FileSystem, str, str, str, str]] = []
    for file_path in get_whole_track_files(fs, source_directory):
        # The last component of the containing directory is the artist.
        artist = os.path.dirname(file_path).split(os.path.sep)[-1]
        params_list.append((fs, file_path, artist, source_directory, output_directory))

    client = cast(Client, get_client(RUN_LOCALLY))

    print('Stemming audio tracks')
    futures = client.map(stem, params_list, retries=2)
    progress(futures)

    return output_directory


# Script entry point: stem the whole tracks into the original-files location.
if __name__ == '__main__':
stem_all(get_whole_tracks_files_path(), get_original_files_path())
10 changes: 10 additions & 0 deletions src/stem_continuation_dataset_generator/utils/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import io
from clearml import Dataset
import numpy as np
import librosa
from typing import Union

from stem_continuation_dataset_generator.constants import CLEARML_DATASET_NAME
from stem_continuation_dataset_generator.utils.constants import get_clearml_project_name
Expand Down Expand Up @@ -40,3 +43,10 @@ def convert_audio_to_float_32(audio_data: np.ndarray) -> np.ndarray:
raw_data = audio_data / max_32bit
return raw_data.astype(np.float32)


def is_mostly_silent(file: Union[io.TextIOWrapper, io.BufferedReader], percentage_non_silent: float) -> bool:
    """Tell whether the audio readable from ``file`` is mostly silent.

    The audio is decoded with ``librosa.load`` and split into non-silent
    intervals with ``librosa.effects.split(top_db=60)``. The share of samples
    inside those intervals is then compared with ``percentage_non_silent``.

    Args:
        file: Open file object holding the audio data.
        percentage_non_silent: Minimum share of non-silent samples, expressed
            as a ratio (compared directly against ``non_silent / total``).

    Returns:
        True when the non-silent share is below ``percentage_non_silent``, or
        when the decoded audio contains no samples at all.
    """
    audio, sr = librosa.load(file)  # type: ignore
    no_of_samples = audio.shape[-1]
    # An empty decode has nothing audible; returning early also avoids a
    # division by zero in the ratio below.
    if no_of_samples == 0:
        return True
    splits = librosa.effects.split(audio, top_db=60)
    non_silent_samples = sum(end - start for (start, end) in splits)
    return non_silent_samples / no_of_samples < percentage_non_silent
Loading