Skip to content

Commit 0ecb5d2

Browse files
committed
Added a pipeline step for bulk stemming using Demucs
1 parent f1a0388 commit 0ecb5d2

File tree

8 files changed

+346
-14
lines changed

8 files changed

+346
-14
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
This application implements a pipeline that can be used to create audio datasets for the generation of stem continuations of music audio files. The code uses [Dask](https://www.dask.org/) in order to scale the dataset processing on a cluster of virtual machines in the cloud. The application is configured to run on AWS EC2 and to use S3 as storage. The audio files are encoded using Meta's [Encodec](https://github.com/facebookresearch/encodec) into a discrete, compressed, tokenized representation. Finally, the last step uploads the dataset to [ClearML](https://clear.ml) to be used for training and/or inference.
55

66
The dataset generation pipeline is comprised of several steps:
7+
- **Stem**. Separates MP3 files into drums, bass, guitar and other stems using [Demucs](https://github.com/adefossez/demucs)
78
- **Uncompress**. The application expects to find the stem files for a single music file (in .wav format) in a compressed zip archive. Each stem should have a predefined name in order to be identified as a guitar, bass, drum, etc.
89
- **Convert to ogg**. Conversion of wav files to the Ogg Opus audio format.
910
- **Merge**. Several different assortments of stems are generated.

poetry.lock

Lines changed: 229 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ torch = "^2.5.1"
4545
torchaudio = "^2.5.1"
4646
torchvision = "^0.20.1"
4747
accelerate = "^1.1.1"
48+
demucs = "^4.0.1"
4849

4950
[tool.poetry.dev-dependencies]
5051
flake8 = "^7.1.1"

src/stem_continuation_dataset_generator/constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
DASK_CLUSTER_NAME = 'stem-continuation-dataset-generator-cluster'
99

1010

11+
def get_whole_tracks_files_path():
    """Return the storage path where the unsplit whole-track audio files live."""
    return os.path.join(STORAGE_BUCKET_NAME, 'whole-tracks')
13+
14+
1115
def get_original_files_path():
    """Return the storage path where the original (per-stem) audio files live."""
    return os.path.join(STORAGE_BUCKET_NAME, 'original')
1317

src/stem_continuation_dataset_generator/pipeline.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
from stem_continuation_dataset_generator.constants import DATASET_TAGS, get_augmented_files_path, get_distorted_files_path, get_encoded_files_path, get_merged_files_path, get_original_files_path, get_split_files_path
1+
from stem_continuation_dataset_generator.constants import DATASET_TAGS, get_augmented_files_path, get_distorted_files_path, get_encoded_files_path, get_merged_files_path
2+
from stem_continuation_dataset_generator.constants import get_original_files_path, get_split_files_path, get_whole_tracks_files_path
23
from stem_continuation_dataset_generator.steps.augment import augment_all
34
from stem_continuation_dataset_generator.steps.convert_to_ogg import convert_to_ogg
45
from stem_continuation_dataset_generator.steps.encode import encode_all
56
from stem_continuation_dataset_generator.steps.merge import assort_and_merge_all
67
from stem_continuation_dataset_generator.steps.split import split_all
8+
from stem_continuation_dataset_generator.steps.stem import stem_all
79
from stem_continuation_dataset_generator.steps.uncompress import uncompress_files
810
from stem_continuation_dataset_generator.steps.upload import upload
911
from stem_continuation_dataset_generator.steps.distort import distort_all
@@ -33,6 +35,7 @@ def dataset_creation_pipeline(stem_name: str):
3335

3436
tags = DATASET_TAGS + [f'stem-{stem_name}']
3537

38+
stem_all(get_whole_tracks_files_path(), get_original_files_path())
3639
assort_and_merge_all(get_original_files_path(), get_merged_files_path(stem_name), stem_name)
3740
augment_all(get_merged_files_path(stem_name), get_augmented_files_path(stem_name))
3841
distort_all(get_augmented_files_path(stem_name), get_distorted_files_path(stem_name))

src/stem_continuation_dataset_generator/steps/merge.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
import os
44
import random
55
from typing import FrozenSet, List, Optional, Tuple, cast, Set
6-
import librosa
76
from pydub import AudioSegment
87
from dask.distributed import progress, Client
98
from s3fs.core import S3FileSystem
109

1110
from stem_continuation_dataset_generator.cluster import get_client
1211
from stem_continuation_dataset_generator.constants import DEFAULT_STEM_NAME, get_merged_files_path, get_original_files_path
1312
from stem_continuation_dataset_generator.utils.constants import get_random_seed
13+
from stem_continuation_dataset_generator.utils.utils import is_mostly_silent
1414

1515
STEM_NAMES = ['guitar', 'drum', 'bass', 'perc', 'fx', 'vocals', 'piano', 'synth', 'winds', 'strings']
1616
BASIC_STEM_NAMES = ['guitar', 'drum', 'bass', 'perc', 'gtr', 'drm', 'piano']
@@ -133,22 +133,17 @@ def create_stems_assortments(other_stems: List[StemFile], current_stem_file: str
133133
return [(current_stem_file, assortment) for assortment in assortments]
134134

135135

136-
def is_mostly_silent(fs: S3FileSystem, file_path: str) -> bool:
137-
with fs.open(file_path, 'rb') as file:
138-
139-
audio, sr = librosa.load(file) # type: ignore
140-
no_of_samples = audio.shape[-1]
141-
splits = librosa.effects.split(audio, top_db=60)
142-
non_silent_samples = sum([end - start for (start, end) in splits])
143-
return non_silent_samples / no_of_samples < MIN_PERCENTAGE_OF_AUDIO_IN_NON_SILENT_FILES
144-
145-
146136
def get_stem(file_path: str, silent: bool) -> StemFile:
    """Build a StemFile record from a path and its silence flag."""
    return StemFile(
        file_path=file_path,
        is_mostly_silent=silent,
    )
148138

149139

140+
def is_remote_file_mostly_silent(fs: S3FileSystem, file_path: str) -> bool:
    """
    Check whether an audio file stored on S3 is mostly silent.

    Opens the remote file in binary mode and delegates the silence analysis
    to the shared `is_mostly_silent` helper, using this step's minimum
    non-silent-audio threshold.

    Args:
        fs: The S3 filesystem used to open the remote file.
        file_path: Path of the audio file on S3.

    Returns:
        bool: True when the file's non-silent fraction is below the threshold.
    """
    with fs.open(file_path, 'rb') as file:
        # The stream is opened in binary mode, so BufferedReader is the
        # accurate cast here (TextIOWrapper is a text-mode wrapper).
        return is_mostly_silent(cast(io.BufferedReader, file), MIN_PERCENTAGE_OF_AUDIO_IN_NON_SILENT_FILES)
143+
144+
150145
def get_stems(fs: S3FileSystem, paths: List[str]) -> List[StemFile]:
    """Build a StemFile record for every path, probing each remote file for silence."""
    stems: List[StemFile] = []
    for path in paths:
        stems.append(get_stem(path, is_remote_file_mostly_silent(fs, path)))
    return stems
152147

153148

154149
def assort(fs: S3FileSystem, directory: str, stem_name: str) -> List[List[Tuple[str, FrozenSet[str]]]]:
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import glob
2+
import os
3+
import shlex
4+
import tempfile
5+
from typing import List, Tuple, cast
6+
7+
from distributed import Client, progress
8+
import demucs.separate
9+
from s3fs.core import S3FileSystem
10+
11+
from stem_continuation_dataset_generator.cluster import get_client
12+
from stem_continuation_dataset_generator.constants import get_original_files_path, get_whole_tracks_files_path
13+
from stem_continuation_dataset_generator.steps.convert_to_ogg import convert_to_ogg
14+
from stem_continuation_dataset_generator.utils.utils import is_mostly_silent
15+
16+
17+
RUN_LOCALLY = False
18+
PERCENTAGE_OF_NON_SILENT_AUDIO_FILE = 0.25
19+
EXCLUDED_STEMS = ['piano', 'vocals'] # Piano and vocals stems produced by Demucs are low quality
20+
21+
22+
def get_whole_track_files(fs: S3FileSystem, dir: str) -> List[str]:
    """Return the paths of all MP3 files found recursively under `dir` on S3."""
    pattern = os.path.join(dir, '**/*.mp3')
    return cast(List[str], fs.glob(pattern))
24+
25+
26+
def stem_file(output_directory: str, file_path: str) -> tuple[str, list[tuple[str, str]]]:
    """
    Separate an audio file into individual instrument stems using Demucs.

    Runs the `htdemucs_6s` six-source Demucs model on the given audio file,
    writing one .wav file per stem under `output_directory`, then collects
    the produced files.

    Args:
        output_directory (str): Directory where Demucs writes the separated stems.
        file_path (str): Path of the audio file to be separated.

    Returns:
        tuple[str, list[tuple[str, str]]]: The output directory, and a list of
        (stem name, stem file path) tuples — the stem name is the .wav file's
        basename without its extension.
    """
    # shlex.split keeps the quoted paths intact even when they contain spaces
    demucs.separate.main(shlex.split(f'-n htdemucs_6s --clip-mode clamp --out "{output_directory}" "{file_path}"'))
    stem_paths = glob.glob(os.path.join(output_directory, '**', '*.wav'), recursive=True)
    return (output_directory, [(os.path.splitext(os.path.basename(path))[0], path) for path in stem_paths])
43+
44+
45+
def stem(params: Tuple[S3FileSystem, str, str, str, str]):
    """
    Download one whole track from S3, split it into stems, convert the stems
    to Ogg, and upload the non-silent, non-excluded stems back to S3.

    Args:
        params: Tuple of (S3 filesystem, remote MP3 path, artist name,
            source directory — unused here, kept for pipeline symmetry —
            and the base output directory on S3).
    """
    fs, file_path, artist, _source_directory, base_output_directory = params

    basename = os.path.basename(file_path)
    # Strip only the extension, not any other '.mp3' occurrence in the name
    song_name = os.path.splitext(basename)[0]
    output_directory = os.path.join(base_output_directory, artist, song_name)

    with tempfile.TemporaryDirectory() as local_directory:
        local_path = os.path.join(local_directory, basename)
        fs.download(file_path, local_path)
        stem_file(local_directory, local_path)
        # Free local disk space before converting the stems
        os.remove(local_path)
        convert_to_ogg(local_directory)
        ogg_files = glob.glob(os.path.join(local_directory, '**/*.ogg'), recursive=True)
        for ogg_file in ogg_files:
            stem_name = os.path.splitext(os.path.basename(ogg_file))[0]
            if stem_name in EXCLUDED_STEMS:
                continue
            with open(ogg_file, 'rb') as file:
                # Skip stems that are almost entirely silence
                if not is_mostly_silent(file, PERCENTAGE_OF_NON_SILENT_AUDIO_FILE):
                    print(f'Uploading stem {ogg_file}')
                    fs.upload(ogg_file, os.path.join(output_directory, os.path.basename(ogg_file)))
65+
66+
67+
def stem_all(source_directory: str, output_directory: str):
    """
    Split every whole track under `source_directory` into stems, fanning the
    work out over a Dask cluster, and return the output directory.
    """
    fs = S3FileSystem()
    track_files = get_whole_track_files(fs, source_directory)

    # The artist name is the last component of each file's parent directory
    params_list: List[Tuple[S3FileSystem, str, str, str, str]] = []
    for track_file in track_files:
        artist_name = os.path.dirname(track_file).split(os.path.sep)[-1]
        params_list.append((fs, track_file, artist_name, source_directory, output_directory))

    client = cast(Client, get_client(RUN_LOCALLY))

    print('Stemming audio tracks')
    futures = client.map(stem, params_list, retries=2)
    progress(futures)

    return output_directory
87+
88+
89+
if __name__ == '__main__':
90+
stem_all(get_whole_tracks_files_path(), get_original_files_path())

src/stem_continuation_dataset_generator/utils/utils.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
import io
12
from clearml import Dataset
23
import numpy as np
4+
import librosa
5+
from typing import Union
36

47
from stem_continuation_dataset_generator.constants import CLEARML_DATASET_NAME
58
from stem_continuation_dataset_generator.utils.constants import get_clearml_project_name
@@ -40,3 +43,10 @@ def convert_audio_to_float_32(audio_data: np.ndarray) -> np.ndarray:
4043
raw_data = audio_data / max_32bit
4144
return raw_data.astype(np.float32)
4245

46+
47+
def is_mostly_silent(file: Union[io.TextIOWrapper, io.BufferedReader], percentage_non_silent: float) -> bool:
    """
    Return True when the audio read from `file` is mostly silence.

    The audio is decoded with librosa and split into non-silent intervals
    (anything quieter than 60 dB below peak counts as silence). The track is
    considered "mostly silent" when the fraction of non-silent samples falls
    below `percentage_non_silent`.

    Args:
        file: An open binary file object containing audio data.
        percentage_non_silent: Minimum fraction (0-1) of non-silent samples
            required for the file NOT to count as mostly silent.

    Returns:
        bool: True if the audio is mostly silent (or empty), False otherwise.
    """
    audio, _sr = librosa.load(file)  # type: ignore
    no_of_samples = audio.shape[-1]
    if no_of_samples == 0:
        # Guard the division below: an empty file has no non-silent audio
        return True
    splits = librosa.effects.split(audio, top_db=60)
    non_silent_samples = sum(end - start for (start, end) in splits)
    return non_silent_samples / no_of_samples < percentage_non_silent

0 commit comments

Comments
 (0)