
Commit 61b8427 (1 parent: 6894384)

- Added retries to dask steps
- Updated README with development information

9 files changed: +89 -103 lines

README.md

Lines changed: 20 additions & 0 deletions
@@ -65,3 +65,23 @@ poetry run python -m stem_continuation_dataset_generator.process <STEM_NAME>
 ```
 
 The pipeline will augment, distort, encode and split the samples into chunks, generating three different folders for the train, validation and test sets. The result will be uploaded to ClearML into 3 different datasets.
+
+### Development
+
+Download the repository and install the package:
+
+```sh
+git clone https://github.com/energydrink9/stem_continuation_dataset_generator.git
+cd stem_continuation_dataset_generator
+poetry install
+```
+
+Once you've downloaded the repository and installed the package, please run the following command to setup the pre-commit hooks:
+```sh
+pre-commit install
+```
+
+Please run the tests before submitting a PR:
+```sh
+pytest
+```

entrypoint.sh

Lines changed: 0 additions & 20 deletions
This file was deleted.

src/stem_continuation_dataset_generator/cluster.py

Lines changed: 4 additions & 1 deletion
@@ -3,6 +3,8 @@
 import dask.config
 from dask.distributed import Client, LocalCluster
 
+from stem_continuation_dataset_generator.constants import DASK_CLUSTER_NAME
+
 NUM_WORKERS = [4, 50]
 BUCKET = 's3://stem-continuation-dataset'
 
@@ -16,10 +18,11 @@ def get_client(
     dask.config.set({'distributed.scheduler.allowed-failures': 12})
 
     if run_locally is True:
-        cluster = LocalCluster(n_workers=2, threads_per_worker=1, **kwargs)
+        cluster = LocalCluster(n_workers=2, threads_per_worker=1)
 
     else:
         cluster = coiled.Cluster(
+            name=DASK_CLUSTER_NAME,
            n_workers=n_workers,
             package_sync_conda_extras=['portaudio', 'ffmpeg'],
             idle_timeout="5 minutes",
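Naming the Coiled cluster lets successive pipeline steps reattach to an already-running cluster instead of provisioning a fresh one each time. A minimal sketch of that intent, assuming the `get_client(run_locally, n_workers, ...)` shape implied by the diff; `get_client_sketch` and the inlined literals are illustrative, not the repo's exact code:

```python
from dask.distributed import Client, LocalCluster


def get_client_sketch(run_locally: bool) -> Client:
    if run_locally:
        # Small local cluster for development runs
        cluster = LocalCluster(n_workers=2, threads_per_worker=1)
        return Client(cluster)

    import coiled  # only needed on the remote path

    # Passing the same name reattaches to a running Coiled cluster
    cluster = coiled.Cluster(
        name='stem-continuation-dataset-generator-cluster',  # DASK_CLUSTER_NAME
        n_workers=[4, 50],         # NUM_WORKERS adaptive range from this file
        idle_timeout='5 minutes',  # shut down when idle, as in the diff
    )
    return Client(cluster)
```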

src/stem_continuation_dataset_generator/codec.py

Lines changed: 5 additions & 5 deletions
@@ -11,7 +11,7 @@
 
 from stem_continuation_dataset_generator.utils.device import Device
 
-ENCODER_BATCH_SIZE = 32
+ENCODER_BATCH_SIZE = 1
 ENCODED_TOKENS_PER_CHUNK = 512 # large values (over 1024) require a large amount of memory and can produce OOM errors
 
 
@@ -28,10 +28,10 @@ def get_processor(device: Device):
     return AutoProcessor.from_pretrained("facebook/encodec_32khz", device_map=device)
 
 
-def encode_file(audio_path: Union[BinaryIO, str, PathLike], device: Device, format: Optional[str] = None) -> Tuple[Tensor, float]:
+def encode_file(audio_path: Union[BinaryIO, str, PathLike], device: Device, format: Optional[str] = None, batch_size: int = ENCODER_BATCH_SIZE) -> Tuple[Tensor, float]:
     # Load and pre-process the audio waveform
     wav, sr = torchaudio.load(audio_path, format=format, normalize=False) # Normalization is later performed using librosa as it seems to work better
-    return encode(wav, sr, device)
+    return encode(wav, sr, device, batch_size=batch_size)
 
 
 def get_total_chunks(samples_per_chunk: int, num_samples: int) -> int:
@@ -57,7 +57,7 @@ def chunk_list(lst, n: int):
         yield lst[i:i + n]
 
 
-def encode(audio: Tensor, sr: int, device: Device) -> Tuple[Tensor, float]:
+def encode(audio: Tensor, sr: int, device: Device, batch_size: int = ENCODER_BATCH_SIZE) -> Tuple[Tensor, float]:
 
     device = device if not device.startswith('mps') else 'cpu' # Encoding is not supported on MPS
     processor = get_processor(device)
@@ -85,7 +85,7 @@ def encode(audio: Tensor, sr: int, device: Device) -> Tuple[Tensor, float]:
     encoded_chunks = []
 
     # create audio chunks
-    batches: List[List[Tensor]] = list(chunk_list(chunks, ENCODER_BATCH_SIZE))
+    batches: List[List[Tensor]] = list(chunk_list(chunks, batch_size))
 
     for batch in batches:
         inputs = processor(raw_audio=batch, sampling_rate=processor.sampling_rate, return_tensors="pt")
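With this change the default encoder batch size drops from 32 to 1, but the new `batch_size` parameter threads through `encode_file` into `encode`, so callers can still raise it per call (the encode step below passes `batch_size=2`). A quick, runnable check of the batching helper; `chunk_list`'s `yield` line comes from the diff's context, the `for` line is inferred, and plain integers stand in for chunk tensors:

```python
def chunk_list(lst, n: int):
    # As in codec.py: yield successive n-sized slices of lst
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


chunks = list(range(10))               # stand-ins for audio chunk tensors
batches = list(chunk_list(chunks, 4))  # batch_size=4 for illustration
assert batches == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]  # last batch is partial
```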

src/stem_continuation_dataset_generator/constants.py

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 CLEARML_DATASET_TRAINING_VERSION = '1.0.0'
 DEFAULT_STEM_NAME = 'drum'
 STORAGE_BUCKET_NAME = 'stem-continuation-dataset'
+DASK_CLUSTER_NAME = 'stem-continuation-dataset-generator-cluster'
 
 
 def get_original_files_path():

src/stem_continuation_dataset_generator/steps/augment.py

Lines changed: 32 additions & 38 deletions
@@ -1,6 +1,5 @@
 import io
 import os
-import traceback
 from typing import Any, List, Tuple, cast
 from dask.distributed import Client
 from distributed import progress
@@ -69,47 +68,42 @@ def augment(params: Tuple[S3FileSystem, str, str, str]) -> None:
 
     fs, file_path, source_directory, output_directory = params
 
-    try:
-        file_dir = os.path.dirname(file_path)
-        stem_file_path = os.path.join(file_dir, 'stem.ogg')
+    file_dir = os.path.dirname(file_path)
+    stem_file_path = os.path.join(file_dir, 'stem.ogg')
+    file_dir = os.path.dirname(file_path)
+    relative_path = os.path.relpath(file_dir, source_directory)
+    output_file_path = os.path.join(output_directory, relative_path + '-original')
+
+    full_track_output_file_path = os.path.join(output_file_path, os.path.basename(file_path))
+
+    if not fs.exists(full_track_output_file_path):
+        fs.makedirs(os.path.dirname(full_track_output_file_path), exist_ok=True)
+        if fs.exists(file_path):
+            fs.copy(file_path, full_track_output_file_path)
+
+    stem_output_file_path = os.path.join(output_file_path, os.path.basename(stem_file_path))
+
+    if not fs.exists(stem_output_file_path):
+        fs.makedirs(os.path.dirname(full_track_output_file_path), exist_ok=True)
+        if fs.exists(stem_file_path):
+            fs.copy(stem_file_path, stem_output_file_path)
+
+    for i in range(AUGMENTATIONS_COUNT):
         file_dir = os.path.dirname(file_path)
         relative_path = os.path.relpath(file_dir, source_directory)
-        output_file_path = os.path.join(output_directory, relative_path + '-original')
-
+        output_file_path = os.path.join(output_directory, relative_path + f'-augmented{i}')
         full_track_output_file_path = os.path.join(output_file_path, os.path.basename(file_path))
-
-        if not fs.exists(full_track_output_file_path):
-            fs.makedirs(os.path.dirname(full_track_output_file_path), exist_ok=True)
-            if fs.exists(file_path):
-                fs.copy(file_path, full_track_output_file_path)
-
         stem_output_file_path = os.path.join(output_file_path, os.path.basename(stem_file_path))
 
-        if not fs.exists(stem_output_file_path):
-            fs.makedirs(os.path.dirname(full_track_output_file_path), exist_ok=True)
-            if fs.exists(stem_file_path):
-                fs.copy(stem_file_path, stem_output_file_path)
-
-        for i in range(AUGMENTATIONS_COUNT):
-            file_dir = os.path.dirname(file_path)
-            relative_path = os.path.relpath(file_dir, source_directory)
-            output_file_path = os.path.join(output_directory, relative_path + f'-augmented{i}')
-            full_track_output_file_path = os.path.join(output_file_path, os.path.basename(file_path))
-            stem_output_file_path = os.path.join(output_file_path, os.path.basename(stem_file_path))
-
-            if not fs.exists(full_track_output_file_path) or not fs.exists(stem_output_file_path):
-                fs.makedirs(output_file_path, exist_ok=True)
-                augment_pitch_and_tempo(
-                    fs,
-                    [
-                        (file_path, full_track_output_file_path),
-                        (stem_file_path, stem_output_file_path)
-                    ]
-                )
-
-    except Exception as e:
-        print(f'Error augmenting file {file_path}: {e}')
-        print(traceback.format_exc())
+        if not fs.exists(full_track_output_file_path) or not fs.exists(stem_output_file_path):
+            fs.makedirs(output_file_path, exist_ok=True)
+            augment_pitch_and_tempo(
+                fs,
+                [
+                    (file_path, full_track_output_file_path),
+                    (stem_file_path, stem_output_file_path)
+                ]
+            )
 
 
 def augment_all(source_directory: str, output_directory: str):
@@ -127,7 +121,7 @@ def augment_all(source_directory: str, output_directory: str):
     params_list: List[Tuple[S3FileSystem, str, str, str]] = [(fs, file_path, source_directory, output_directory) for file_path in files]
 
     print('Augmenting audio tracks')
-    futures = client.map(augment, params_list)
+    futures = client.map(augment, params_list, retries=2)
     progress(futures)
 
     return output_directory
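The same pattern repeats in the distort and encode steps below: the per-task try/except that printed and swallowed errors is removed, and `retries=2` on `client.map` lets the Dask scheduler resubmit a failed task, so only tasks that fail three attempts surface as errors. A minimal local illustration of the behavior, not the repo's code; `flaky` simulates a transient S3 or worker failure:

```python
import random

from dask.distributed import Client, LocalCluster


def flaky(x: int) -> int:
    # Fails about half the time, standing in for a transient I/O error
    if random.random() < 0.5:
        raise IOError('transient failure')
    return x * 2


if __name__ == '__main__':
    client = Client(LocalCluster(n_workers=2, threads_per_worker=1))
    # Each failing task is retried up to twice; an exception propagates
    # through gather() only if a task fails all three attempts.
    futures = client.map(flaky, range(8), retries=2)
    print(client.gather(futures))
    client.close()
```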

src/stem_continuation_dataset_generator/steps/distort.py

Lines changed: 12 additions & 18 deletions
@@ -1,6 +1,5 @@
 import io
 import os
-import traceback
 from typing import List, Tuple, cast
 from fsspec import AbstractFileSystem
 import numpy as np
@@ -78,24 +77,19 @@ def distort(params: Tuple[S3FileSystem, Tuple[str, str], str, str]) -> None:
 
     fs, (full_track_file_path, stem_file_path), source_directory, output_directory = params
 
-    try:
-        file_dir = os.path.dirname(full_track_file_path)
-        full_track_relative_path = os.path.relpath(file_dir, source_directory)
-        actual_output_dir = os.path.join(output_directory, full_track_relative_path)
-        fs.makedirs(actual_output_dir, exist_ok=True)
-        full_track_output_file_path = os.path.join(actual_output_dir, os.path.basename(full_track_file_path))
+    file_dir = os.path.dirname(full_track_file_path)
+    full_track_relative_path = os.path.relpath(file_dir, source_directory)
+    actual_output_dir = os.path.join(output_directory, full_track_relative_path)
+    fs.makedirs(actual_output_dir, exist_ok=True)
+    full_track_output_file_path = os.path.join(actual_output_dir, os.path.basename(full_track_file_path))
 
-        if not fs.exists(full_track_output_file_path):
-            distort_file(fs, full_track_file_path, full_track_output_file_path)
+    if not fs.exists(full_track_output_file_path):
+        distort_file(fs, full_track_file_path, full_track_output_file_path)
 
-        stem_relative_path = os.path.relpath(stem_file_path, source_directory)
-        stem_output_file_path = os.path.join(output_directory, stem_relative_path)
-        if not fs.exists(stem_output_file_path):
-            fs.copy(stem_file_path, stem_output_file_path)
-
-    except Exception as e:
-        print(f'Error processing {full_track_file_path} or {stem_file_path}: {e}')
-        print(traceback.format_exc())
+    stem_relative_path = os.path.relpath(stem_file_path, source_directory)
+    stem_output_file_path = os.path.join(output_directory, stem_relative_path)
+    if not fs.exists(stem_output_file_path):
+        fs.copy(stem_file_path, stem_output_file_path)
 
 
 def distort_all(source_directory: str, output_directory: str):
@@ -110,7 +104,7 @@ def distort_all(source_directory: str, output_directory: str):
     ))
 
     print('Distorting audio tracks')
-    futures = client.map(distort, params_list)
+    futures = client.map(distort, params_list, retries=2)
     progress(futures)
 
     return output_directory

src/stem_continuation_dataset_generator/steps/encode.py

Lines changed: 14 additions & 20 deletions
@@ -1,6 +1,5 @@
 import os
 import pickle
-import traceback
 from typing import List, Tuple, cast
 from distributed import Client, progress
 from s3fs.core import S3FileSystem
@@ -22,26 +21,22 @@ def encode(params: Tuple[S3FileSystem, str, str, str]):
     fs, file_path, source_directory, output_directory = params
     device = get_device()
 
-    try:
-        file_dir = os.path.dirname(file_path)
-        relative_path = os.path.relpath(file_dir, source_directory)
-        file_output_directory = os.path.join(output_directory, relative_path)
-        fs.makedirs(file_output_directory, exist_ok=True)
+    file_dir = os.path.dirname(file_path)
+    relative_path = os.path.relpath(file_dir, source_directory)
+    file_output_directory = os.path.join(output_directory, relative_path)
 
-        output_filename = os.path.basename(file_path)
-        output_file_path = os.path.join(file_output_directory, output_filename)
+    output_filename = os.path.basename(file_path).split('.')[0] + '.pkl'
+    output_file_path = os.path.join(file_output_directory, output_filename)
 
-        if not fs.exists(output_file_path):
-            with fs.open(file_path, 'rb') as file:
-                encoded_audio, frame_rate = encode_file(file, device)
+    if not fs.exists(output_file_path):
+        with fs.open(file_path, 'rb') as file:
+            encoded_audio, frame_rate = encode_file(file, device, batch_size=2)
 
-            if not fs.exists(output_file_path):
-                with fs.open(output_file_path, 'wb') as output_file:
-                    pickle.dump(encoded_audio.detach().to('cpu'), output_file)
-
-    except Exception:
-        print(f'Error while encoding file {file_path}')
-        print(traceback.format_exc())
+        fs.makedirs(file_output_directory, exist_ok=True)
+        with fs.open(output_file_path, 'wb') as output_file:
+            pickle.dump(encoded_audio.detach().to('cpu'), output_file)
+    else:
+        print(f'path {output_file_path} already exists')
 
 
 def encode_all(source_directory: str, output_directory: str):
@@ -53,7 +48,6 @@ def encode_all(source_directory: str, output_directory: str):
     client = cast(Client, get_client(
         RUN_LOCALLY,
         n_workers=[1, 1],
-        # worker_vm_types=['c6a.xlarge'],
         worker_vm_types=['g4dn.xlarge'],
         scheduler_vm_types=['t3.medium'],
         spot_policy='spot',
@@ -66,7 +60,7 @@
     # print(f'Processing {i} of {len(params_list)} {round(cast(float, i) / len(params_list) * 100)}')
     # encode(params_list[i])
 
-    futures = client.map(encode, params_list)
+    futures = client.map(encode, params_list, retries=2, batch_size=8)
     progress(futures)
 
     return output_directory
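Note the two unrelated batch sizes in this step: `batch_size=2` passed to `encode_file` is the encoder batch (how many audio chunks go through EnCodec per forward pass), while `batch_size=8` on `client.map` controls how many task submissions are sent to the Dask scheduler per round trip. A hedged sketch of the submission call; `submit_encode_tasks` is an illustrative wrapper, while `retries` and `batch_size` are real `Client.map` keywords:

```python
from typing import Iterable, List

from dask.distributed import Client, Future, progress


def submit_encode_tasks(client: Client, encode_fn, params_list: Iterable) -> List[Future]:
    # retries=2: resubmit a failed task up to two more times.
    # batch_size=8: group submissions to the scheduler, reducing overhead
    # when params_list is long.
    futures = client.map(encode_fn, params_list, retries=2, batch_size=8)
    progress(futures)
    return futures
```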

src/stem_continuation_dataset_generator/steps/merge.py

Lines changed: 1 addition & 1 deletion
@@ -218,7 +218,7 @@ def assort_and_merge_all(source_directory: str, output_directory: str, stem_name
     params_list: List[Tuple[S3FileSystem, str, str, str, str]] = [(fs, source_directory, output_directory, directory, stem_name) for directory in dirs]
 
     print('Assorting and merging audio tracks')
-    progress(client.map(assort_directory, params_list))
+    progress(client.map(assort_directory, params_list, retries=2))
 
     return output_directory
