-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Open
Description
Describe the bug
I'm trying to run a fairly simple map that converts a dataset into NumPy arrays. However, it just piles up in memory and doesn't write to disk. I've tried multiple cache techniques, such as specifying the cache dir, setting the max memory, and more, but none seem to work. What am I missing here?
Steps to reproduce the bug
# Reproduction-script setup: imports, cache location, root-logger configuration.
from pydub import AudioSegment
import io
import base64
import numpy as np
import os
CACHE_PATH = "/mnt/extdisk/cache" # alternative: "/root/.cache/huggingface/"
# NOTE(review): HF_HOME is assigned before `import datasets`, presumably so the
# library picks up the custom cache location when it is imported — confirm.
os.environ["HF_HOME"] = CACHE_PATH
import datasets
import logging
# Configure the root logger so INFO-level (and above) records are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Create a handler for Jupyter notebook
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
# Disabled experiment: capping the in-memory dataset size.
#datasets.config.IN_MEMORY_MAX_SIZE= 1000#*(2**30) #50 gb
# Print the cache locations `datasets` reports, to check the override above.
print(datasets.config.HF_CACHE_HOME)
print(datasets.config.HF_DATASETS_CACHE)
# Decode the base64 string into bytes
def convert_mp3_to_audio_segment(example):
    """Decode a base64-encoded MP3 and return its samples resampled to 24 kHz.

    Usage sketch: ``convert_mp3_to_audio_segment(ds['train'][0])``.

    Args:
        example: Mapping with an ``'audio'`` entry holding the base64-encoded
            MP3 data and an ``'id'`` entry used in the failure log message.

    Returns:
        A dict with two keys:

        * ``'audio'``: ``{'sampling_rate': ..., 'array': np.ndarray}`` where
          the array holds the decoded samples as floats.
        * ``'duration'``: number of samples in the array divided by the
          sampling rate.

        If decoding or resampling fails, a warning is logged and a placeholder
        row is returned (sampling rate 0, empty array, duration 0.0) so that a
        single bad example does not abort the whole ``map``.
    """
    try:
        audio_data_bytes = base64.b64decode(example['audio'])
        # Use pydub to load the MP3 audio from the decoded bytes
        audio_segment = AudioSegment.from_file(
            io.BytesIO(audio_data_bytes), format="mp3"
        )
        # Resample to 24_000
        audio_segment = audio_segment.set_frame_rate(24_000)
        audio = {
            'sampling_rate': audio_segment.frame_rate,
            'array': np.array(
                audio_segment.get_array_of_samples(), dtype="float"
            ),
        }
        # Drop the pydub object as soon as the samples have been copied out.
        del audio_segment
        duration = len(audio['array']) / audio['sampling_rate']
    except Exception as e:
        logger.warning(
            "Failed to convert audio for %s. Error: %s", example['id'], e
        )
        # The original put `duration : 0` inside this dict, which used the
        # unbound name `duration` as a key and left `duration` unassigned for
        # the return below; the placeholder is now built explicitly.
        audio = {'sampling_rate': 0, 'array': np.array([])}
        # Float, to match the type produced on the success path.
        duration = 0.0
    return {'audio': audio, 'duration': duration}
# Load the dataset, asking for it to be cached under CACHE_PATH rather than
# kept in memory.
ds = datasets.load_dataset(
    "NbAiLab/nb_distil_speech_noconcat_stortinget",
    cache_dir=CACHE_PATH,
    keep_in_memory=False,
)
#%%
num_proc = 32
# Convert every example using `num_proc` worker processes.
# Variants tried while debugging (left disabled):
#   ds.select(range(10)) before mapping
#   cache_file_name=f"{CACHE_PATH}/stortinget_audio"
#   cache_file_name="test"
# NOTE(review): `map` may buffer a batch of rows per worker before flushing
# (see `writer_batch_size` in the datasets docs); with large decoded arrays
# and 32 workers that could account for the memory growth — confirm.
ds_processed = ds.map(
    convert_mp3_to_audio_segment,
    num_proc=num_proc,
    desc="Converting mp3 to audio segment",
)
Expected behavior
The map should write its results to disk instead of accumulating them in memory.
Environment info
- `datasets` version: 3.2.0
- Platform: Linux-6.8.0-45-generic-x86_64-with-glibc2.39
- Python version: 3.12.7
- `huggingface_hub` version: 0.26.3
- PyArrow version: 18.1.0
- Pandas version: 2.2.3
- `fsspec` version: 2024.9.0
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels