Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions dlio_benchmark/data_generator/npz_generator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
Copyright (c) 2025 Dell Inc, or its subsidiaries.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot have company copyright notices here.

Copyright (c) 2024, UChicago Argonne, LLC
All Rights Reserved

Expand All @@ -20,11 +21,13 @@

import logging
import numpy as np
import io

from dlio_benchmark.utils.utility import progress, utcnow
from dlio_benchmark.utils.utility import Profile
from shutil import copyfile
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
from dlio_benchmark.common.enumerations import StorageType

dlp = Profile(MODULE_DATA_GENERATOR)

Expand All @@ -51,8 +54,19 @@ def generate(self):
out_path_spec = self.storage.get_uri(self._file_list[i])
progress(i+1, self.total_files_to_generate, "Generating NPZ Data")
prev_out_spec = out_path_spec
if self.compression != Compression.ZIP:
np.savez(out_path_spec, x=records, y=record_labels)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Create a new class which inherits NPZGenerator and switch on generator_factory based on storage type.

Copy link
Copy Markdown
Author

@ekaynar ekaynar Mar 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hariharan-devarajan, @zhenghh04 currently, GeneratorFactory only receives the format type (NPZ). Should I pass the storage type during the initialization, so that it can select either NPZS3Generator or NPZGenerator class? Or should I use a new format type called NPZS3?

if self._args.storage_type == StorageType.S3:
buffer = io.BytesIO()
if self.compression != Compression.ZIP:
np.savez(buffer, x=records, y=record_labels)
else:
np.savez_compressed(buffer, x=records, y=record_labels)
self.storage.put_data(out_path_spec, buffer)


else:
np.savez_compressed(out_path_spec, x=records, y=record_labels)
if self.compression != Compression.ZIP:
np.savez(out_path_spec, x=records, y=record_labels)
else:
np.savez_compressed(out_path_spec, x=records, y=record_labels)
np.random.seed()
9 changes: 9 additions & 0 deletions dlio_benchmark/reader/npz_reader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
Copyright (c) 2025 Dell Inc, or its subsidiaries.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot have company copyright notices here.

Copyright (c) 2024, UChicago Argonne, LLC
All Rights Reserved

Expand All @@ -16,9 +17,11 @@
"""
import numpy as np

import io
from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_benchmark.utils.utility import Profile
from dlio_benchmark.storage.s3_storage import S3PytorchStorage

dlp = Profile(MODULE_DATA_READER)

Expand All @@ -34,6 +37,12 @@ def __init__(self, dataset_type, thread_index, epoch):

@dlp.log
def open(self, filename):
    """Open an NPZ file and return its 'x' array.

    For S3-backed storage the whole object is fetched into memory and
    decoded from an in-memory buffer; otherwise the file is loaded from
    the local filesystem path.
    """
    if isinstance(self.storage, S3PytorchStorage):
        # NOTE(review): unlike the local path below, this branch skips the
        # super().open(filename) bookkeeping -- confirm that is intended.
        data = self.storage.get_data(filename)
        # np.load needs a seekable file-like object, hence the BytesIO wrap.
        return np.load(io.BytesIO(data), allow_pickle=True)["x"]
    super().open(filename)
    return np.load(filename, allow_pickle=True)["x"]

Expand Down
4 changes: 3 additions & 1 deletion dlio_benchmark/reader/reader_handler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
Copyright (c) 2025 Dell Inc, or its subsidiaries.
Copyright (c) 2024, UChicago Argonne, LLC
All Rights Reserved

Expand Down Expand Up @@ -44,7 +45,8 @@ def __init__(self, dataset_type, thread_index):
f"Loading {self.__class__.__qualname__} reader on thread {self.thread_index} from rank {self._args.my_rank}")
self.dataset_type = dataset_type
self.open_file_map = {}

self.storage = StorageFactory().get_storage(self._args.storage_type, self._args.storage_root,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this to new reader class' init function

self._args.framework)
if FormatReader.read_images is None:
FormatReader.read_images = 0
self.step = 1
Expand Down
84 changes: 83 additions & 1 deletion dlio_benchmark/storage/s3_storage.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
Copyright (c) 2025 Dell Inc, or its subsidiaries.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove company copyright.

Copyright (c) 2024, UChicago Argonne, LLC
All Rights Reserved

Expand All @@ -20,11 +21,92 @@
from dlio_benchmark.storage.storage_handler import DataStorage, Namespace
from dlio_benchmark.common.enumerations import NamespaceType, MetadataType
import os
import boto3
from botocore.exceptions import ClientError

from dlio_benchmark.utils.utility import Profile

dlp = Profile(MODULE_STORAGE)

class S3PytorchStorage(DataStorage):
    """
    S3-backed storage handler used with the PyTorch framework.

    Reads and writes whole objects through a boto3 S3 client. The configured
    namespace names the bucket; data ids are used as object keys.
    """

    @dlp.log_init
    def __init__(self, namespace, framework=None):
        super().__init__(framework)
        # S3 exposes a flat keyspace, so the namespace (bucket) is FLAT.
        self.namespace = Namespace(namespace, NamespaceType.FLAT)
        # Credentials/region are resolved by boto3's default provider chain.
        self.s3_client = boto3.client('s3')

    @dlp.log
    def get_uri(self, id):
        # Object keys are used verbatim as URIs.
        return id

    @dlp.log
    def create_namespace(self, exist_ok=False):
        # The S3 bucket is assumed to already exist; nothing to create.
        return True

    @dlp.log
    def get_namespace(self):
        return self.get_node(self.namespace.name)

    @dlp.log
    def create_node(self, id, exist_ok=False):
        return super().create_node(self.get_uri(id), exist_ok)

    @dlp.log
    def get_node(self, id=""):
        return super().get_node(self.get_uri(id))

    @dlp.log
    def walk_node(self, id, use_pattern=False):
        # use_pattern is accepted for interface compatibility but ignored:
        # S3 listing only supports prefix filtering.
        return self.list_objects(self.namespace.name, id)

    @dlp.log
    def put_data(self, id, data, offset=None, length=None):
        """Upload data to the bucket under key `id`.

        Accepts either an in-memory buffer (anything exposing getvalue(),
        e.g. io.BytesIO) or a raw bytes-like body. offset/length are accepted
        for interface compatibility but are not supported for S3 uploads.
        """
        body = data.getvalue() if hasattr(data, 'getvalue') else data
        # NOTE(review): put_data writes under the raw id while get_data reads
        # the relpath-normalized key -- confirm these agree for the ids used.
        self.s3_client.put_object(Bucket=self.namespace.name, Key=id, Body=body)
        return None

    @dlp.log
    def get_data(self, id, offset=None, length=None):
        """Download object `id`, or a byte range of it when both
        offset and length are given. Returns the raw bytes.
        """
        # Normalize the id to a relative key for BOTH paths (previously only
        # the unranged path used relpath while the ranged path used raw id).
        obj_name = os.path.relpath(id)
        # offset may legitimately be 0, so compare against None instead of
        # relying on truthiness (`if offset:` silently ignored offset == 0).
        if offset is not None and length is not None:
            byte_range = f"bytes={offset}-{offset + length - 1}"
            response = self.s3_client.get_object(
                Bucket=self.namespace.name, Key=obj_name, Range=byte_range)
        else:
            response = self.s3_client.get_object(
                Bucket=self.namespace.name, Key=obj_name)
        return response['Body'].read()

    @dlp.log
    def list_objects(self, bucket_name, prefix=None):
        """Return object keys in `bucket_name`, relative to `prefix`.

        Uses a paginator so buckets holding more than one page (1000 objects)
        are listed completely. Returns [] (after printing a warning) when the
        bucket does not exist.
        """
        params = {'Bucket': bucket_name}
        if prefix:
            params['Prefix'] = prefix
        # Strip "<prefix>/" from returned keys; assumes keys are laid out as
        # "<prefix>/<name>" -- TODO confirm against the generator's key layout.
        # Guarding on prefix also avoids len(None) when no prefix is given.
        strip = len(prefix) + 1 if prefix else 0
        paths = []
        try:
            paginator = self.s3_client.get_paginator('list_objects_v2')
            for page in paginator.paginate(**params):
                # 'Contents' is absent from a page with no matching objects.
                for entry in page.get('Contents', []):
                    paths.append(entry['Key'][strip:])
        except self.s3_client.exceptions.NoSuchBucket:
            print(f"Bucket '{bucket_name}' does not exist.")
        return paths

    @dlp.log
    def delete_node(self, id):
        return super().delete_node(self.get_uri(id))

    @dlp.log
    def get_basename(self, id):
        # Profiled via dlp for consistency with the sibling methods.
        return os.path.basename(id)


class S3Storage(DataStorage):
"""
Expand Down Expand Up @@ -73,4 +155,4 @@ def get_data(self, id, data, offset=None, length=None):
return super().get_data(self.get_uri(id), data, offset, length)

def get_basename(self, id):
return os.path.basename(id)
return os.path.basename(id)
8 changes: 7 additions & 1 deletion dlio_benchmark/storage/storage_factory.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""
Copyright (c) 2025 Dell Inc, or its subsidiaries.
Copyright (c) 2024, UChicago Argonne, LLC
All Rights Reserved

Expand All @@ -18,6 +19,8 @@
from dlio_benchmark.storage.s3_storage import S3Storage
from dlio_benchmark.common.enumerations import StorageType
from dlio_benchmark.common.error_code import ErrorCodes
from dlio_benchmark.common.enumerations import FrameworkType
from dlio_benchmark.storage.s3_storage import S3PytorchStorage

class StorageFactory(object):
def __init__(self):
Expand All @@ -28,6 +31,9 @@ def get_storage(storage_type, namespace, framework=None):
if storage_type == StorageType.LOCAL_FS:
return FileStorage(namespace, framework)
elif storage_type == StorageType.S3:
return S3Storage(namespace, framework)
if framework == FrameworkType.PYTORCH:
return S3PytorchStorage(namespace, framework)
else:
return S3Storage(namespace, framework)
else:
raise Exception(str(ErrorCodes.EC1001))
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ tensorflow>=2.11.0
torch>=2.2.0
torchaudio
torchvision
boto3
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"pandas>=1.5.1",
"psutil>=5.9.8",
"pydftracer==1.0.8",
"boto3",
]
x86_deps = [
f"hydra-core>={HYDRA_VERSION}",
Expand Down
Loading