Skip to content

Commit 0a6130a

Browse files
added checkpointing to support LLMs (#114)
* added checkpointing to support LLMs * added indexed binary data support for LLMs. * added configuration for megatron deepspeed. * fixes for out of core data generation * fixes for out of core data generation * fixes for out of core data generation * added dlrm configuration * added changes to support mmapped file. * added changes to support mmapped file. * added changes to support mmapped file. * added changes to support mmapped file. * added changes to support mmapped file. * fixed checkpointing for tensors * Update torch_framework.py Fix rank for merge bug. * Update indexed_binary_generator.py Change GB to a abs value. * Update megatron_deepspeed.yaml * refactor enum for better naming * documentation for the checkpointing. * make data generation buffer_size configurable. * Update tf_framework.py Args model size * Update tf_framework.py * Update megatron_deepspeed.yaml * Update megatron_deepspeed.yaml * make data generation buffer_size configurable.
1 parent 0720984 commit 0a6130a

File tree

17 files changed

+647
-74
lines changed

17 files changed

+647
-74
lines changed

.github/workflows/python-package-conda.yml

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ jobs:
8181
mpirun -np 2 pytest -k test_gen_data[jpeg-tensorflow] -v
8282
mpirun -np 2 pytest -k test_gen_data[tfrecord-tensorflow] -v
8383
mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v
84+
mpirun -np 2 pytest -k test_gen_data[indexed_binary-tensorflow] -v
85+
mpirun -np 2 pytest -k test_gen_data[mmap_indexed_binary-tensorflow] -v
8486
- name: test_custom_storage_root_gen_data
8587
run: |
8688
source ${VENV}/bin/activate
@@ -89,6 +91,8 @@ jobs:
8991
mpirun -np 2 pytest -k test_storage_root_gen_data[jpeg-tensorflow] -v
9092
mpirun -np 2 pytest -k test_storage_root_gen_data[tfrecord-tensorflow] -v
9193
mpirun -np 2 pytest -k test_storage_root_gen_data[hdf5-tensorflow] -v
94+
mpirun -np 2 pytest -k test_storage_root_gen_data[indexed_binary-tensorflow] -v
95+
mpirun -np 2 pytest -k test_storage_root_gen_data[mmap_indexed_binary-tensorflow] -v
9296
- name: test_train
9397
run: |
9498
source ${VENV}/bin/activate
@@ -113,6 +117,14 @@ jobs:
113117
mpirun -np 2 pytest -k test_train[jpeg-pytorch-dali] -v
114118
mpirun -np 2 pytest -k test_train[hdf5-pytorch-dali] -v
115119
mpirun -np 2 pytest -k test_train[csv-pytorch-dali] -v
120+
mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-tensorflow] -v
121+
mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-pytorch] -v
122+
mpirun -np 2 pytest -k test_train[indexed_binary-tensorflow-dali] -v
123+
mpirun -np 2 pytest -k test_train[indexed_binary-pytorch-dali] -v
124+
mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-tensorflow] -v
125+
mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-pytorch] -v
126+
mpirun -np 2 pytest -k test_train[mmap_indexed_binary-tensorflow-dali] -v
127+
mpirun -np 2 pytest -k test_train[mmap_indexed_binary-pytorch-dali] -v
116128
- name: test_custom_storage_root_train
117129
run: |
118130
source ${VENV}/bin/activate
@@ -127,10 +139,19 @@ jobs:
127139
mpirun -np 2 pytest -k test_custom_storage_root_train[jpeg-pytorch] -v
128140
mpirun -np 2 pytest -k test_custom_storage_root_train[hdf5-pytorch] -v
129141
mpirun -np 2 pytest -k test_custom_storage_root_train[csv-pytorch] -v
142+
mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-tensorflow] -v
143+
mpirun -np 2 pytest -k test_custom_storage_root_train[indexed_binary-pytorch] -v
144+
mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-tensorflow] -v
145+
mpirun -np 2 pytest -k test_custom_storage_root_train[mmap_indexed_binary-pytorch] -v
130146
- name: test_checkpoint_epoch
131147
run: |
132148
source ${VENV}/bin/activate
133-
mpirun -np 2 pytest -k test_checkpoint_epoch -v
149+
mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers0-2-layer_params0-all_ranks] -v
150+
mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers1-2-layer_params1-all_ranks] -v
151+
mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers2-2-layer_params2-rank_zero] -v
152+
mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers3-2-layer_params3-rank_zero] -v
153+
mpirun -np 2 pytest -k test_checkpoint_epoch[tensorflow-1024-optimizers4-1-layer_params4-all_ranks] -v
154+
mpirun -np 2 pytest -k test_checkpoint_epoch[pytorch-1024-optimizers5-1-layer_params5-all_ranks] -v
134155
- name: test_checkpoint_step
135156
run: |
136157
source ${VENV}/bin/activate

dlio_benchmark/common/enumerations.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,16 @@
1717

1818
from enum import Enum
1919

20+
class CheckpointLocationType(Enum):
    """
    Different types of checkpoint location (rank zero or all ranks)
    """
    RANK_ZERO = 'rank_zero'
    ALL_RANKS = 'all_ranks'

    def __str__(self):
        # Render the member as its configuration string, e.g. 'all_ranks'.
        return self.value
29+
2030
class StorageType(Enum):
2131
"""
2232
Different types of underlying storage
@@ -97,6 +107,8 @@ class FormatType(Enum):
97107
HDF5_OPT = 'hdf5_opt'
98108
JPEG = 'jpeg'
99109
PNG = 'png'
110+
INDEXED_BINARY = 'indexed_binary'
111+
MMAP_INDEXED_BINARY = 'mmap_indexed_binary'
100112

101113
def __str__(self):
102114
return self.value
@@ -119,6 +131,10 @@ def get_enum(value):
119131
return FormatType.JPEG
120132
elif FormatType.PNG.value == value:
121133
return FormatType.PNG
134+
elif FormatType.INDEXED_BINARY.value == value:
135+
return FormatType.INDEXED_BINARY
136+
elif FormatType.MMAP_INDEXED_BINARY.value == value:
137+
return FormatType.MMAP_INDEXED_BINARY
122138

123139
class DataLoaderType(Enum):
124140
"""
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
model: dlrm
2+
3+
framework: pytorch
4+
5+
workflow:
6+
generate_data: False
7+
train: True
8+
do_eval: True
9+
10+
dataset:
11+
data_folder: data/dlrm
12+
format: indexed_binary
13+
num_files_train: 1
14+
num_files_eval: 1
15+
num_samples_per_file: 4195198976
16+
record_length: 327680
17+
keep_files: True
18+
eval_num_samples_per_file: 91681240
19+
20+
reader:
21+
data_loader: pytorch
22+
batch_size: 2048
23+
batch_size_eval: 16384
24+
sample_shuffle: random
25+
26+
train:
27+
epochs: 1
28+
computation_time: 0.064296
29+
total_training_steps: 32768
30+
total_eval_steps: 2048
31+
32+
evaluation:
33+
eval_time: 0.0843
34+
steps_between_evals: 16384
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# 8 node run with 4 GPUs per node and TPSIZE=4 and PPSIZE=8
2+
model: megatron_deepspeed
3+
4+
framework: pytorch
5+
6+
workflow:
7+
generate_data: False
8+
train: True
9+
checkpoint: True
10+
11+
dataset:
12+
data_folder: dataset/megatron-deepspeed/
13+
format: mmap_indexed_binary
14+
num_files_train: 1
15+
num_samples_per_file: 277203535
16+
record_length: 2048
17+
18+
reader:
19+
data_loader: pytorch
20+
batch_size: 1024
21+
read_threads: 1
22+
file_shuffle: seed
23+
sample_shuffle: seed
24+
25+
train:
26+
epochs: 311541
27+
computation_time: 0.03 # every iteration has 290 steps and each iteration is 8.9 sec.
28+
29+
checkpoint:
30+
checkpoint_folder: checkpoints/megatron-deepspeed
31+
steps_between_checkpoints: 1000
32+
model_size: 30102
33+
type: all_ranks
34+
optimization_groups: [1009254400, 865075200, 793600]
35+
num_layers: 44
36+
layer_parameters: [129761280, 20971520]

dlio_benchmark/data_generator/generator_factory.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,5 +47,8 @@ def get_generator(type):
4747
elif type == FormatType.PNG:
4848
from dlio_benchmark.data_generator.png_generator import PNGGenerator
4949
return PNGGenerator()
50+
elif type == FormatType.INDEXED_BINARY or type == FormatType.MMAP_INDEXED_BINARY:
51+
from dlio_benchmark.data_generator.indexed_binary_generator import IndexedBinaryGenerator
52+
return IndexedBinaryGenerator()
5053
else:
5154
raise Exception(str(ErrorCodes.EC1001))

dlio_benchmark/data_generator/hdf5_generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def generate(self):
4444
"""
4545
super().generate()
4646
np.random.seed(10)
47-
samples_per_iter=max(1, int(32*1024*1024/self._args.record_length))
47+
samples_per_iter=max(1, int(self._args.generation_buffer_size/self._args.record_length))
4848
record_labels = [0] * self.num_samples
4949
for i in dlp.iter(range(self.my_rank, int(self.total_files_to_generate), self.comm_size)):
5050
progress(i, self.total_files_to_generate, "Generating HDF5 Data")
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""
2+
Copyright (c) 2022, UChicago Argonne, LLC
3+
All Rights Reserved
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
"""
17+
18+
from dlio_benchmark.common.enumerations import Compression
19+
from dlio_benchmark.data_generator.data_generator import DataGenerator
20+
21+
import logging
22+
import numpy as np
23+
24+
from dlio_benchmark.utils.utility import progress, utcnow
25+
from dlio_profiler.logger import fn_interceptor as Profile
26+
from shutil import copyfile
27+
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
28+
import struct
29+
30+
dlp = Profile(MODULE_DATA_GENERATOR)
31+
32+
"""
33+
Generator for creating data in indexed binary format.
34+
"""
35+
class IndexedBinaryGenerator(DataGenerator):
    """
    Generator for creating data in indexed binary format.

    Each generated data file is accompanied by two index files: an offset
    index (``<data file>.off.idx``) and a size index (``<data file>.sz.idx``).
    Both hold one unsigned 64-bit integer per sample in native byte order.
    """
    def __init__(self):
        super().__init__()

    def index_file_path_off(self, prefix_path):
        """Return the offset-index path that belongs to the data file ``prefix_path``."""
        return prefix_path + '.off.idx'

    def index_file_path_size(self, prefix_path):
        """Return the size-index path that belongs to the data file ``prefix_path``."""
        return prefix_path + '.sz.idx'

    @dlp.log
    def generate(self):
        """
        Generate the indexed binary data files assigned to this rank.

        Files are distributed round-robin over ranks (``my_rank``, stepping by
        ``comm_size``). Each file is written in chunks of at most
        ``generation_buffer_size`` bytes, rounded down to whole samples, so the
        complete file never has to be held in memory.
        """
        super().generate()
        np.random.seed(10)
        for i in dlp.iter(range(self.my_rank, int(self.total_files_to_generate), self.comm_size)):
            dim1, dim2 = self.get_dimension()
            # The product of the two dimensions is used as the sample size in bytes.
            sample_size = dim1 * dim2
            total_size = sample_size * self.num_samples
            write_size = total_size
            memory_size = self._args.generation_buffer_size
            if total_size > memory_size:
                # Keep every chunk a whole number of samples, and never less
                # than one sample: a zero-byte chunk would make the write loop
                # below spin forever when a sample exceeds the buffer size.
                write_size = max(sample_size, memory_size - (memory_size % sample_size))
            out_path_spec = self.storage.get_uri(self._file_list[i])
            out_path_spec_off_idx = self.index_file_path_off(out_path_spec)
            out_path_spec_sz_idx = self.index_file_path_size(out_path_spec)
            progress(i + 1, self.total_files_to_generate, "Generating Indexed Binary Data")
            # One random buffer is generated per file and reused for every chunk.
            records = np.random.randint(255, size=write_size, dtype=np.uint8)
            written_bytes = 0
            # Context managers close all three files even if a write fails.
            with open(out_path_spec, "wb") as data_file, \
                    open(out_path_spec_off_idx, "wb") as off_file, \
                    open(out_path_spec_sz_idx, "wb") as sz_file:
                while written_bytes < total_size:
                    data_to_write = min(write_size, total_size - written_bytes)
                    samples_to_write = data_to_write // sample_size

                    # Write data: dump the uint8 buffer directly instead of
                    # unpacking every byte into a Python int for struct.pack.
                    data_file.write(records[:data_to_write].tobytes())

                    # Write offsets: positions are relative to the start of the
                    # data file, so they continue from the bytes already
                    # written rather than restarting at 0 for each chunk.
                    offsets = np.arange(written_bytes, written_bytes + data_to_write,
                                        sample_size, dtype=np.uint64)[:samples_to_write]
                    off_file.write(offsets.tobytes())

                    # Write sizes: every sample has the same length.
                    sample_sizes = np.full(samples_to_write, sample_size, dtype=np.uint64)
                    sz_file.write(sample_sizes.tobytes())

                    written_bytes = written_bytes + data_to_write
        # Re-seed from fresh entropy so later users of np.random are not pinned to seed 10.
        np.random.seed()

dlio_benchmark/framework/framework.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
"""
1717

1818
from abc import ABC, abstractmethod
19+
20+
from dlio_benchmark.common.enumerations import DatasetType
21+
from dlio_benchmark.data_loader.data_loader_factory import DataLoaderFactory
22+
from dlio_benchmark.storage.storage_factory import StorageFactory
1923
from dlio_benchmark.utils.utility import utcnow
2024

2125
from time import sleep
@@ -40,11 +44,18 @@ def __init__(self):
4044
self.args = ConfigArguments.get_instance()
4145
self.output_folder = self.args.output_folder
4246
self.checkpoint_folder = self.args.checkpoint_folder
43-
pass
47+
4448

4549
@abstractmethod
46-
def init_loader(self, format_type, epoch_number, data_loader=None):
47-
pass
50+
def init_loader(self, format_type, epoch, data_loader=None):
    """
    Create the readers and storage handles used by the framework.

    Sets ``self.reader_train`` and ``self.reader_valid`` from
    ``DataLoaderFactory.get_loader`` (TRAIN and VALID dataset types), sets
    ``self.storage`` for ``args.storage_root``, and creates the namespace for
    ``self.checkpoint_folder`` if it does not exist yet.

    :param format_type: data format forwarded to the loader factory
    :param epoch: epoch value forwarded to the loader factory
    :param data_loader: loader selection forwarded to the loader factory
    """
    cfg = self.args

    # Readers for the training and the validation split.
    self.reader_train = DataLoaderFactory.get_loader(
        data_loader, format_type, dataset_type=DatasetType.TRAIN, epoch=epoch)
    self.reader_valid = DataLoaderFactory.get_loader(
        data_loader, format_type, dataset_type=DatasetType.VALID, epoch=epoch)

    # Storage rooted at the configured storage root.
    self.storage = StorageFactory().get_storage(
        cfg.storage_type, cfg.storage_root, cfg.framework)

    # Separate storage handle for the checkpoint folder; only its namespace
    # is created here, the handle itself is not kept.
    StorageFactory().get_storage(
        cfg.storage_type, self.checkpoint_folder, cfg.framework
    ).create_namespace(exist_ok=True)
4859

4960
@abstractmethod
5061
def get_type(self):

0 commit comments

Comments
 (0)