# dlio_benchmark/src/data_generator/bin_generator.py at b894e99770b08e51f6ab180527320e640e3dfdce · argonne-lcf/dlio_benchmark · GitHub

"""
The binary file generator designed for simulating DLRM in DLIO
"""

from src.common.enumerations import Compression
from src.data_generator.data_generator import DataGenerator

import logging
import numpy as np
from numpy import random
import math
import os

from src.utils.utility import progress
from shutil import copyfile

"""
Generator for creating data in BIN format.
"""

class BINGenerator(DataGenerator):
    """
    Generator that writes random samples to raw binary files.

    Every sample is one row of 40 ``int32`` values laid out as
    ``[y, X_int (13 columns), X_cat (26 columns)]``; rows are written
    back to back with no header.
    """

    # Upper limit on the number of rows built in memory and written at once.
    _SEGMENT_SIZE = 91681240 * 5
    # Exclusive upper bounds passed to ``np.random.randint`` and the number of
    # columns per group (presumably DLRM's dense / categorical features --
    # the values are taken over unchanged from the original implementation).
    _INT_FEATURE_BOUND = 2557264
    _NUM_INT_FEATURES = 13
    _CAT_FEATURE_BOUND = 8831335
    _NUM_CAT_FEATURES = 26

    def __init__(self):
        super().__init__()

    def generate(self):
        """
        Generate binary data for training and testing.

        Files are distributed round-robin over the ranks: this rank handles
        indices ``my_rank, my_rank + comm_size, ...``.  Indices below
        ``num_files_train`` receive ``num_samples`` rows, all others receive
        ``eval_num_samples_per_file`` rows.

        Nothing is written when ``compression`` is ``Compression.ZIP``; in
        that case the (expensive) random data is not generated either.
        """
        super().generate()

        write_enabled = self.compression != Compression.ZIP
        if not write_enabled and self.my_rank == 0:
            # Make the no-op visible instead of silently producing no files.
            logging.warning("BINGenerator does not write files when compression is ZIP")

        for i in range(self.my_rank, int(self.total_files_to_generate), self.comm_size):
            progress(i+1, self.total_files_to_generate, "Generating Binary Data")
            out_path_spec = self.storage.get_uri(self._file_list[i])
            if not write_enabled:
                continue
            # File size will be different depending on training or validation file
            if i < self.num_files_train:
                num_instance = self.num_samples  # 4195198976 for dlrm training
            else:
                num_instance = self.eval_num_samples_per_file
            self._write_samples(out_path_spec, num_instance)

    def _write_samples(self, out_path, num_instance):
        """
        Write ``num_instance`` random rows to ``out_path`` in bounded segments.

        The file is opened once in ``'wb'`` mode so that a file left over from
        an earlier run is truncated rather than appended to.  As a consequence
        an empty file is created when ``num_instance`` is 0.
        """
        segment_size = self._SEGMENT_SIZE
        with open(out_path, 'wb') as output_file:
            # Integer stepping covers every row exactly once; the last segment
            # holds the remainder when num_instance is not a multiple of
            # segment_size.
            for start in range(0, num_instance, segment_size):
                num_written = min(segment_size, num_instance - start)
                np_data = self._random_segment(num_written)
                output_file.write(np_data.tobytes())
                # Push each segment to storage before building the next one.
                output_file.flush()
                os.fsync(output_file.fileno())

    def _random_segment(self, num_rows):
        """Return a ``(num_rows, 40)`` int32 array of random sample rows."""
        # The draw order (X_int, X_cat, y) matches the original implementation
        # so that a seeded global NumPy RNG yields the same values.
        X_int = np.random.randint(self._INT_FEATURE_BOUND,
                                  size=(num_rows, self._NUM_INT_FEATURES))
        X_cat = np.random.randint(self._CAT_FEATURE_BOUND,
                                  size=(num_rows, self._NUM_CAT_FEATURES))
        y = np.random.randint(2, size=num_rows)
        np_data = np.concatenate([y.reshape(-1, 1), X_int, X_cat], axis=1)
        return np_data.astype(np.int32)