@@ -74,15 +74,30 @@ def construct_samples_and_shuffle_data(name, data_prefix, documents, sizes,
                 separate_last_epoch = (
                     last_epoch_num_samples < int(0.80 * num_samples_per_epoch))
             # Note. len(doc_idx) = num_epochs * len(doc)
+            start_time = time.time()
             doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
                                      separate_last_epoch)
             np.save(doc_idx_filename, doc_idx, allow_pickle=True)
-
+            print(' > elapsed time to build and save doc-idx mapping '
+                  '(seconds): {:.4f}'.format(time.time() - start_time))
             # sample-idx. pos of each seq_len of data.
+            start_time = time.time()
             assert doc_idx.dtype == np.int32
-            sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
-                                           num_epochs, tokens_per_epoch)
+            assert sizes.dtype == np.int32
+
+            import data_tools.helpers as helpers
+
+            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
+                                                  num_epochs, tokens_per_epoch)
+            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
+            #                                num_epochs, tokens_per_epoch)
+
             np.save(sample_idx_filename, sample_idx, allow_pickle=True)
+            print(' > elapsed time to build and save sample-idx mapping '
+                  '(seconds): {:.4f}'.format(time.time() - start_time))
+
+            # shuffle-idx.
+            start_time = time.time()
 
             if separate_last_epoch:
                 num_samples_ = num_samples_from_epochs_minus_one
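For context on what the compiled helpers.build_sample_idx computes: the sample-idx mapping records, for every training sample, the position in doc_idx and the token offset at which that sample's seq_length + 1 tokens begin. Below is a minimal pure-Python sketch of the Megatron-style algorithm; the commented-out _build_sample_idx fallback presumably computes the same mapping, but the exact implementation in this repo may differ.

import numpy as np


def build_sample_idx_py(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch):
    """Sketch: map each sample to (position in doc_idx, token offset)."""
    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
    sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32)

    sample_index = 0
    doc_idx_index = 0  # current position in the (shuffled) doc_idx array
    doc_offset = 0     # token offset inside the current document
    sample_idx[sample_index] = (doc_idx_index, doc_offset)
    sample_index += 1

    while sample_index <= num_samples:
        remaining_seq_length = seq_length + 1
        while remaining_seq_length != 0:
            doc_id = doc_idx[doc_idx_index]
            doc_length = sizes[doc_id] - doc_offset
            remaining_seq_length -= doc_length
            if remaining_seq_length <= 0:
                # The sample ends inside this document; its last token is
                # reused as the first token of the next sample (hence -1).
                doc_offset += remaining_seq_length + doc_length - 1
                remaining_seq_length = 0
            else:
                # Consume the rest of this document and move to the next one.
                doc_idx_index += 1
                doc_offset = 0
        sample_idx[sample_index] = (doc_idx_index, doc_offset)
        sample_index += 1
    return sample_idx

Presumably the compiled helper exists because this doubly nested loop is slow in pure Python on large corpora, which also explains the timing prints added around each mapping.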
@@ -93,14 +108,25 @@ def construct_samples_and_shuffle_data(name, data_prefix, documents, sizes,
             shuffle_idx = _build_shuffle_idx(num_samples_,
                                              sample_idx.shape[0] - 1, np_rng)
             np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
+            print(' > elapsed time to build and save shuffle-idx mapping'
+                  ' (seconds): {:.4f}'.format(time.time() - start_time))
+
     else:
         while True:
             if (not os.path.isfile(doc_idx_filename)) or \
                (not os.path.isfile(sample_idx_filename)) or \
                (not os.path.isfile(shuffle_idx_filename)):
                 time.sleep(3)
             else:
-                break
+                try:
+                    np.load(
+                        shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
+                    break
+                except Exception as e:
+                    print(
+                        "%s is still being written or is damaged, "
+                        "please wait a moment." % shuffle_idx_filename)
+                    time.sleep(3)
 
     # Restore random state
     np_rng.set_state(savedState)
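The two arguments to _build_shuffle_idx matter when separate_last_epoch is set: the first num_samples_ indices (samples from the full epochs) are shuffled among themselves, and the remaining indices (the partial last epoch) are shuffled separately, so the truncated final epoch does not bleed into the earlier ones. The sketch below shows what such a helper typically looks like in Megatron-style pipelines; the actual implementation is not part of this diff and may differ.

import numpy as np


def build_shuffle_idx_sketch(num_samples, total_size, np_rng):
    """Shuffle [0, num_samples) and [num_samples, total_size) independently."""
    dtype_ = np.uint32
    if total_size >= (np.iinfo(np.uint32).max - 1):
        dtype_ = np.int64

    shuffle_idx_first = np.arange(start=0, stop=num_samples, step=1, dtype=dtype_)
    np_rng.shuffle(shuffle_idx_first)
    if num_samples == total_size:
        return shuffle_idx_first

    shuffle_idx_last = np.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_)
    np_rng.shuffle(shuffle_idx_last)
    return np.concatenate((shuffle_idx_first, shuffle_idx_last))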
@@ -241,19 +267,48 @@ def create_pretrained_dataset(args,
                               max_seq_len=1024,
                               places=None,
                               data_holders=None):
+    if local_rank == 0:
+        start_time = time.time()
+        print('> compiling dataset index builder ...')
+        from data_tools.dataset_utils import compile_helper
+        compile_helper()
+        print(
+            '>>> done with dataset index builder. Compilation time: {:.3f} '
+            'seconds'.format(time.time() - start_time),
+            flush=True)
+
     device_world_size = paddle.distributed.get_world_size()
     device_world_rank = paddle.distributed.get_rank()
 
     logger.info(
         "The distributed run, total device num:{}, distinct dataflow num:{}.".
         format(device_world_size, data_world_size))
 
-    process_datas = np.load(input_path, mmap_mode="r+", allow_pickle=True)
-    # All documment ids, extend as 1-D array.
-    sample_ids = process_datas["ids"]
-    # The len(sample_lens) num of docs
-    # The sum(sample_lens) should equal len(sample_ids)
-    sample_lens = process_datas["lens"]
+    assert len(input_path) == 1, "GPT only supports one dataset for now."
+
+    input_prefix = input_path[0]
+
+    if os.path.isfile(input_prefix + "_ids.npz"):
+        logger.warning(
+            "You are using the old compatibility dataset format, please rebuild the dataset as described in the README!"
+        )
+        process_datas = np.load(
+            input_prefix + "_ids.npz", mmap_mode="r+", allow_pickle=True)
+        sample_ids = process_datas["ids"]
+        sample_lens = process_datas["lens"].astype("int32")
+    else:
+        for suffix in ["_ids.npy", "_idx.npz"]:
+            if not os.path.isfile(input_prefix + suffix):
+                raise ValueError("File not found: %s" % (input_prefix + suffix))
+
+        sample_ids = np.load(
+            input_prefix + "_ids.npy", mmap_mode="r", allow_pickle=True)
+        # All document ids, extended as a 1-D array.
+
+        process_datas = np.load(input_prefix + "_idx.npz")
+        # len(sample_lens) is the number of docs;
+        # sum(sample_lens) should equal len(sample_ids).
+        sample_lens = process_datas["lens"]
 
     splits = get_train_valid_test_split_(args.split, len(sample_lens))
     assert len(sample_lens) >= splits[
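The rank-0 block added at the top of create_pretrained_dataset compiles the native index builder before the build path imports data_tools.helpers. compile_helper itself is not part of this diff; the sketch below is only a hypothetical illustration of the "one process builds the extension, the others poll for the artifact" pattern, with invented function, directory, and artifact names.

import os
import subprocess
import time


def compile_native_helper_once(local_rank, src_dir, artifact="helpers.so",
                               timeout_s=300):
    """Hypothetical sketch: rank 0 runs the build, other ranks wait for it."""
    built = os.path.join(src_dir, artifact)
    if local_rank == 0:
        # e.g. a Makefile or setup.py step that produces the compiled extension.
        subprocess.check_call(["make"], cwd=src_dir)
    else:
        waited = 0
        while not os.path.isfile(built):
            time.sleep(3)
            waited += 3
            if waited > timeout_s:
                raise RuntimeError("timed out waiting for %s" % built)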
@@ -262,7 +317,7 @@ def create_pretrained_dataset(args,
 
     def build_dataset(index, name, num_samples):
         dataset = GPTDataset(
-            file_path=input_path,
+            file_path=input_prefix,
             build_data_file=local_rank == 0,
             name="gpt_" + name,
             max_seq_len=max_seq_len,
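To summarize the two on-disk layouts the loader above accepts: the legacy single <prefix>_ids.npz archive holding both ids and lens, and the newer pair <prefix>_ids.npy (the flat 1-D token-id array) plus <prefix>_idx.npz (per-document lengths). The standalone sketch below mirrors that detection logic and adds the sanity check implied by the comments; the example prefix in the usage line is hypothetical.

import os

import numpy as np


def load_gpt_pretrain_data(input_prefix):
    """Return (sample_ids, sample_lens) for either dataset layout."""
    if os.path.isfile(input_prefix + "_ids.npz"):
        # Legacy layout: one archive holding both arrays.
        data = np.load(input_prefix + "_ids.npz", allow_pickle=True)
        sample_ids = data["ids"]
        sample_lens = data["lens"].astype("int32")
    else:
        # New layout: flat token ids plus a separate index archive.
        sample_ids = np.load(input_prefix + "_ids.npy",
                             mmap_mode="r", allow_pickle=True)
        sample_lens = np.load(input_prefix + "_idx.npz")["lens"]

    # Every token id should belong to exactly one document.
    assert int(np.sum(sample_lens)) == len(sample_ids), \
        "sum(lens) must equal the total number of token ids"
    return sample_ids, sample_lens


# Usage with a hypothetical prefix:
# sample_ids, sample_lens = load_gpt_pretrain_data("./data/my_corpus")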