32 | 32 | import argparse |
33 | 33 | import copy |
34 | 34 | import os |
35 | | -import pickle |
36 | 35 | from collections.abc import Sequence |
37 | 36 | from dataclasses import dataclass, field |
38 | 37 |
@@ -232,27 +231,17 @@ def __init__( |
232 | 231 | ): |
233 | 232 | super().__init__() |
234 | 233 |
235 | | - pickle_name = f"dict_{split}_{tokenizer.model_max_length}.pickle" |
236 | 234 | with training_args.main_process_first(): |
237 | | - if os.path.isfile(pickle_name): |
238 | | - with open(pickle_name, "rb") as f: |
239 | | - print_rank_0("Reuse pickled data") |
240 | | - data_dict = pickle.load(f) |
241 | | - else: |
242 | | - print_rank_0("Loading data...") |
243 | | - list_data_dict = utils.jload(data_path) |
244 | | - |
245 | | - print_rank_0("Formatting inputs...") |
246 | | - prompt_input = PROMPT_DICT["prompt_input"] |
247 | | - sources = [prompt_input.format_map(example) for example in list_data_dict] |
248 | | - targets = [ |
249 | | - f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict |
250 | | - ] |
251 | | - |
252 | | - print_rank_0("Tokenizing inputs... This may take some time...") |
253 | | - data_dict = preprocess(sources, targets, tokenizer) |
254 | | - with open(pickle_name, "wb") as f: |
255 | | - pickle.dump(data_dict, f, pickle.HIGHEST_PROTOCOL) |
| 235 | + print_rank_0("Loading data...") |
| 236 | + list_data_dict = utils.jload(data_path) |
| 237 | + |
| 238 | + print_rank_0("Formatting inputs...") |
| 239 | + prompt_input = PROMPT_DICT["prompt_input"] |
| 240 | + sources = [prompt_input.format_map(example) for example in list_data_dict] |
| 241 | + targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict] |
| 242 | + |
| 243 | + print_rank_0("Tokenizing inputs... This may take some time...") |
| 244 | + data_dict = preprocess(sources, targets, tokenizer) |
256 | 245 |
257 | 246 | self.input_ids = data_dict["input_ids"] |
258 | 247 | self.labels = data_dict["labels"] |
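
For context on the control flow the new version relies on, here is a minimal, self-contained sketch (not this repo's code) of how main_process_first() from transformers.TrainingArguments sequences the work across ranks. expensive_preprocessing is a hypothetical stand-in for the jload / prompt-formatting / preprocess() steps in the diff above.

from transformers import TrainingArguments

# Hypothetical minimal arguments; output_dir is the only required field.
training_args = TrainingArguments(output_dir="out")

def expensive_preprocessing() -> list[int]:
    # Hypothetical stand-in for utils.jload + prompt formatting + preprocess().
    return list(range(10))

with training_args.main_process_first(desc="dataset preprocessing"):
    # Under torch.distributed, the main process runs this block first while
    # the replicas wait at a barrier; once it exits, the replicas execute
    # the same block themselves. With the pickle cache removed, each rank
    # therefore re-tokenizes the data instead of unpickling rank 0's result.
    data = expensive_preprocessing()

The trade-off is simplicity over speed: the old pickle path skipped re-tokenization on later runs, but its cache key encoded only the split and tokenizer.model_max_length, so it could silently serve stale data when data_path or the tokenizer itself changed.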