
Commit d6501d3

remove old dataset. (#2561)
1 parent 0cd6eb3 commit d6501d3

15 files changed, +24 -1680 lines changed

examples/run_finetune.py

Lines changed: 6 additions & 6 deletions
@@ -148,7 +148,7 @@ def main():
     model_config.fuse_attention_ffn = model_args.fuse_attention_ffn
     model_config.pp_seg_method = training_args.pp_seg_method
     model_config.seq_length = data_args.max_length
-    model_config.max_sequence_length = training_args.max_seq_length
+    model_config.max_sequence_length = training_args.max_seq_len
     model_config.num_nextn_predict_layers = model_args.num_nextn_predict_layers
     logger.info(f"Final model config: {model_config}")
     logger.info("Creating model")
@@ -213,11 +213,11 @@ def neft_post_hook(module, input, output):

     dataset_config = {
         "tokenizer": tokenizer,
-        "max_seq_len": training_args.max_seq_length,
+        "max_seq_len": training_args.max_seq_len,
         "random_seed": training_args.seed,
-        "num_replicas": 1,
-        "rank": 0,
-        "num_samples_each_epoch": 6000000,
+        "num_replicas": training_args.dataset_world_size,
+        "rank": training_args.dataset_rank,
+        "num_samples_each_epoch": data_args.num_samples_each_epoch,
         "random_shuffle": data_args.random_shuffle,
         "greedy_intokens": data_args.greedy_intokens,
         "packing": data_args.packing,
@@ -251,7 +251,7 @@ def neft_post_hook(module, input, output):
         collate_fn,
         tokenizer=tokenizer,
         model_args=model_args,
-        max_seq_len=training_args.max_seq_length + model_config.num_nextn_predict_layers,
+        max_seq_len=training_args.max_seq_len + model_config.num_nextn_predict_layers,
     )
     trainer = SFTTrainer(
         model=model,
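The hard-coded num_replicas=1, rank=0 pair meant every data-parallel worker read the full dataset; wiring them to training_args.dataset_world_size and training_args.dataset_rank lets each worker take its own shard. A minimal sketch of the sharding arithmetic, using a hypothetical shard_indices helper rather than the actual PaddleFormers sampler:

    # Minimal sketch of rank-based data sharding; shard_indices is a
    # hypothetical helper, not the PaddleFormers implementation.
    def shard_indices(num_samples, num_replicas, rank):
        # Partition sample indices round-robin across data-parallel workers:
        # worker `rank` takes every num_replicas-th index starting at `rank`.
        return list(range(rank, num_samples, num_replicas))

    assert shard_indices(8, 1, 0) == [0, 1, 2, 3, 4, 5, 6, 7]  # old hard-coded config: one shard sees everything
    assert shard_indices(8, 4, 1) == [1, 5]                    # new config: each of 4 workers gets its own slice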

paddleformers/data/__init__.py

Lines changed: 0 additions & 19 deletions
@@ -20,22 +20,6 @@

 import_structure = {
     "sampler": ["SamplerHelper"],
-    "causal_dataset": [
-        "check_data_split",
-        "get_train_valid_test_split_",
-        "get_datasets_weights_and_num_samples",
-        "print_rank_0",
-        "build_train_valid_test_datasets",
-        "_build_train_valid_test_datasets",
-        "get_indexed_dataset_",
-        "GPTDataset",
-        "_build_index_mappings",
-        "_num_tokens",
-        "_num_epochs",
-        "_build_doc_idx",
-        "_build_sample_idx",
-        "_build_shuffle_idx",
-    ],
     "data_collator": [
         "DataCollatorForSeq2Seq",
         "default_data_collator",
@@ -54,7 +38,6 @@
         "DataCollatorForLanguageModeling",
     ],
     "dist_dataloader": ["DummyDataset", "IterableDummyDataset", "DistDataLoader", "init_dataloader_comm_group"],
-    "blendable_dataset": ["print_rank_0", "BlendableDataset"],
     "collate": ["Dict", "Pad", "Stack", "Tuple"],
     "vocab": ["Vocab"],
     "tokenizer": ["BaseTokenizer"],
@@ -91,8 +74,6 @@


 if TYPE_CHECKING:
-    from .blendable_dataset import *
-    from .causal_dataset import *
     from .collate import *
     from .data_collator import *
     from .dist_dataloader import *
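The import_structure dict pairs with the TYPE_CHECKING block: exported names are registered for lazy loading at runtime, while static type checkers see ordinary star imports, which is why the deleted entries must come out of both places. A minimal sketch of that pattern, assuming a PEP 562 module-level __getattr__ inside a package __init__.py; an illustration, not the actual PaddleFormers loader:

    # Illustrative lazy-import pattern (PEP 562), assumed to live in a
    # package __init__.py; simplified relative to the real machinery.
    import importlib
    from typing import TYPE_CHECKING

    import_structure = {
        "sampler": ["SamplerHelper"],
        "collate": ["Dict", "Pad", "Stack", "Tuple"],
    }

    # Reverse map: exported name -> submodule that defines it.
    _name_to_module = {name: mod for mod, names in import_structure.items() for name in names}

    def __getattr__(name):
        # Import the owning submodule only on first attribute access.
        if name in _name_to_module:
            module = importlib.import_module(f".{_name_to_module[name]}", __name__)
            return getattr(module, name)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    if TYPE_CHECKING:
        # Static type checkers see eager imports; runtime stays lazy.
        from .collate import *
        from .sampler import *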

paddleformers/data/blendable_dataset.py

Lines changed: 0 additions & 184 deletions
This file was deleted.
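For context on what was removed: a blendable dataset draws samples from several underlying datasets according to fixed mixing weights. The sketch below is a minimal illustration in that spirit; the class name SimpleBlendableDataset and its logic are assumptions, not the deleted code.

    # Illustrative only: a weighted blend over several datasets, in the
    # spirit of the removed BlendableDataset. Names and logic are assumed.
    import numpy as np

    class SimpleBlendableDataset:
        def __init__(self, datasets, weights, size, seed=0):
            weights = np.asarray(weights, dtype=np.float64)
            weights /= weights.sum()
            rng = np.random.default_rng(seed)
            self.datasets = datasets
            # For each global index, pick the source dataset by weight...
            self.dataset_index = rng.choice(len(datasets), size=size, p=weights)
            # ...and assign a within-dataset index, wrapping around when a
            # dataset is exhausted.
            counters = np.zeros(len(datasets), dtype=np.int64)
            self.sample_index = np.empty(size, dtype=np.int64)
            for i, d in enumerate(self.dataset_index):
                self.sample_index[i] = counters[d] % len(datasets[d])
                counters[d] += 1

        def __len__(self):
            return len(self.dataset_index)

        def __getitem__(self, idx):
            d = self.dataset_index[idx]
            return self.datasets[d][self.sample_index[idx]]

With weights like [0.9, 0.1], roughly 90% of the blended samples come from the first dataset, which is the behavior a weighted blend exists to provide.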
