Skip to content

Commit 20185d1

Browse files
feat: add error handling for split dataset feat (#581)
Signed-off-by: yashasvi <[email protected]>
1 parent f0b55bf commit 20185d1

File tree

2 files changed

+30
-0
lines changed

2 files changed

+30
-0
lines changed

tests/data/test_data_preprocessing.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -774,6 +774,10 @@ def test_process_dataconfig_file_with_streaming(data_config_path, data_path):
774774
assert set(["input_ids", "labels"]).issubset(set(train_set.column_names))
775775
elif datasets_name == "apply_custom_data_template":
776776
assert formatted_dataset_field in set(train_set.column_names)
777+
with pytest.raises(ValueError):
778+
_ = process_dataconfig_file(
779+
data_args, TRAIN_ARGS, tokenizer, is_padding_free=True
780+
)
777781

778782

779783
def test_concatenate_dict_with_multi_keys():
@@ -1454,6 +1458,11 @@ def test_process_dataconfig_multiple_datasets_datafiles_sampling(
14541458
assert set(["input_ids", "attention_mask", "labels"]).issubset(
14551459
set(eval_set.column_names)
14561460
)
1461+
TRAIN_ARGS.eval_strategy = "epoch"
1462+
with pytest.raises(ValueError):
1463+
train_set, eval_set, _, _, _, _ = process_dataargs(
1464+
data_args=data_args, tokenizer=tokenizer, train_args=TRAIN_ARGS
1465+
)
14571466

14581467

14591468
@pytest.mark.parametrize(

tuning/data/setup_dataprocessor.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,7 @@ def process_dataconfig_file(
7070
additional_data_handlers: Dict[str, DataHandler] = None,
7171
processor: AutoProcessor = None,
7272
is_multipack: bool = False,
73+
is_padding_free: bool = False,
7374
):
7475
"""
7576
Args:
@@ -114,6 +115,14 @@ def process_dataconfig_file(
114115
)
115116

116117
if data_processor.processor_config.streaming:
118+
if is_padding_free:
119+
logging.error(
120+
"`padding_free` is not supported when streaming is enabled.",
121+
)
122+
raise ValueError(
123+
"`--padding_free` is not allowed when `streaming=True`. "
124+
"Please remove the `padding_free` argument from your configuration."
125+
)
117126
if train_args.max_steps < 1:
118127
logging.error(
119128
"ValueError: `--max_steps` must be set when streaming is set in data "
@@ -549,6 +558,7 @@ def process_dataargs(
549558
additional_data_handlers,
550559
processor,
551560
is_multipack,
561+
is_padding_free,
552562
)
553563
else:
554564
train_dataset, eval_dataset, dataset_text_field = _process_raw_data_args(
@@ -561,6 +571,17 @@ def process_dataargs(
561571
processor,
562572
)
563573

574+
if train_args.eval_strategy != "no" and eval_dataset is None:
575+
raise ValueError(
576+
f"`eval_strategy` is set to '{train_args.eval_strategy}' but no evaluation "
577+
f"dataset was provided. Please ensure that an evaluation dataset is specified "
578+
f"or set `eval_strategy='no'` to disable evaluation."
579+
)
580+
if train_dataset is None:
581+
raise ValueError(
582+
"Training dataset could not be created! Training Dataset is None."
583+
"Check your data config or ensure split sizes are valid."
584+
)
564585
if data_args.do_dataprocessing_only:
565586
dump_dir = Path(train_args.output_dir)
566587
if not dump_dir.is_absolute():

0 commit comments

Comments
 (0)