     MODEL_NAME,
     TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON,
     TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
+    TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
     TWITTER_COMPLAINTS_DATA_JSON,
     TWITTER_COMPLAINTS_DATA_JSONL,
+    TWITTER_COMPLAINTS_DATA_PARQUET,
     TWITTER_COMPLAINTS_TOKENIZED_JSON,
     TWITTER_COMPLAINTS_TOKENIZED_JSONL,
+    TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
 )
 
 # Local
@@ -59,6 +62,10 @@
             TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
             set(["ID", "Label", "input", "output"]),
         ),
+        (
+            TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+            set(["ID", "Label", "input", "output"]),
+        ),
         (
             TWITTER_COMPLAINTS_TOKENIZED_JSONL,
             set(
@@ -73,10 +80,28 @@
                 ]
             ),
         ),
+        (
+            TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
+            set(
+                [
+                    "Tweet text",
+                    "ID",
+                    "Label",
+                    "text_label",
+                    "output",
+                    "input_ids",
+                    "labels",
+                ]
+            ),
+        ),
         (
             TWITTER_COMPLAINTS_DATA_JSONL,
             set(["Tweet text", "ID", "Label", "text_label", "output"]),
         ),
+        (
+            TWITTER_COMPLAINTS_DATA_PARQUET,
+            set(["Tweet text", "ID", "Label", "text_label", "output"]),
+        ),
     ],
 )
 def test_load_dataset_with_datafile(datafile, column_names):
@@ -98,6 +123,11 @@ def test_load_dataset_with_datafile(datafile, column_names):
             set(["ID", "Label", "input", "output"]),
             "text_dataset_input_output_masking",
         ),
+        (
+            TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+            set(["ID", "Label", "input", "output"]),
+            "text_dataset_input_output_masking",
+        ),
         (
             TWITTER_COMPLAINTS_TOKENIZED_JSONL,
             set(
@@ -113,11 +143,31 @@ def test_load_dataset_with_datafile(datafile, column_names):
             ),
             "pretokenized_dataset",
         ),
+        (
+            TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
+            set(
+                [
+                    "Tweet text",
+                    "ID",
+                    "Label",
+                    "text_label",
+                    "output",
+                    "input_ids",
+                    "labels",
+                ]
+            ),
+            "pretokenized_dataset",
+        ),
         (
             TWITTER_COMPLAINTS_DATA_JSONL,
             set(["Tweet text", "ID", "Label", "text_label", "output"]),
             "apply_custom_data_template",
         ),
+        (
+            TWITTER_COMPLAINTS_DATA_PARQUET,
+            set(["Tweet text", "ID", "Label", "text_label", "output"]),
+            "apply_custom_data_template",
+        ),
     ],
 )
 def test_load_dataset_with_datasetconfig(datafile, column_names, datasetconfigname):
@@ -139,8 +189,14 @@ def test_load_dataset_with_datasetconfig(datafile, column_names, datasetconfigna
             TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
             "text_dataset_input_output_masking",
         ),
+        (
+            TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+            "text_dataset_input_output_masking",
+        ),
         (TWITTER_COMPLAINTS_TOKENIZED_JSONL, "pretokenized_dataset"),
+        (TWITTER_COMPLAINTS_TOKENIZED_PARQUET, "pretokenized_dataset"),
         (TWITTER_COMPLAINTS_DATA_JSONL, "apply_custom_data_template"),
+        (TWITTER_COMPLAINTS_DATA_PARQUET, "apply_custom_data_template"),
     ],
 )
 def test_load_dataset_with_dataconfig_and_datafile(datafile, datasetconfigname):
@@ -339,8 +395,10 @@ def test_process_data_args_throws_error_where_needed(data_args, packing):
     [
         (APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_JSON),
         (APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_JSONL),
+        (APPLY_CUSTOM_TEMPLATE_YAML, TWITTER_COMPLAINTS_DATA_PARQUET),
         (PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_JSON),
         (PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_JSONL),
+        (PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_PARQUET),
         (
             TOKENIZE_AND_APPLY_INPUT_MASKING_YAML,
             TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON,
@@ -349,6 +407,10 @@ def test_process_data_args_throws_error_where_needed(data_args, packing):
             TOKENIZE_AND_APPLY_INPUT_MASKING_YAML,
             TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
         ),
+        (
+            TOKENIZE_AND_APPLY_INPUT_MASKING_YAML,
+            TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+        ),
     ],
 )
 def test_process_dataconfig_file(data_config_path, data_path):
@@ -414,6 +476,15 @@ def test_process_dataconfig_file(data_config_path, data_path):
                 response_template="\n### Label:",
             )
         ),
+        # single sequence PARQUET and response template
+        (
+            configs.DataArguments(
+                training_data_path=TWITTER_COMPLAINTS_DATA_PARQUET,
+                validation_data_path=TWITTER_COMPLAINTS_DATA_PARQUET,
+                dataset_text_field="output",
+                response_template="\n### Label:",
+            )
+        ),
         # data formatter template with input/output JSON
         (
             configs.DataArguments(
@@ -432,6 +503,15 @@ def test_process_dataconfig_file(data_config_path, data_path):
                 response_template="\n### Label:",
             )
         ),
+        # data formatter template with input/output PARQUET
+        (
+            configs.DataArguments(
+                training_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+                validation_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+                data_formatter_template="### Text:{{input}} \n\n### Label: {{output}}",
+                response_template="\n### Label:",
+            )
+        ),
         # input/output JSON with masking on input
         (
             configs.DataArguments(
@@ -446,6 +526,13 @@ def test_process_dataconfig_file(data_config_path, data_path):
                 validation_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
             )
         ),
+        # input/output PARQUET with masking on input
+        (
+            configs.DataArguments(
+                training_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+                validation_data_path=TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_PARQUET,
+            )
+        ),
     ],
 )
 def test_process_dataargs(data_args):
@@ -487,6 +574,13 @@ def test_process_dataargs(data_args):
                 validation_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
             )
         ),
+        # PARQUET pretokenized train and validation datasets
+        (
+            configs.DataArguments(
+                training_data_path=TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
+                validation_data_path=TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
+            )
+        ),
         # JSON pretokenized train datasets
         (
             configs.DataArguments(
@@ -499,6 +593,12 @@ def test_process_dataargs(data_args):
                 training_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
             )
         ),
+        # PARQUET pretokenized train datasets
+        (
+            configs.DataArguments(
+                training_data_path=TWITTER_COMPLAINTS_TOKENIZED_PARQUET,
+            )
+        ),
     ],
 )
 def test_process_dataargs_pretokenized(data_args):
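Note: the new TWITTER_COMPLAINTS_*_PARQUET fixtures referenced in this diff are Parquet copies of the existing JSON/JSONL test data. Below is a minimal sketch of how such a fixture could be regenerated, assuming pandas with pyarrow installed; the file names are placeholders for illustration, not the repository's actual fixture paths.

# Sketch (assumption): regenerate a Parquet test fixture from a JSONL one.
# File names below are placeholders, not the repository's actual fixture paths.
import pandas as pd


def jsonl_to_parquet(jsonl_path: str, parquet_path: str) -> None:
    """Read a line-delimited JSON fixture and write it back out as Parquet."""
    df = pd.read_json(jsonl_path, lines=True)  # one JSON record per line
    df.to_parquet(parquet_path, index=False)   # requires pyarrow or fastparquet


if __name__ == "__main__":
    jsonl_to_parquet(
        "twitter_complaints_small.jsonl",    # hypothetical input fixture
        "twitter_complaints_small.parquet",  # hypothetical output fixture
    )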