Skip to content

Commit d52ff53

Browse files
committed
test: pretokenized arrow dataset
Signed-off-by: Will Johnson <mwjohnson728@gmail.com>
1 parent 8b2dca2 commit d52ff53

File tree

3 files changed

+24
-0
lines changed

3 files changed

+24
-0
lines changed

tests/artifacts/testdata/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@
3737
TWITTER_COMPLAINTS_TOKENIZED_JSONL = os.path.join(
3838
DATA_DIR, "twitter_complaints_tokenized_with_maykeye_tinyllama_v0.jsonl"
3939
)
40+
TWITTER_COMPLAINTS_TOKENIZED_ARROW = os.path.join(
41+
DATA_DIR, "twitter_complaints_tokenized_with_maykeye_tinyllama_v0.arrow"
42+
)
4043
EMPTY_DATA = os.path.join(DATA_DIR, "empty_data.json")
4144
MALFORMATTED_DATA = os.path.join(DATA_DIR, "malformatted_data.json")
4245
MODEL_NAME = "Maykeye/TinyLLama-v0"
Binary file not shown.

tests/data/test_data_preprocessing_utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
TWITTER_COMPLAINTS_DATA_JSONL,
4141
TWITTER_COMPLAINTS_TOKENIZED_JSON,
4242
TWITTER_COMPLAINTS_TOKENIZED_JSONL,
43+
TWITTER_COMPLAINTS_TOKENIZED_ARROW,
4344
)
4445

4546
# Local
@@ -79,6 +80,20 @@
7980
]
8081
),
8182
),
83+
(
84+
TWITTER_COMPLAINTS_TOKENIZED_ARROW,
85+
set(
86+
[
87+
"Tweet text",
88+
"ID",
89+
"Label",
90+
"text_label",
91+
"output",
92+
"input_ids",
93+
"labels",
94+
]
95+
),
96+
),
8297
(
8398
TWITTER_COMPLAINTS_DATA_JSONL,
8499
set(["Tweet text", "ID", "Label", "text_label", "output"]),
@@ -519,6 +534,12 @@ def test_process_dataargs(data_args):
519534
training_data_path=TWITTER_COMPLAINTS_TOKENIZED_JSONL,
520535
)
521536
),
537+
# Arrow pretokenized train datasets
538+
(
539+
configs.DataArguments(
540+
training_data_path=TWITTER_COMPLAINTS_TOKENIZED_ARROW,
541+
)
542+
),
522543
],
523544
)
524545
def test_process_dataargs_pretokenized(data_args):

0 commit comments

Comments
 (0)