@@ -23,7 +23,7 @@ def test_combined_datasets():
 
 @pytest.mark.unit
 def test_separate_datasets():
-    splits = {"train": "train[:10%]", "validation": "train[10%:20%]"}
+    splits = {"train": "train[:5%]", "validation": "train[5%:7%]"}
     data_args = DatasetArguments(
         dataset="wikitext", dataset_config_name="wikitext-2-raw-v1"
     )
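The new percentages lean on the Hugging Face `datasets` split-slicing syntax. A minimal sketch of how the two slices relate, reusing the dataset names from the test above:

```python
from datasets import load_dataset

# "train[:5%]" is the first 5% of the train split; "train[5%:7%]" is the
# next 2%. The two slices are contiguous and share no examples.
train = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")
val = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[5%:7%]")
print(len(train), len(val))
```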
@@ -67,7 +67,7 @@ def test_no_padding_tokenization(self):
         op_manager = TextGenerationDataset.load_from_registry(
             self.data_args.dataset,
             data_args=self.data_args,
-            split="train[5%:10%]",
+            split="train[5%:7%]",
             processor=self.tiny_llama_tokenizer,
         )
         dataset = op_manager.load_dataset()  # load
@@ -82,7 +82,7 @@ def test_no_padding_tokenization(self):
         ex_item = dataset[0]["text"]
         self.assertIn("Below is an instruction that describes a task", ex_item)
 
-        self.assertEqual(dataset.split, "train[5%:10%]")
+        self.assertEqual(dataset.split, "train[5%:7%]")
         tokenized_dataset = op_manager()
         self.assertIn("input_ids", tokenized_dataset.features)
         self.assertIn("labels", tokenized_dataset.features)
@@ -107,7 +107,7 @@ def test_max_seq_len_clipped(self):
         op_manager = TextGenerationDataset.load_from_registry(
             self.data_args.dataset,
             data_args=self.data_args,
-            split="train[80%:]",
+            split="train[95%:]",
             processor=self.tiny_llama_tokenizer,
         )
 
@@ -136,15 +136,15 @@ def test_dataset_kwargs_and_percentages(self):
         c4_manager_a = TextGenerationDataset.load_from_registry(
             self.data_args.dataset,
             data_args=self.data_args,
-            split="train[5%:10%]",
+            split="train[5%:6%]",
             processor=self.tiny_llama_tokenizer,
         )
         raw_dataset_a = c4_manager_a.load_dataset()
 
         c4_manager_b = TextGenerationDataset.load_from_registry(
             self.data_args.dataset,
             data_args=self.data_args,
-            split="train[5%:15%]",
+            split="train[6%:8%]",
             processor=self.tiny_llama_tokenizer,
         )
         raw_dataset_b = c4_manager_b.load_dataset()
@@ -162,7 +162,7 @@ def prepare_fixture(self, tiny_llama_tokenizer):
         [
             ["ptb", "penn_treebank", "train[:5%]", False],
             ["gsm8k", "main", "train[:5%]", True],
-            ["ultrachat_200k", "default", "train_sft[:2%]", False],
+            ["ultrachat_200k", "default", "train_sft[:1%]", False],
         ]
     )
     def test_datasets(self, dataset_key, dataset_config, split, do_concat):
@@ -271,9 +271,7 @@ class TestSplitLoading(unittest.TestCase):
     def prepare_fixture(self, tiny_llama_tokenizer):
         self.tiny_llama_tokenizer = tiny_llama_tokenizer
 
-    @parameterized.expand(
-        [["train"], ["train[60%:]"], [{"train": "train[:20%]"}], [None]]
-    )
+    @parameterized.expand([["train[95%:]"], [{"train": "train[:5%]"}]])
     def test_split_loading(self, split_def):
         data_args = DatasetArguments(
             dataset="open_platypus",
@@ -302,7 +300,7 @@ class TestTokenizationDataset(unittest.TestCase):
     def prepare_fixture(self, tiny_llama_tokenizer):
         self.tiny_llama_tokenizer = tiny_llama_tokenizer
         dataset = load_dataset("garage-bAInd/Open-Platypus")["train"]
-        self.num_calib_samples = 256
+        self.num_calib_samples = 64
         self.max_seq_len = 512
         self.dataset = dataset.shuffle(seed=42).select(range(self.num_calib_samples))
 
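Beyond shrinking the slices, `test_dataset_kwargs_and_percentages` now uses disjoint ranges (`[5%:6%]` and `[6%:8%]`) where the old ones (`[5%:10%]` and `[5%:15%]`) overlapped. A small self-contained sketch of the property this buys, using wikitext rather than the test's c4 configuration for brevity:

```python
from datasets import load_dataset

# Disjoint percentage slices that share a boundary partition the combined
# range, so their sizes add up exactly.
a = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[5%:6%]")
b = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[6%:8%]")
combined = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[5%:8%]")
assert len(a) + len(b) == len(combined)
```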
@@ -4,5 +4,5 @@ model: "Xenova/llama2.c-stories15M"
 dataset: wikitext
 dataset_config_name: "wikitext-2-raw-v1"
 recipe: "tests/llmcompressor/transformers/finetune/test_alternate_recipe.yaml"
-num_train_epochs: 1
+num_train_epochs: 0.25
 concat_txt: False
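`num_train_epochs` can be fractional because the Hugging Face `Trainer` converts it into a step budget; a sketch of that arithmetic, with an assumed steps-per-epoch for illustration:

```python
import math

# Trainer derives max_steps = ceil(num_train_epochs * update_steps_per_epoch),
# so 0.25 epochs runs a quarter of the optimizer steps of a full epoch.
steps_per_epoch = 400  # assumed value, for illustration only
max_steps = math.ceil(0.25 * steps_per_epoch)
print(max_steps)  # 100
```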
@@ -19,9 +19,9 @@ class TestOneshotAndFinetune(unittest.TestCase):
     def _test_oneshot_and_finetune(self):
         from llmcompressor.transformers import apply
 
-        splits = {"train": "train[:30%]", "calibration": "train[30%:40%]"}
+        splits = {"train": "train[:5%]", "calibration": "train[5%:10%]"}
         if self.dataset == "ultrachat-200k":
-            splits = {"train": "train_gen[:30%]", "calibration": "train_gen[30%:40%]"}
+            splits = {"train": "train_gen[:5%]", "calibration": "train_gen[5%:10%]"}
 
         apply(
             model=self.model,
@@ -30,6 +30,7 @@ def _test_oneshot_and_finetune(self):
             output_dir=self.output,
             recipe=self.recipe,
             num_train_epochs=self.num_train_epochs,
+            num_calibration_samples=64,
             concatenate_data=self.concat_txt,
             splits=splits,
             oneshot_device=self.device,
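Passing `num_calibration_samples=64` bounds the oneshot calibration pass. A hedged sketch of what such a cap typically does; the selection below is illustrative, not llmcompressor's internal code:

```python
from datasets import load_dataset

# Truncate the calibration split to at most 64 examples so calibration cost
# no longer scales with the size of the split.
num_calibration_samples = 64
calib = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[5%:10%]")
calib = calib.shuffle(seed=42).select(
    range(min(num_calibration_samples, len(calib)))
)
print(len(calib))  # <= 64
```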
@@ -27,7 +27,7 @@ def test_oneshot_sparsification_then_finetune(self):
         concatenate_data = False
         num_calibration_samples = 64
         output_dir = self.output / "oneshot_out"
-        splits = {"calibration": "train[:10%]"}
+        splits = {"calibration": "train[:5%]"}
 
         with create_session():
             oneshot(
@@ -56,20 +56,18 @@ def test_oneshot_sparsification_then_finetune(self):
         dataset = "open_platypus"
         concatenate_data = False
         output_dir = self.output / "finetune_out"
-        splits = "train[:50%]"
-        max_steps = 25
+        splits = "train[5%:7%]"
 
         with create_session():
             train(
                 model=model,
                 distill_teacher=distill_teacher,
                 dataset=dataset,
                 output_dir=output_dir,
-                num_calibration_samples=num_calibration_samples,
+                num_train_epochs=0.05,
                 recipe=recipe_str,
                 concatenate_data=concatenate_data,
                 splits=splits,
-                max_steps=max_steps,
             )
 
         # test reloading checkpoint and final model
@@ -85,11 +83,10 @@ def test_oneshot_sparsification_then_finetune(self):
                 distill_teacher=distill_teacher,
                 dataset=dataset,
                 output_dir=output_dir,
-                num_calibration_samples=num_calibration_samples,
+                num_train_epochs=0.05,
                 recipe=recipe_str,
                 concatenate_data=concatenate_data,
                 splits=splits,
-                max_steps=max_steps,
                 resume_from_checkpoint=True,  # use last checkpoint
             )
 
@@ -106,7 +103,7 @@ def test_oneshot_quantization_then_finetune(self):
         concatenate_data = False
         num_calibration_samples = 64
         output_dir = self.output / "oneshot_out"
-        splits = {"calibration": "train[:10%]"}
+        splits = {"calibration": "train[:5%]"}
 
         with create_session():
             oneshot(
@@ -130,17 +127,17 @@ def test_oneshot_quantization_then_finetune(self):
         dataset = "open_platypus"
         concatenate_data = False
         output_dir = self.output / "finetune_out"
-        splits = {"calibration": "train[:10%]", "train": "train[:10%]"}
+        splits = {"calibration": "train[:5%]", "train": "train[5%:7%]"}
 
         with create_session():
             train(
                 model=model,
                 dataset=dataset,
                 output_dir=output_dir,
-                num_calibration_samples=num_calibration_samples,
                 recipe=recipe,
                 concatenate_data=concatenate_data,
                 splits=splits,
+                num_train_epochs=0.05,
             )
 
         # test reloading checkpoint and final model
@@ -152,10 +149,10 @@ def test_oneshot_quantization_then_finetune(self):
                 model=model,
                 dataset=dataset,
                 output_dir=output_dir,
-                num_calibration_samples=num_calibration_samples,
                 recipe=recipe,
                 concatenate_data=concatenate_data,
                 splits=splits,
+                num_train_epochs=0.05,
                 resume_from_checkpoint=True,  # use last checkpoint
             )
 
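`resume_from_checkpoint=True` still exercises checkpoint reloading even at 0.05 epochs. Assuming `train()` delegates to the Hugging Face `Trainer` here, the checkpoint lookup works roughly like this sketch:

```python
from transformers.trainer_utils import get_last_checkpoint

# get_last_checkpoint returns the highest-numbered "checkpoint-*" directory
# under output_dir, or None if no checkpoint was written; Trainer then
# restores model/optimizer/scheduler state from it. The path mirrors the
# "finetune_out" directory used in the test above.
last_checkpoint = get_last_checkpoint("finetune_out")
print(last_checkpoint)  # e.g. "finetune_out/checkpoint-12", or None
```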