
Commit 9b9ec34

unit tests
1 parent fa768f4 commit 9b9ec34

14 files changed, +800 -0 lines changed

test_utils/__init__.py

Whitespace-only changes.

test_utils/test_config.py

Lines changed: 73 additions & 0 deletions
```python
from llmtune.pydantic_models.config_model import (
    AblationConfig,
    BitsAndBytesConfig,
    Config,
    DataConfig,
    InferenceConfig,
    LoraConfig,
    ModelConfig,
    SftArgs,
    TrainingArgs,
    TrainingConfig,
)


def get_sample_config():
    """Function to return a comprehensive Config object for testing."""
    return Config(
        save_dir="./test",
        ablation=AblationConfig(
            use_ablate=False,
        ),
        model=ModelConfig(
            hf_model_ckpt="NousResearch/Llama-2-7b-hf",
            device_map="auto",
            torch_dtype="auto",
            quantize=False,
            bitsandbytes=BitsAndBytesConfig(
                load_in_8bit=False,
                load_in_4bit=False,
                bnb_4bit_compute_dtype="float32",
                bnb_4bit_quant_type="nf4",
                bnb_4bit_use_double_quant=True,
            ),
        ),
        lora=LoraConfig(
            r=8,
            task_type="CAUSAL_LM",
            lora_alpha=16,
            bias="none",
            lora_dropout=0.1,
            target_modules=None,
            fan_in_fan_out=False,
        ),
        training=TrainingConfig(
            training_args=TrainingArgs(
                num_train_epochs=1,
                per_device_train_batch_size=1,
                gradient_accumulation_steps=1,
                optim="adamw_8bit",
                learning_rate=2.0e-4,
                logging_steps=100,
            ),
            sft_args=SftArgs(max_seq_length=512, neftune_noise_alpha=None),
        ),
        inference=InferenceConfig(
            max_length=128,
            do_sample=False,
            num_beams=5,
            temperature=1.0,
            top_k=50,
            top_p=1.0,
            use_cache=True,
        ),
        data=DataConfig(
            file_type="json",
            path="path/to/dataset.json",
            prompt="Your prompt here {column_name}",
            prompt_stub="Stub for prompt {column_name}",
            train_size=0.9,
            test_size=0.1,
            train_test_split_seed=42,
        ),
    )
```
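
For reference, a minimal sketch of how other test modules would likely consume this helper; the fixture name and test are illustrative rather than part of this commit, but the asserted values match `get_sample_config()` above.

```python
# Hypothetical usage of get_sample_config(); not part of this commit.
import pytest

from test_utils.test_config import get_sample_config


@pytest.fixture
def sample_config():
    # Build a fresh Config per test so in-place mutations cannot leak.
    return get_sample_config()


def test_sample_config_spot_checks(sample_config):
    # Spot-check a few of the values wired up in get_sample_config().
    assert sample_config.save_dir == "./test"
    assert sample_config.model.hf_model_ckpt == "NousResearch/Llama-2-7b-hf"
    assert sample_config.data.train_size == 0.9
```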
Lines changed: 1 addition & 0 deletions
```python
# TODO
```

tests/data/test_ingestor.py

Lines changed: 136 additions & 0 deletions
```python
import pytest
from unittest.mock import MagicMock, mock_open

from llmtune.data.ingestor import (
    CsvIngestor,
    HuggingfaceIngestor,
    JsonIngestor,
    JsonlIngestor,
    get_ingestor,
)

from datasets import Dataset


def test_get_ingestor():
    assert isinstance(get_ingestor("json")(""), JsonIngestor)
    assert isinstance(get_ingestor("jsonl")(""), JsonlIngestor)
    assert isinstance(get_ingestor("csv")(""), CsvIngestor)
    assert isinstance(get_ingestor("huggingface")(""), HuggingfaceIngestor)

    with pytest.raises(ValueError):
        get_ingestor("unsupported_type")


def test_json_ingestor_to_dataset(mocker):
    mock_generator = mocker.patch("llmtune.data.ingestor.JsonIngestor._json_generator")
    mock_dataset = mocker.patch("llmtune.data.ingestor.Dataset")
    JsonIngestor("").to_dataset()

    mock_dataset.from_generator.assert_called_once_with(mock_generator)


def test_jsonl_ingestor_to_dataset(mocker):
    mock_generator = mocker.patch(
        "llmtune.data.ingestor.JsonlIngestor._jsonl_generator"
    )
    mock_dataset = mocker.patch("llmtune.data.ingestor.Dataset")
    JsonlIngestor("").to_dataset()

    mock_dataset.from_generator.assert_called_once_with(mock_generator)


def test_csv_ingestor_to_dataset(mocker):
    mock_generator = mocker.patch("llmtune.data.ingestor.CsvIngestor._csv_generator")
    mock_dataset = mocker.patch("llmtune.data.ingestor.Dataset")
    CsvIngestor("").to_dataset()

    mock_dataset.from_generator.assert_called_once_with(mock_generator)


def test_huggingface_to_dataset(mocker):
    # Setup
    path = "some_path"
    ingestor = HuggingfaceIngestor(path)
    mock_concatenate_datasets = mocker.patch(
        "llmtune.data.ingestor.concatenate_datasets"
    )
    mock_load_dataset = mocker.patch("llmtune.data.ingestor.load_dataset")
    # Patch Dataset in the module namespace; the handle itself is unused.
    mocker.patch("llmtune.data.ingestor.Dataset")

    # Configure the mock objects
    mock_dataset = MagicMock(spec=Dataset)
    mock_load_dataset.return_value = {"train": mock_dataset, "test": mock_dataset}
    mock_concatenate_datasets.return_value = mock_dataset

    # Execute
    result = ingestor.to_dataset()

    # Assert
    assert isinstance(result, Dataset)
    mock_load_dataset.assert_called_once_with(path)
    mock_concatenate_datasets.assert_called_once()


@pytest.mark.parametrize(
    "file_content,expected_output",
    [
        (
            '[{"column1": "value1", "column2": "value2"}, {"column1": "value3", "column2": "value4"}]',
            [
                {"column1": "value1", "column2": "value2"},
                {"column1": "value3", "column2": "value4"},
            ],
        )
    ],
)
def test_json_ingestor_generator(file_content, expected_output, mocker):
    mocker.patch("builtins.open", mock_open(read_data=file_content))
    mocker.patch("ijson.items", side_effect=lambda f, prefix: iter(expected_output))
    ingestor = JsonIngestor("dummy_path.json")

    assert list(ingestor._json_generator()) == expected_output


@pytest.mark.parametrize(
    "file_content,expected_output",
    [
        (
            '{"column1": "value1", "column2": "value2"}\n{"column1": "value3", "column2": "value4"}',
            [
                {"column1": "value1", "column2": "value2"},
                {"column1": "value3", "column2": "value4"},
            ],
        )
    ],
)
def test_jsonl_ingestor_generator(file_content, expected_output, mocker):
    mocker.patch("builtins.open", mock_open(read_data=file_content))
    mocker.patch(
        "ijson.items",
        side_effect=lambda f, prefix, multiple_values: (
            iter(expected_output) if multiple_values else iter([])
        ),
    )
    ingestor = JsonlIngestor("dummy_path.jsonl")

    assert list(ingestor._jsonl_generator()) == expected_output


@pytest.mark.parametrize(
    "file_content,expected_output",
    [
        (
            "column1,column2\nvalue1,value2\nvalue3,value4",
            [
                {"column1": "value1", "column2": "value2"},
                {"column1": "value3", "column2": "value4"},
            ],
        )
    ],
)
def test_csv_ingestor_generator(file_content, expected_output, mocker):
    mocker.patch("builtins.open", mock_open(read_data=file_content))
    ingestor = CsvIngestor("dummy_path.csv")

    assert list(ingestor._csv_generator()) == expected_output
```
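
These tests pin down a small ingestor contract: `get_ingestor` maps a file-type string to a class, each ingestor exposes a private streaming generator, and `to_dataset` hands that generator to `Dataset.from_generator`. A rough reconstruction of the JSON variant, consistent with the mocks above but otherwise an assumption (the real module also provides jsonl, csv, and HuggingFace variants):

```python
# Hypothetical sketch of llmtune.data.ingestor internals; only the names
# exercised above (get_ingestor, JsonIngestor._json_generator, to_dataset)
# are confirmed by this commit.
import ijson
from datasets import Dataset


class JsonIngestor:
    def __init__(self, path: str):
        self.path = path

    def _json_generator(self):
        # Stream items of a top-level JSON array so large files are never
        # fully loaded into memory ("item" is ijson's array-element prefix).
        with open(self.path, "rb") as f:
            for item in ijson.items(f, "item"):
                yield item

    def to_dataset(self) -> Dataset:
        # Pass the generator function itself, which is what
        # mock_dataset.from_generator.assert_called_once_with(...) expects.
        return Dataset.from_generator(self._json_generator)
```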
Lines changed: 28 additions & 0 deletions
```python
import pytest

from llmtune.finetune.generics import Finetune


class MockFinetune(Finetune):
    def finetune(self):
        return "finetuning complete"

    def save_model(self):
        return "model saved"


def test_finetune_method():
    mock_finetuner = MockFinetune()
    result = mock_finetuner.finetune()
    assert result == "finetuning complete"


def test_save_model_method():
    mock_finetuner = MockFinetune()
    result = mock_finetuner.save_model()
    assert result == "model saved"


def test_finetune_abstract_class_instantiation():
    with pytest.raises(TypeError):
        _ = Finetune()
```
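
The `Finetune` base class itself is not part of this diff. Given that direct instantiation raises `TypeError` and that `MockFinetune` only has to supply `finetune` and `save_model`, it is presumably an ABC along these lines (a sketch, not the repository's actual definition):

```python
# Plausible shape of llmtune.finetune.generics.Finetune, inferred from the
# tests above; the real class may define additional methods or state.
from abc import ABC, abstractmethod


class Finetune(ABC):
    @abstractmethod
    def finetune(self):
        """Run the fine-tuning loop."""
        ...

    @abstractmethod
    def save_model(self):
        """Persist the fine-tuned weights."""
        ...
```

Python raises `TypeError` when a class with unimplemented `@abstractmethod` members is instantiated directly, which is exactly the behavior `test_finetune_abstract_class_instantiation` pins down.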
