-
Notifications
You must be signed in to change notification settings - Fork 458
Expand file tree
/
Copy pathtest_consecutive_runs.py
More file actions
157 lines (127 loc) · 5.34 KB
/
test_consecutive_runs.py
File metadata and controls
157 lines (127 loc) · 5.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import shutil
import unittest
from pathlib import Path
import pytest
import yaml
from parameterized import parameterized_class
from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import CompressedTensorsConfig
from llmcompressor.transformers.utils import is_model_ct_quantized_from_path
from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path
from tests.testing_utils import parse_params, requires_gpu
CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs"
GPU_CONFIGS_DIRECTORY = (
"tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/gpu"
)
class TestConsecutiveRuns(unittest.TestCase):
    """Base case for verifying that two consecutive oneshot runs compose.

    The first run applies ~50% sparsity plus quantization; the second run
    reloads the saved model, raises sparsity to ~70%, and must append its
    recipe stage to the first run's recipe so the final checkpoint carries
    both stages. Subclasses supply ``model``, ``dataset``, ``first_recipe``,
    ``second_recipe``, ``device``, and output paths via ``setUp``.
    """

    # Load checkpoints decompressed so the sparsity/QAT checks below
    # operate on real (uncompressed) weight tensors.
    quantization_config = CompressedTensorsConfig(run_compressed=False)

    def _test_consecutive_runs(
        self, tolerance: float, num_calibration_samples: int = 16
    ):
        """Run oneshot twice and assert sparsity, QAT state, and recipe stages.

        :param tolerance: relative tolerance for the sparsity checks
            (loose for GPU configs, tight for small CPU configs)
        :param num_calibration_samples: calibration samples per oneshot run
        """
        import math

        from llmcompressor.core import active_session
        from llmcompressor.pytorch.model_load.helpers import initialize_recipe
        from llmcompressor.pytorch.utils.helpers import tensor_sparsity
        from llmcompressor.transformers import oneshot
        from llmcompressor.utils.pytorch import qat_active

        # First run: recipe with 50% sparsity, quantization and smoothquant.
        # clear_sparse_session=False keeps the recipe in the active session
        # so the second run can stack on top of it.
        oneshot(
            model=self.model,
            dataset=self.dataset,
            num_calibration_samples=num_calibration_samples,
            recipe=self.first_recipe,
            output_dir=self.output_first,
            oneshot_device=self.device,
            clear_sparse_session=False,
        )

        first_model = AutoModelForCausalLM.from_pretrained(
            self.output_first,
            device_map="auto",
            quantization_config=self.quantization_config,
        )

        # Layer 0 k_proj should be ~50% sparse and quantization-aware.
        layer_0_sparse = tensor_sparsity(
            first_model.model.layers[0].self_attn.k_proj.weight
        )
        assert math.isclose(layer_0_sparse.item(), 0.5, rel_tol=tolerance)
        assert qat_active(first_model)

        session = active_session()
        session_recipe = session.lifecycle.recipe_container.compiled_recipe
        stages = [stage.group for stage in session_recipe.stages]
        self.assertEqual(len(stages), 1)
        session.reset()

        # Re-apply the saved recipe so the reloaded model's state matches
        # what the second run expects to start from.
        recipe = infer_recipe_from_model_path(model_path=self.output_first)
        if recipe:
            initialize_recipe(model=first_model, recipe_path=recipe)

        # Second run: reload saved model and raise sparsity to 0.7.
        oneshot(
            model=self.output_first,
            dataset=self.dataset,
            num_calibration_samples=num_calibration_samples,
            recipe=self.second_recipe,
            output_dir=self.output_second,
            oneshot_device=self.device,
        )

        second_model = AutoModelForCausalLM.from_pretrained(
            self.output_second,
            device_map="auto",
            quantization_config=self.quantization_config,
        )

        layer_0_sparse = tensor_sparsity(
            second_model.model.layers[0].self_attn.k_proj.weight
        )
        assert math.isclose(layer_0_sparse.item(), 0.7, rel_tol=tolerance)
        assert qat_active(second_model)

        # The session must now hold both stages (first + second run).
        session = active_session()
        session_recipe = session.lifecycle.recipe_container.compiled_recipe
        stages = [stage.group for stage in session_recipe.stages]
        self.assertEqual(len(stages), 2)

        # The saved recipe.yaml must also contain both stage keys.
        recipe_path = self.output_second / "recipe.yaml"
        recipe_data = yaml.safe_load(recipe_path.read_text())
        stage_keys = recipe_data.keys()
        self.assertEqual(len(stage_keys), 2)
        self.assertIn("test_stage_0", stage_keys)
        self.assertIn("test_stage_1", stage_keys)

    def tearDown(self):
        # ignore_errors: if setUp or the test failed before the output
        # directory was created, a bare rmtree would raise
        # FileNotFoundError and mask the original failure.
        shutil.rmtree(self.output, ignore_errors=True)
@pytest.mark.integration
@parameterized_class(parse_params(CONFIGS_DIRECTORY))
class TestConsecutiveRunsSmall(TestConsecutiveRuns):
    """Small (CPU-friendly) consecutive-runs case, parameterized by the
    config files under CONFIGS_DIRECTORY."""

    # Populated by parameterized_class from the config files
    model = None
    first_recipe = None
    second_recipe = None
    dataset = None

    def setUp(self):
        import torch

        # Use the first GPU when available; otherwise fall back to CPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

        self.output = "./oneshot_output"
        output_root = Path(self.output)
        self.output_first = output_root / "test_1"
        self.output_second = output_root / "test_2"

    def test_consecutive_runs_small(self):
        # Tight tolerance: small models on CPU are deterministic enough.
        self._test_consecutive_runs(tolerance=1e-3)
# TODO: @Satrat and @dsikka, revisit if we want these nightly or weekly
@requires_gpu
@pytest.mark.integration
@parameterized_class(parse_params(GPU_CONFIGS_DIRECTORY))
class TestConsecutiveRunsGPU(TestConsecutiveRuns):
    """GPU consecutive-runs case, parameterized by GPU_CONFIGS_DIRECTORY."""

    # Will be populated using the config files
    model = None
    first_recipe = None
    second_recipe = None
    dataset = None
    device = None

    def setUp(self):
        """Load the configured checkpoint onto the configured device.

        ``self.model`` starts as a path string from the config and is
        replaced here by the loaded model instance.
        """
        kwargs = {}
        # If an already-quantized checkpoint is passed, load it decompressed
        # (run_compressed=False) so it uses plain Linear modules.
        if is_model_ct_quantized_from_path(self.model):
            kwargs["quantization_config"] = self.quantization_config

        # NOTE: uses the module-level AutoModelForCausalLM import; the
        # previous function-scope re-import was redundant and was removed.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model,
            device_map=self.device,
            **kwargs,
        )

        self.output = "./oneshot_output"
        self.output_first = Path(self.output) / "test_1"
        self.output_second = Path(self.output) / "test_2"

    def test_consecutive_runs_gpu(self):
        # Loose tolerance: larger GPU models show more sparsity drift.
        self._test_consecutive_runs(tolerance=1e-0, num_calibration_samples=16)