-
Notifications
You must be signed in to change notification settings - Fork 458
Expand file tree
/
Copy pathtest_consecutive_runs.py
More file actions
157 lines (127 loc) · 5.34 KB
/
test_consecutive_runs.py
File metadata and controls
157 lines (127 loc) · 5.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import shutil
import unittest
from pathlib import Path
import pytest
import yaml
from parameterized import parameterized_class
from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import CompressedTensorsConfig
from llmcompressor.transformers.utils import is_model_ct_quantized_from_path
from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path
from tests.testing_utils import parse_params, requires_gpu
CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs"
GPU_CONFIGS_DIRECTORY = (
"tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs/gpu"
)
class TestConsecutiveRuns(unittest.TestCase):
    """Base case for verifying that two consecutive oneshot runs compose.

    The first run applies ~50% sparsity plus quantization; the second run
    reloads the saved model, raises sparsity to ~70%, and must append its
    recipe stage to the first run's recipe so the final checkpoint carries
    both stages. Subclasses supply ``model``, ``dataset``, ``first_recipe``,
    ``second_recipe``, ``device``, and output paths via ``setUp``.
    """

    # Load checkpoints decompressed so the sparsity/QAT checks below
    # operate on real (uncompressed) weight tensors.
    quantization_config = CompressedTensorsConfig(run_compressed=False)

    def _test_consecutive_runs(
        self, tolerance: float, num_calibration_samples: int = 16
    ):
        """Run oneshot twice and assert sparsity, QAT state, and recipe stages.

        :param tolerance: relative tolerance for the sparsity checks
            (loose for GPU configs, tight for small CPU configs)
        :param num_calibration_samples: calibration samples per oneshot run
        """
        import math

        from llmcompressor.core import active_session
        from llmcompressor.pytorch.model_load.helpers import initialize_recipe
        from llmcompressor.pytorch.utils.helpers import tensor_sparsity
        from llmcompressor.transformers import oneshot
        from llmcompressor.utils.pytorch import qat_active

        # First run: recipe with 50% sparsity, quantization and smoothquant.
        # clear_sparse_session=False keeps the recipe in the active session
        # so the second run can stack on top of it.
        oneshot(
            model=self.model,
            dataset=self.dataset,
            num_calibration_samples=num_calibration_samples,
            recipe=self.first_recipe,
            output_dir=self.output_first,
            oneshot_device=self.device,
            clear_sparse_session=False,
        )

        first_model = AutoModelForCausalLM.from_pretrained(
            self.output_first,
            device_map="auto",
            quantization_config=self.quantization_config,
        )

        # Layer 0 k_proj should be ~50% sparse and quantization-aware.
        layer_0_sparse = tensor_sparsity(
            first_model.model.layers[0].self_attn.k_proj.weight
        )
        assert math.isclose(layer_0_sparse.item(), 0.5, rel_tol=tolerance)
        assert qat_active(first_model)

        session = active_session()
        session_recipe = session.lifecycle.recipe_container.compiled_recipe
        stages = [stage.group for stage in session_recipe.stages]
        self.assertEqual(len(stages), 1)
        session.reset()

        # Re-apply the saved recipe so the reloaded model's state matches
        # what the second run expects to start from.
        recipe = infer_recipe_from_model_path(model_path=self.output_first)
        if recipe:
            initialize_recipe(model=first_model, recipe_path=recipe)

        # Second run: reload saved model and raise sparsity to 0.7.
        oneshot(
            model=self.output_first,
            dataset=self.dataset,
            num_calibration_samples=num_calibration_samples,
            recipe=self.second_recipe,
            output_dir=self.output_second,
            oneshot_device=self.device,
        )

        second_model = AutoModelForCausalLM.from_pretrained(
            self.output_second,
            device_map="auto",
            quantization_config=self.quantization_config,
        )

        layer_0_sparse = tensor_sparsity(
            second_model.model.layers[0].self_attn.k_proj.weight
        )
        assert math.isclose(layer_0_sparse.item(), 0.7, rel_tol=tolerance)
        assert qat_active(second_model)

        # The session must now hold both stages (first + second run).
        session = active_session()
        session_recipe = session.lifecycle.recipe_container.compiled_recipe
        stages = [stage.group for stage in session_recipe.stages]
        self.assertEqual(len(stages), 2)

        # The saved recipe.yaml must also contain both stage keys.
        recipe_path = self.output_second / "recipe.yaml"
        recipe_data = yaml.safe_load(recipe_path.read_text())
        stage_keys = recipe_data.keys()
        self.assertEqual(len(stage_keys), 2)
        self.assertIn("test_stage_0", stage_keys)
        self.assertIn("test_stage_1", stage_keys)

    def tearDown(self):
        # ignore_errors: if setUp or the test failed before the output
        # directory was created, a bare rmtree would raise
        # FileNotFoundError and mask the original failure.
        shutil.rmtree(self.output, ignore_errors=True)
@pytest.mark.integration
@parameterized_class(parse_params(CONFIGS_DIRECTORY))
class TestConsecutiveRunsSmall(TestConsecutiveRuns):
    """Small (CPU-friendly) consecutive-runs case, parameterized by the
    config files under CONFIGS_DIRECTORY."""

    # Populated by parameterized_class from the config files
    model = None
    first_recipe = None
    second_recipe = None
    dataset = None

    def setUp(self):
        import torch

        # Use the first GPU when available; otherwise fall back to CPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

        self.output = "./oneshot_output"
        output_root = Path(self.output)
        self.output_first = output_root / "test_1"
        self.output_second = output_root / "test_2"

    def test_consecutive_runs_small(self):
        # Tight tolerance: small models on CPU are deterministic enough.
        self._test_consecutive_runs(tolerance=1e-3)
# TODO: @Satrat and @dsikka, revisit if we want these nightly or weekly
@requires_gpu
@pytest.mark.integration
@parameterized_class(parse_params(GPU_CONFIGS_DIRECTORY))
class TestConsecutiveRunsGPU(TestConsecutiveRuns):
    """GPU consecutive-runs case, parameterized by GPU_CONFIGS_DIRECTORY."""

    # Will be populated using the config files
    model = None
    first_recipe = None
    second_recipe = None
    dataset = None
    device = None

    def setUp(self):
        """Load the configured checkpoint onto the configured device.

        ``self.model`` starts as a path string from the config and is
        replaced here by the loaded model instance.
        """
        kwargs = {}
        # If an already-quantized checkpoint is passed, load it decompressed
        # (run_compressed=False) so it uses plain Linear modules.
        if is_model_ct_quantized_from_path(self.model):
            kwargs["quantization_config"] = self.quantization_config

        # NOTE: uses the module-level AutoModelForCausalLM import; the
        # previous function-scope re-import was redundant and was removed.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model,
            device_map=self.device,
            **kwargs,
        )

        self.output = "./oneshot_output"
        self.output_first = Path(self.output) / "test_1"
        self.output_second = Path(self.output) / "test_2"

    def test_consecutive_runs_gpu(self):
        # Loose tolerance: larger GPU models show more sparsity drift.
        self._test_consecutive_runs(tolerance=1e-0, num_calibration_samples=16)