
Commit 2188a60

[Misc] Update GPTQ to use vLLMParameters (vllm-project#7976)
1 parent dc0b606 · commit 2188a60

File tree (6 files changed, +93 -62 lines):

tests/weight_loading/models.txt
tests/weight_loading/test_weight_loading.py
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/quantization/gptq.py
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/parameter.py

tests/weight_loading/models.txt

Lines changed: 6 additions & 0 deletions
@@ -4,6 +4,12 @@ gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, main
 gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit--1g-actorder_True
 gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True
 gptq_marlin, TechxGenus/gemma-1.1-2b-it-GPTQ, main
+gptq, robertgshaw2/zephyr-7b-beta-channelwise-gptq, main
+gptq, TheBloke/Llama-2-7B-GPTQ, main
+gptq, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, main
+gptq, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit--1g-actorder_True
+gptq, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True
+gptq, TechxGenus/gemma-1.1-2b-it-GPTQ, main
 compressed-tensors, nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change, main
 compressed-tensors, nm-testing/tinyllama-oneshot-w8-channel-a8-tensor, main
 compressed-tensors, nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2, main

tests/weight_loading/test_weight_loading.py

Lines changed: 6 additions & 1 deletion
@@ -1,5 +1,7 @@
 import os
 
+import torch
+
 MAX_MODEL_LEN = 1024
 MODEL_NAME = os.environ.get("MODEL_NAME",
                             "robertgshaw2/zephyr-7b-beta-channelwise-gptq")
@@ -8,9 +10,12 @@
 
 
 def test_weight_loading(vllm_runner):
+    """
+    Test parameter weight loading with tp>1.
+    """
     with vllm_runner(model_name=MODEL_NAME,
                      revision=REVISION,
-                     dtype="auto",
+                     dtype=torch.half if QUANTIZATION == "gptq" else "auto",
                      quantization=QUANTIZATION,
                      max_model_len=MAX_MODEL_LEN,
                      tensor_parallel_size=2) as model:
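For context, the test picks up its model and quantization scheme from environment variables (MODEL_NAME, REVISION, and QUANTIZATION appear in the module). A minimal sketch of driving it against one of the gptq rows added to models.txt above; the pytest invocation and the concrete variable values are assumptions for illustration, not part of the commit:

```python
import os
import subprocess

# Illustrative only: run the tp>1 weight-loading test for a GPTQ model.
# The env var names come from the test module; everything else is assumed.
env = dict(
    os.environ,
    MODEL_NAME="TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
    REVISION="main",
    QUANTIZATION="gptq",
)
subprocess.run(
    ["pytest", "-v", "tests/weight_loading/test_weight_loading.py"],
    env=env,
    check=True,
)
```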

vllm/model_executor/layers/linear.py

Lines changed: 14 additions & 11 deletions
@@ -14,8 +14,10 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
 from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           PackedColumnParameter,
                                            PackedvLLMParameter,
-                                           PerTensorScaleParameter)
+                                           PerTensorScaleParameter,
+                                           RowvLLMParameter)
 from vllm.model_executor.utils import set_weight_attrs
 
 logger = init_logger(__name__)
@@ -24,7 +26,7 @@
     "CompressedTensorsLinearMethod", "AWQMarlinLinearMethod",
     "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod",
     "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod",
-    "TPUInt8LinearMethod"
+    "TPUInt8LinearMethod", "GPTQLinearMethod"
 ]
 
 
@@ -574,8 +576,8 @@ def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter,
         # Special case for Quantization.
         # If quantized, we need to adjust the offset and size to account
         # for the packing.
-        if isinstance(param, PackedvLLMParameter
-                      ) and param.packed_dim == param.output_dim:
+        if isinstance(param, (PackedColumnParameter, PackedvLLMParameter
+                              )) and param.packed_dim == param.output_dim:
             shard_size, shard_offset = \
                 param.adjust_shard_indexes_for_packing(
                     shard_size=shard_size, shard_offset=shard_offset)
@@ -594,9 +596,10 @@ def weight_loader_v2(self,
                 param.load_merged_column_weight(loaded_weight=loaded_weight,
                                                 shard_id=0)
                 return
-            elif type(param) is BasevLLMParameter:
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
                 param.load_merged_column_weight(loaded_weight=loaded_weight)
                 return
+            # TODO: @dsikka - move to parameter.py
             self._load_fused_module_from_checkpoint(param, loaded_weight)
             return
 
@@ -724,8 +727,8 @@ def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter,
         # Special case for Quantization.
         # If quantized, we need to adjust the offset and size to account
         # for the packing.
-        if isinstance(param, PackedvLLMParameter
-                      ) and param.packed_dim == param.output_dim:
+        if isinstance(param, (PackedColumnParameter, PackedvLLMParameter
+                              )) and param.packed_dim == param.output_dim:
             shard_size, shard_offset = \
                 param.adjust_shard_indexes_for_packing(
                     shard_size=shard_size, shard_offset=shard_offset)
@@ -741,12 +744,12 @@ def weight_loader_v2(self,
                          loaded_shard_id: Optional[str] = None):
         if loaded_shard_id is None:  # special case for certain models
             if isinstance(param, PerTensorScaleParameter):
-                param.load_merged_column_weight(loaded_weight=loaded_weight,
-                                                shard_id=0)
+                param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0)
                 return
-            elif type(param) is BasevLLMParameter:
-                param.load_merged_column_weight(loaded_weight=loaded_weight)
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
+                param.load_qkv_weight(loaded_weight=loaded_weight)
                 return
+            # TODO: @dsikka - move to parameter.py
             self._load_fused_module_from_checkpoint(param, loaded_weight)
             return
 
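The PackedColumnParameter/PackedvLLMParameter branch above rescales shard sizes and offsets before copying, because packed int32 storage holds several logical values per word. A standalone sketch of that arithmetic; the helper below is illustrative and is not vLLM's adjust_shard_indexes_for_packing, though it assumes the same Fraction-style pack factor:

```python
from fractions import Fraction


def adjust_for_packing(shard_size: int, shard_offset: int,
                       packed_factor: Fraction) -> tuple[int, int]:
    # Each int32 word stores `packed_factor` quantized values, so indexes
    # along the packed dim shrink by that factor.
    return int(shard_size // packed_factor), int(shard_offset // packed_factor)


# e.g. a 4-bit GPTQ checkpoint packs 32 // 4 = 8 values per int32 word:
print(adjust_for_packing(4096, 8192, Fraction(32, 4)))  # (512, 1024)
```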

vllm/model_executor/layers/quantization/gptq.py

Lines changed: 58 additions & 45 deletions
@@ -11,7 +11,11 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
+                                           GroupQuantScaleParameter,
+                                           PackedColumnParameter,
+                                           PackedvLLMParameter,
+                                           RowvLLMParameter)
 
 
 class GPTQConfig(QuantizationConfig):
@@ -108,6 +112,7 @@ def create_weights(
         **extra_weight_attrs,
     ):
         del output_size  # Unused.
+        weight_loader = extra_weight_attrs.get("weight_loader")
         if input_size_per_partition % self.quant_config.group_size != 0:
             raise ValueError(
                 "The input size is not aligned with the quantized "
@@ -138,73 +143,81 @@ def create_weights(
         scale_and_zero_size = input_size_per_partition // group_size
         scale_and_zero_input_dim = 0
 
-        qweight = Parameter(
-            torch.empty(
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
                 input_size_per_partition // self.quant_config.pack_factor,
                 output_size_per_partition,
                 dtype=torch.int32,
             ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            qweight, {
-                "input_dim": 0,
-                "output_dim": 1,
-                "packed_dim": 0,
-                "pack_factor": self.quant_config.pack_factor,
-            })
-        g_idx = Parameter(
-            torch.tensor(
-                [
-                    i // self.quant_config.group_size
-                    for i in range(input_size_per_partition)
-                ],
-                dtype=torch.int32,
-            ),
-            requires_grad=False,
-        )
-        # Ignore warning from fused linear layers such as QKVParallelLinear.
-        set_weight_attrs(g_idx, {"input_dim": 0, "ignore_warning": True})
-        qzeros = Parameter(
+            input_dim=0,
+            output_dim=1,
+            packed_dim=0,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader)
+
+        g_idx = RowvLLMParameter(data=torch.tensor(
+            [
+                i // self.quant_config.group_size
+                for i in range(input_size_per_partition)
+            ],
+            dtype=torch.int32,
+        ),
+                                 input_dim=0,
+                                 weight_loader=weight_loader)
+        qzeros_args = {
+            "data":
             torch.empty(
                 scale_and_zero_size,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
             ),
-            requires_grad=False,
-        )
-        set_weight_attrs(
-            qzeros, {
-                "input_dim": scale_and_zero_input_dim,
-                "output_dim": 1,
-                "packed_dim": 1,
-                "pack_factor": self.quant_config.pack_factor,
-            })
-        scales = Parameter(
+            "weight_loader":
+            weight_loader
+        }
+        weight_scale_args = {
+            "data":
             torch.empty(
                 scale_and_zero_size,
                 output_size_per_partition,
                 dtype=params_dtype,
             ),
-            requires_grad=False,
-        )
-        set_weight_attrs(scales, {
-            "input_dim": scale_and_zero_input_dim,
-            "output_dim": 1,
-        })
+            "weight_loader":
+            weight_loader
+        }
+        if scale_and_zero_input_dim is None:
+            scales = ChannelQuantScaleParameter(output_dim=1,
+                                                **weight_scale_args)
+            qzeros = PackedColumnParameter(
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args)
+
+        else:
+            scales = GroupQuantScaleParameter(output_dim=1,
+                                              input_dim=0,
+                                              **weight_scale_args)
+            qzeros = PackedvLLMParameter(
+                input_dim=0,
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args)
 
         layer.register_parameter("qweight", qweight)
-        set_weight_attrs(qweight, extra_weight_attrs)
         layer.register_parameter("g_idx", g_idx)
-        set_weight_attrs(g_idx, extra_weight_attrs)
         layer.register_parameter("qzeros", qzeros)
-        set_weight_attrs(qzeros, extra_weight_attrs)
         layer.register_parameter("scales", scales)
-        set_weight_attrs(scales, extra_weight_attrs)
 
         layer.exllama_state = exllama_state
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # for torch.compile
+        layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
+        layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False)
+        layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
+        layer.g_idx = Parameter(layer.g_idx.data, requires_grad=False)
+
         # exllama needs to shuffle the weight after the weight is loaded
         # here we do the shuffle on first forward pass
         if layer.exllama_state == ExllamaState.UNINITIALIZED:
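The shapes registered above follow the usual GPTQ packing convention: qweight is stored as int32 with the input dimension divided by the pack factor, and scales/qzeros carry one row per quantization group. A small sketch of those shapes; the Fraction(32, weight_bits) pack factor and the concrete layer sizes are assumptions for illustration:

```python
from fractions import Fraction

import torch

weight_bits = 4
pack_factor = Fraction(32, weight_bits)  # quantized values per int32 word
group_size = 128
input_size_per_partition, output_size_per_partition = 4096, 11008

# qweight: input dim shrunk by the pack factor, stored as int32.
qweight = torch.empty(int(input_size_per_partition // pack_factor),
                      output_size_per_partition, dtype=torch.int32)
# scales: one row per quantization group along the input dim.
scales = torch.empty(input_size_per_partition // group_size,
                     output_size_per_partition, dtype=torch.half)

print(qweight.shape, scales.shape)  # [512, 11008] and [32, 11008]
```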

vllm/model_executor/layers/vocab_parallel_embedding.py

Lines changed: 6 additions & 3 deletions
@@ -10,6 +10,7 @@
     tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding)
+from vllm.model_executor.parameter import BasevLLMParameter
 from vllm.model_executor.utils import set_weight_attrs
 
 DEFAULT_VOCAB_PADDING_SIZE = 64
@@ -370,10 +371,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         # If param packed on the same dim we are sharding on, then
         # need to adjust offsets of loaded weight by pack_factor.
         if packed_dim is not None and packed_dim == output_dim:
+            packed_factor = param.packed_factor if isinstance(
+                param, BasevLLMParameter) else param.pack_factor
             assert loaded_weight.shape[output_dim] == (self.org_vocab_size //
-                                                       param.pack_factor)
-            start_idx = start_idx // param.pack_factor
-            shard_size = shard_size // param.pack_factor
+                                                       param.packed_factor)
+            start_idx = start_idx // packed_factor
+            shard_size = shard_size // packed_factor
         else:
             assert loaded_weight.shape[output_dim] == self.org_vocab_size
 

vllm/model_executor/parameter.py

Lines changed: 3 additions & 2 deletions
@@ -1,3 +1,4 @@
+from fractions import Fraction
 from typing import Callable, Optional, Union
 
 import torch
@@ -257,7 +258,7 @@ class PackedColumnParameter(_ColumnvLLMParameter):
     """
 
     def __init__(self,
-                 packed_factor: int,
+                 packed_factor: Union[int, Fraction],
                  packed_dim: int,
                  marlin_tile_size: Optional[int] = None,
                  **kwargs):
@@ -298,7 +299,7 @@ class PackedvLLMParameter(ModelWeightParameter):
     """
 
    def __init__(self,
-                 packed_factor: int,
+                 packed_factor: Union[int, Fraction],
                  packed_dim: int,
                  marlin_tile_size: Optional[int] = None,
                  **kwargs):
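Widening packed_factor to Union[int, Fraction] matters because a GPTQ pack factor is naturally 32 / weight_bits, which is not an integer for every bit width; keeping it a Fraction keeps the shard arithmetic exact. A quick illustration, where the Fraction(32, bits) construction is an assumption about how callers build the factor:

```python
from fractions import Fraction

# Floor division stays exact even when the pack factor is not an integer
# (e.g. 3-bit packing gives 32/3 values per int32 word).
for bits in (2, 3, 4, 8):
    packed_factor = Fraction(32, bits)
    print(bits, packed_factor, 4096 // packed_factor)
```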
