Commit 78fce45

AutoRoundMLLM supports scheme and fix device_map=dict regression (#801)
1 parent da089e9 commit 78fce45

7 files changed: +108 additions, -101 deletions

README.md

Lines changed: 5 additions & 14 deletions
@@ -251,22 +251,13 @@ is limited. For more information, please refer to the AutoRoundMLLM [readme](./a
 
 ```python
 from auto_round import AutoRoundMLLM
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer
 
-## load the model
-model_name = "Qwen/Qwen2-VL-2B-Instruct"
-model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
-
-## quantize the model
-bits, group_size, sym = 4, 128, True
-autoround = AutoRoundMLLM(model, tokenizer, processor, bits=bits, group_size=group_size, sym=sym)
-autoround.quantize()
-
-# save the quantized model, set format='auto_gptq' or 'auto_awq' to use other formats
+# Load the model
+model_name_or_path = "Qwen/Qwen2.5-VL-7B-Instruct"
+# Quantize the model
+ar = AutoRoundMLLM(model_name_or_path, scheme="W4A16")
 output_dir = "./tmp_autoround"
-autoround.save_quantized(output_dir, format="auto_round", inplace=True)
+ar.quantize_and_save(output_dir)
 ```
 
 </details>
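The removed comment noted that other export formats can be selected (format='auto_gptq' or 'auto_awq'). With the new one-call API that choice would presumably be passed to quantize_and_save instead of save_quantized; the sketch below assumes quantize_and_save accepts the same format keyword, which this diff does not show.

```python
from auto_round import AutoRoundMLLM

# Sketch only: assumes quantize_and_save forwards a `format` keyword
# the way save_quantized did; not shown in this commit's README diff.
ar = AutoRoundMLLM("Qwen/Qwen2.5-VL-7B-Instruct", scheme="W4A16")
ar.quantize_and_save("./tmp_autoround", format="auto_round")  # or "auto_gptq" / "auto_awq"
```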

auto_round/autoround.py

Lines changed: 25 additions & 18 deletions
@@ -247,24 +247,7 @@ def __init__(
             device_map = 0
 
         # Set device, must place after model loading
-        if isinstance(device_map, (str, torch.device, int)):
-            self.device = detect_device(device_map)
-
-        elif isinstance(device_map, dict) and device_map:
-            tmp_devices = []
-            for val in device_map.values():
-                if isinstance(val, (str, torch.device, int)):  # could optimize
-                    tmp_device = detect_device(self.device_map)
-                    tmp_device = tmp_device.split(":")[0]
-                    tmp_devices.append(tmp_device)
-            tmp_devices = list(set(tmp_devices))
-            if len(tmp_devices) > 1:
-                logger.warning(
-                    f"there are multiple device types in the device_map, "
-                    f"please make sure they are correct,use the first device {tmp_devices[0]} as the core device "
-                )
-
-            self.device = tmp_devices[0]
+        self._set_device(device_map)
 
         if (isinstance(device_map, dict) and device_map) or device_map == "auto":
             self.device_map = device_map
@@ -386,6 +369,30 @@ def __init__(
             import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
             import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401]
 
+    def _set_device(self, device_map):
+        if hasattr(self, "device") and self.device is not None:
+            return
+        if isinstance(device_map, (str, torch.device, int)):
+            self.device = detect_device(device_map)
+
+        elif isinstance(device_map, dict) and device_map:
+            tmp_devices = []
+            for val in device_map.values():
+                if isinstance(val, (str, torch.device, int)):  # could optimize
+                    tmp_device = detect_device(val)
+                    tmp_device = tmp_device.split(":")[0]
+                    tmp_devices.append(tmp_device)
+            tmp_devices = list(set(tmp_devices))
+            if len(tmp_devices) > 1:
+                logger.warning(
+                    f"there are multiple device types in the device_map, "
+                    f"please make sure they are correct,use the first device {tmp_devices[0]} as the core device "
+                )
+
+            self.device = tmp_devices[0]
+        else:
+            raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}")
+
     def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]]) -> None:
         """Parse and set the layer-wise quantization configuration."""
         # Some other quantization configs
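The regression fixed here is visible in the removed block: for a dict device_map the old loop called detect_device(self.device_map) on the whole mapping instead of detect_device(val) on each entry, so per-module placements such as {"model.visual.blocks": 1} never resolved to a usable core device. Below is a standalone sketch of the corrected dict handling; detect_device is stubbed out (the real helper lives in auto_round) and pick_core_device is an invented name for illustration, not the library API.

```python
import torch


def detect_device(device):
    # Stub for auto_round's detect_device: normalize int/str/torch.device to a device string.
    if isinstance(device, int):
        return f"cuda:{device}" if torch.cuda.is_available() else "cpu"
    return str(torch.device(device))


def pick_core_device(device_map: dict) -> str:
    """Pick one core device type from a module-name -> device mapping."""
    device_types = []
    for val in device_map.values():
        if isinstance(val, (str, torch.device, int)):
            # The fix: resolve each value, not the whole device_map dict.
            device_types.append(detect_device(val).split(":")[0])
    # Deduplicate; the real code warns when more than one device type appears.
    return sorted(set(device_types))[0]


print(pick_core_device({"model.language_model.layers": 0, "model.visual.blocks": 1}))
# -> "cuda" on a CUDA machine, "cpu" otherwise
```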

auto_round/mllm/autoround_mllm.py

Lines changed: 39 additions & 63 deletions
@@ -22,6 +22,7 @@
 from auto_round.low_cpu_mem.utils import get_layers_before_block
 from auto_round.mllm.mllm_dataset import get_mllm_dataloader
 from auto_round.mllm.template import Template, get_template
+from auto_round.schemes import QuantizationScheme
 from auto_round.special_model_handler import (
     NOT_SUPPORT_ONLY_TEXT_MODELS,
     SUPPORT_ONLY_TEXT_MODELS,
@@ -126,61 +127,56 @@ class AutoRoundMLLM(AutoRound):
 
     """
 
+    bits: int | None
+    group_size: int | None
+    sym: bool | None
+    data_type: str | None
+    act_bits: int | None
+    act_group_size: int | None
+    act_sym: bool | None
+    act_data_type: str | None
+    act_dynamic: bool | None
+    super_bits: int | None
+    super_group_size: int | None
+
     def __init__(
         self,
         model: Union[torch.nn.Module, str],
         tokenizer=None,
         processor=None,
         image_processor=None,
-        bits: int = 4,
-        group_size: int = 128,
-        sym: bool = True,
-        layer_config: dict = None,
-        batch_size: int = 8,
-        amp: bool = True,
-        device: Union[str, torch.device, int] = 0,
-        lr_scheduler=None,
-        dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = None,
-        extra_data_dir: str = None,
-        template: Union[str, Template] = None,
+        scheme: Union[str, dict, QuantizationScheme] = "W4A16",
+        layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
+        dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
         quant_nontext_module: bool = False,
-        enable_quanted_input: bool = True,
-        enable_minmax_tuning: bool = True,
-        lr: float = None,
-        minmax_lr: float = None,
-        low_gpu_mem_usage: bool = False,
-        low_cpu_mem_usage: bool = False,
         iters: int = 200,
-        seqlen: int = None,
+        seqlen: int = 2048,
         nsamples: int = 128,
-        sampler: str = "rand",
-        seed: int = 42,
-        nblocks: int = 1,
+        batch_size: int = 8,
         gradient_accumulate_steps: int = 1,
-        not_use_best_mse: bool = False,
-        dynamic_max_gap: int = -1,
-        data_type: str = "int",
-        scale_dtype: str = "fp16",
-        act_bits: int = 32,
-        act_group_size: int = None,
-        act_sym: bool = None,
-        act_dynamic: bool = True,
-        to_quant_block_names: Union[str, list] = None,
-        enable_norm_bias_tuning: bool = False,
-        truncation: bool = None,
+        low_gpu_mem_usage: bool = False,
+        device_map: Union[str, torch.device, int, dict] = 0,
         enable_torch_compile: bool = False,
-        model_kwargs: dict = None,
+        seed: int = 42,
         **kwargs,
     ):
+        extra_data_dir = kwargs.pop("extra_data_dir", None)
+        template = kwargs.pop("template", None)
+
+        to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None)
+        if device_map is None:
+            device_map = 0
+        self._set_device(device_map)
+
         if isinstance(model, str):
-            model, processor, tokenizer, image_processor = mllm_load_model(model, device=device)
+            model, processor, tokenizer, image_processor = mllm_load_model(model, device=self.device)
 
+        self.model = model
         quant_nontext_module = self._check_quant_nontext(layer_config, quant_nontext_module)
         all_blocks = get_block_names(model, quant_nontext_module)
         self.quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names)
         if to_quant_block_names is None:
             to_quant_block_names = extract_block_names_to_str(self.quant_block_list)
-        self.to_quant_block_names = to_quant_block_names
         self.extra_data_dir = extra_data_dir
         self.quant_nontext_module = quant_nontext_module
         self.processor = processor
@@ -219,7 +215,7 @@ def __init__(
                     " switching to liuhaotian/llava_conv_58k"
                 )
                 dataset = "liuhaotian/llava_conv_58k"
-            elif not _only_text_test(model, tokenizer, device, self.template.model_type):
+            elif not _only_text_test(model, tokenizer, self.device, self.template.model_type):
                 logger.warning(
                     f"{model.config.model_type} does not support for {dataset},"
                     " will use liuhaotian/llava_conv_58k with default config as an alternative."
@@ -248,7 +244,7 @@ def __init__(
             gradient_accumulate_steps = batch_size * gradient_accumulate_steps
             batch_size = 1
         seqlen = 2048 if seqlen is None else seqlen
-        truncation = True if truncation is None else truncation
+        truncation = True
         self.truncation = truncation
 
         if nsamples % batch_size != 0:
@@ -258,40 +254,20 @@
         super(AutoRoundMLLM, self).__init__(
             model=model,
             tokenizer=tokenizer,
-            bits=bits,
-            group_size=group_size,
-            sym=sym,
+            scheme=scheme,
             layer_config=layer_config,
-            batch_size=batch_size,
-            amp=amp,
-            device=device,
-            lr_scheduler=lr_scheduler,
             dataset=dataset,
-            enable_quanted_input=enable_quanted_input,
-            enable_minmax_tuning=enable_minmax_tuning,
-            lr=lr,
-            minmax_lr=minmax_lr,
-            low_gpu_mem_usage=low_gpu_mem_usage,
-            low_cpu_mem_usage=low_cpu_mem_usage,
             iters=iters,
             seqlen=seqlen,
             nsamples=nsamples,
-            sampler=sampler,
-            seed=seed,
-            nblocks=nblocks,
+            batch_size=batch_size,
             gradient_accumulate_steps=gradient_accumulate_steps,
-            not_use_best_mse=not_use_best_mse,
-            dynamic_max_gap=dynamic_max_gap,
-            data_type=data_type,
-            scale_dtype=scale_dtype,
-            act_bits=act_bits,
-            act_group_size=act_group_size,
-            act_sym=act_sym,
-            act_dynamic=act_dynamic,
-            to_quant_block_names=self.to_quant_block_names,
-            enable_norm_bias_tuning=enable_norm_bias_tuning,
+            low_gpu_mem_usage=low_gpu_mem_usage,
+            device_map=device_map,
             enable_torch_compile=enable_torch_compile,
+            seed=seed,
             vlm=True,
+            to_quant_block_names=to_quant_block_names,
             **kwargs,
         )
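With this refactor the MLLM entry point mirrors AutoRound: per-tensor settings come from scheme rather than individual bits/group_size/sym/act_* arguments, device is replaced by device_map, and the tuning options dropped from the explicit signature still reach the base class through **kwargs. A minimal sketch of the new call shape; the model name, device indices, and module paths are illustrative (the module paths follow the Qwen2-VL layout used in the multi-card test further down).

```python
from auto_round import AutoRoundMLLM

# Illustrative values only; module paths follow the Qwen2-VL layout.
ar = AutoRoundMLLM(
    "Qwen/Qwen2-VL-2B-Instruct",   # a string triggers internal model/tokenizer/processor loading
    scheme="W4A16",                # replaces bits/group_size/sym/data_type/act_* arguments
    device_map={"model.language_model.layers": 0, "model.visual.blocks": 1},
    nsamples=128,
    iters=200,
)
ar.quantize_and_save("./tmp_autoround")
```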

auto_round/script/llm.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def __init__(self, *args, **kwargs):
             "--scheme",
             default="W4A16",
             type=str,
-            # choices=["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FPW8_STATIC"],
+            # choices=["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"],
             help="quantization scheme",
         )

test/test_cpu/test_scheme.py

Lines changed: 9 additions & 3 deletions
@@ -4,11 +4,9 @@
 
 import torch
 
-from auto_round.schemes import QuantizationScheme
-
 sys.path.insert(0, "../..")
-
 from auto_round import AutoRound
+from auto_round.schemes import QuantizationScheme
 
 
 class LLMDataLoader:
@@ -56,6 +54,14 @@ def test_mxfp4(self):
         self.assertEqual(ar.act_data_type, "mx_fp_rceil")
         ar.quantize()
 
+    def test_vllm(self):
+        from auto_round import AutoRoundMLLM
+
+        ar = AutoRoundMLLM("Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16", nsamples=1, iters=1, seqlen=2)
+        self.assertEqual(ar.bits, 2)
+        self.assertEqual(ar.act_bits, 16)
+        ar.quantize()
+
     def test_scheme_in_layer_config(self):
         layer_config = {
             "model.decoder.layers.2.self_attn": {"bits": 2},

test/test_cuda/test_multiple_card.py

Lines changed: 27 additions & 0 deletions
@@ -308,6 +308,33 @@ def test_device_map_for_triton(self):
         del model
         torch.cuda.empty_cache()
 
+    @multi_card
+    def test_mllm_device_map(self):
+        model_name = "/models/Qwen2-VL-2B-Instruct/"
+        from auto_round import AutoRoundMLLM
+
+        device_map = "0,1"
+        ar = AutoRoundMLLM(model_name, device_map=device_map)
+        self.assertEqual(ar.device, "cuda:0")
+        self.assertEqual(ar.device_map, "auto")
+        self.assertEqual(ar.device_list, [0, 1])
+
+        device_map = 1
+        ar = AutoRoundMLLM(ar.model, ar.tokenizer, ar.processor, device_map=device_map)
+        self.assertEqual(ar.device, "cuda:1")
+        self.assertEqual(ar.device_map, None)
+        self.assertFalse(hasattr(ar, "device_list"))
+
+        device_map = "auto"
+        ar = AutoRoundMLLM(ar.model, ar.tokenizer, ar.processor, device_map=device_map)
+        self.assertEqual(ar.device, "cuda")
+        self.assertEqual(ar.device_map, "auto")
+
+        device_map = {"model.language_model.layers": 0, "model.visual.blocks": 1}
+        ar = AutoRoundMLLM(ar.model, ar.tokenizer, ar.processor, device_map=device_map)
+        self.assertEqual(ar.model.model.language_model.layers.tuning_device, "cuda:0")
+        self.assertEqual(ar.model.model.visual.blocks.tuning_device, "cuda:1")
+
 
 if __name__ == "__main__":
     unittest.main()

test/test_cuda/test_scheme.py

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ def test_mxfp4(self):
         ar.quantize()
 
     def test_fp8_static(self):
-        ar = AutoRound(self.model_name, scheme="FPW8_STATIC", nsamples=1, iters=1)
+        ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=1)
         self.assertEqual(ar.bits, 8)
         self.assertEqual(ar.act_bits, 8)
         self.assertEqual(ar.data_type, "fp")
@@ -70,7 +70,7 @@ def test_mxfp4_rtn(self):
         ar.quantize()
 
     def test_fp8_static_rtn(self):
-        ar = AutoRound(self.model_name, scheme="FPW8_STATIC", nsamples=1, iters=0)
+        ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=0)
         self.assertEqual(ar.bits, 8)
         self.assertEqual(ar.act_bits, 8)
         self.assertEqual(ar.data_type, "fp")
