Commit 78fce45

AutoRoundMLLM supports scheme and fix device_map=dict regression (#801)
1 parent da089e9 commit 78fce45

7 files changed: +108 additions, -101 deletions

README.md

Lines changed: 5 additions & 14 deletions
@@ -251,22 +251,13 @@ is limited. For more information, please refer to the AutoRoundMLLM [readme](./a
 
 ```python
 from auto_round import AutoRoundMLLM
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer
 
-## load the model
-model_name = "Qwen/Qwen2-VL-2B-Instruct"
-model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto")
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
-
-## quantize the model
-bits, group_size, sym = 4, 128, True
-autoround = AutoRoundMLLM(model, tokenizer, processor, bits=bits, group_size=group_size, sym=sym)
-autoround.quantize()
-
-# save the quantized model, set format='auto_gptq' or 'auto_awq' to use other formats
+# Load the model
+model_name_or_path = "Qwen/Qwen2.5-VL-7B-Instruct"
+# Quantize the model
+ar = AutoRoundMLLM(model_name_or_path, scheme="W4A16")
 output_dir = "./tmp_autoround"
-autoround.save_quantized(output_dir, format="auto_round", inplace=True)
+ar.quantize_and_save(output_dir)
 ```
 
 </details>
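The removed comment noted that other export formats can be selected (format='auto_gptq' or 'auto_awq'). With the new one-call API that choice would presumably be passed to quantize_and_save instead of save_quantized; the sketch below assumes quantize_and_save accepts the same format keyword, which this diff does not show.

```python
from auto_round import AutoRoundMLLM

# Sketch only: assumes quantize_and_save forwards a `format` keyword
# the way save_quantized did; not shown in this commit's README diff.
ar = AutoRoundMLLM("Qwen/Qwen2.5-VL-7B-Instruct", scheme="W4A16")
ar.quantize_and_save("./tmp_autoround", format="auto_round")  # or "auto_gptq" / "auto_awq"
```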

auto_round/autoround.py

Lines changed: 25 additions & 18 deletions
@@ -247,24 +247,7 @@ def __init__(
             device_map = 0
 
         # Set device, must place after model loading
-        if isinstance(device_map, (str, torch.device, int)):
-            self.device = detect_device(device_map)
-
-        elif isinstance(device_map, dict) and device_map:
-            tmp_devices = []
-            for val in device_map.values():
-                if isinstance(val, (str, torch.device, int)):  # could optimize
-                    tmp_device = detect_device(self.device_map)
-                    tmp_device = tmp_device.split(":")[0]
-                    tmp_devices.append(tmp_device)
-            tmp_devices = list(set(tmp_devices))
-            if len(tmp_devices) > 1:
-                logger.warning(
-                    f"there are multiple device types in the device_map, "
-                    f"please make sure they are correct,use the first device {tmp_devices[0]} as the core device "
-                )
-
-            self.device = tmp_devices[0]
+        self._set_device(device_map)
 
         if (isinstance(device_map, dict) and device_map) or device_map == "auto":
             self.device_map = device_map
@@ -386,6 +369,30 @@ def __init__(
             import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
             import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401]
 
+    def _set_device(self, device_map):
+        if hasattr(self, "device") and self.device is not None:
+            return
+        if isinstance(device_map, (str, torch.device, int)):
+            self.device = detect_device(device_map)
+
+        elif isinstance(device_map, dict) and device_map:
+            tmp_devices = []
+            for val in device_map.values():
+                if isinstance(val, (str, torch.device, int)):  # could optimize
+                    tmp_device = detect_device(val)
+                    tmp_device = tmp_device.split(":")[0]
+                    tmp_devices.append(tmp_device)
+            tmp_devices = list(set(tmp_devices))
+            if len(tmp_devices) > 1:
+                logger.warning(
+                    f"there are multiple device types in the device_map, "
+                    f"please make sure they are correct,use the first device {tmp_devices[0]} as the core device "
+                )
+
+            self.device = tmp_devices[0]
+        else:
+            raise TypeError(f"device_map should be [str, torch.device, int, dict], but got {type(device_map)}")
+
     def _parse_layer_config(self, layer_config: dict[str, Union[str, dict, QuantizationScheme]]) -> None:
         """Parse and set the layer-wise quantization configuration."""
         # Some other quantization configs
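The regression fixed here is visible in the removed block: for a dict device_map the old loop called detect_device(self.device_map) on the whole mapping instead of detect_device(val) on each entry, so per-module placements such as {"model.visual.blocks": 1} never resolved to a usable core device. Below is a standalone sketch of the corrected dict handling; detect_device is stubbed out (the real helper lives in auto_round) and pick_core_device is an invented name for illustration, not the library API.

```python
import torch


def detect_device(device):
    # Stub for auto_round's detect_device: normalize int/str/torch.device to a device string.
    if isinstance(device, int):
        return f"cuda:{device}" if torch.cuda.is_available() else "cpu"
    return str(torch.device(device))


def pick_core_device(device_map: dict) -> str:
    """Pick one core device type from a module-name -> device mapping."""
    device_types = []
    for val in device_map.values():
        if isinstance(val, (str, torch.device, int)):
            # The fix: resolve each value, not the whole device_map dict.
            device_types.append(detect_device(val).split(":")[0])
    # Deduplicate; the real code warns when more than one device type appears.
    return sorted(set(device_types))[0]


print(pick_core_device({"model.language_model.layers": 0, "model.visual.blocks": 1}))
# -> "cuda" on a CUDA machine, "cpu" otherwise
```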

auto_round/mllm/autoround_mllm.py

Lines changed: 39 additions & 63 deletions
@@ -22,6 +22,7 @@
 from auto_round.low_cpu_mem.utils import get_layers_before_block
 from auto_round.mllm.mllm_dataset import get_mllm_dataloader
 from auto_round.mllm.template import Template, get_template
+from auto_round.schemes import QuantizationScheme
 from auto_round.special_model_handler import (
     NOT_SUPPORT_ONLY_TEXT_MODELS,
     SUPPORT_ONLY_TEXT_MODELS,
@@ -126,61 +127,56 @@ class AutoRoundMLLM(AutoRound):
 
     """
 
+    bits: int | None
+    group_size: int | None
+    sym: bool | None
+    data_type: str | None
+    act_bits: int | None
+    act_group_size: int | None
+    act_sym: bool | None
+    act_data_type: str | None
+    act_dynamic: bool | None
+    super_bits: int | None
+    super_group_size: int | None
+
     def __init__(
         self,
         model: Union[torch.nn.Module, str],
         tokenizer=None,
         processor=None,
         image_processor=None,
-        bits: int = 4,
-        group_size: int = 128,
-        sym: bool = True,
-        layer_config: dict = None,
-        batch_size: int = 8,
-        amp: bool = True,
-        device: Union[str, torch.device, int] = 0,
-        lr_scheduler=None,
-        dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = None,
-        extra_data_dir: str = None,
-        template: Union[str, Template] = None,
+        scheme: Union[str, dict, QuantizationScheme] = "W4A16",
+        layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
+        dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
         quant_nontext_module: bool = False,
-        enable_quanted_input: bool = True,
-        enable_minmax_tuning: bool = True,
-        lr: float = None,
-        minmax_lr: float = None,
-        low_gpu_mem_usage: bool = False,
-        low_cpu_mem_usage: bool = False,
         iters: int = 200,
-        seqlen: int = None,
+        seqlen: int = 2048,
         nsamples: int = 128,
-        sampler: str = "rand",
-        seed: int = 42,
-        nblocks: int = 1,
+        batch_size: int = 8,
         gradient_accumulate_steps: int = 1,
-        not_use_best_mse: bool = False,
-        dynamic_max_gap: int = -1,
-        data_type: str = "int",
-        scale_dtype: str = "fp16",
-        act_bits: int = 32,
-        act_group_size: int = None,
-        act_sym: bool = None,
-        act_dynamic: bool = True,
-        to_quant_block_names: Union[str, list] = None,
-        enable_norm_bias_tuning: bool = False,
-        truncation: bool = None,
+        low_gpu_mem_usage: bool = False,
+        device_map: Union[str, torch.device, int, dict] = 0,
         enable_torch_compile: bool = False,
-        model_kwargs: dict = None,
+        seed: int = 42,
         **kwargs,
     ):
+        extra_data_dir = kwargs.pop("extra_data_dir", None)
+        template = kwargs.pop("template", None)
+
+        to_quant_block_names: Union[str, list, None] = kwargs.pop("to_quant_block_names", None)
+        if device_map is None:
+            device_map = 0
+        self._set_device(device_map)
+
         if isinstance(model, str):
-            model, processor, tokenizer, image_processor = mllm_load_model(model, device=device)
+            model, processor, tokenizer, image_processor = mllm_load_model(model, device=self.device)
 
+        self.model = model
         quant_nontext_module = self._check_quant_nontext(layer_config, quant_nontext_module)
         all_blocks = get_block_names(model, quant_nontext_module)
         self.quant_block_list = find_matching_blocks(model, all_blocks, to_quant_block_names)
         if to_quant_block_names is None:
             to_quant_block_names = extract_block_names_to_str(self.quant_block_list)
-        self.to_quant_block_names = to_quant_block_names
         self.extra_data_dir = extra_data_dir
         self.quant_nontext_module = quant_nontext_module
         self.processor = processor
@@ -219,7 +215,7 @@ def __init__(
                     " switching to liuhaotian/llava_conv_58k"
                 )
                 dataset = "liuhaotian/llava_conv_58k"
-            elif not _only_text_test(model, tokenizer, device, self.template.model_type):
+            elif not _only_text_test(model, tokenizer, self.device, self.template.model_type):
                 logger.warning(
                     f"{model.config.model_type} does not support for {dataset},"
                     " will use liuhaotian/llava_conv_58k with default config as an alternative."
@@ -248,7 +244,7 @@ def __init__(
             gradient_accumulate_steps = batch_size * gradient_accumulate_steps
             batch_size = 1
         seqlen = 2048 if seqlen is None else seqlen
-        truncation = True if truncation is None else truncation
+        truncation = True
         self.truncation = truncation
 
         if nsamples % batch_size != 0:
@@ -258,40 +254,20 @@
         super(AutoRoundMLLM, self).__init__(
             model=model,
             tokenizer=tokenizer,
-            bits=bits,
-            group_size=group_size,
-            sym=sym,
+            scheme=scheme,
             layer_config=layer_config,
-            batch_size=batch_size,
-            amp=amp,
-            device=device,
-            lr_scheduler=lr_scheduler,
             dataset=dataset,
-            enable_quanted_input=enable_quanted_input,
-            enable_minmax_tuning=enable_minmax_tuning,
-            lr=lr,
-            minmax_lr=minmax_lr,
-            low_gpu_mem_usage=low_gpu_mem_usage,
-            low_cpu_mem_usage=low_cpu_mem_usage,
             iters=iters,
             seqlen=seqlen,
             nsamples=nsamples,
-            sampler=sampler,
-            seed=seed,
-            nblocks=nblocks,
+            batch_size=batch_size,
             gradient_accumulate_steps=gradient_accumulate_steps,
-            not_use_best_mse=not_use_best_mse,
-            dynamic_max_gap=dynamic_max_gap,
-            data_type=data_type,
-            scale_dtype=scale_dtype,
-            act_bits=act_bits,
-            act_group_size=act_group_size,
-            act_sym=act_sym,
-            act_dynamic=act_dynamic,
-            to_quant_block_names=self.to_quant_block_names,
-            enable_norm_bias_tuning=enable_norm_bias_tuning,
+            low_gpu_mem_usage=low_gpu_mem_usage,
+            device_map=device_map,
             enable_torch_compile=enable_torch_compile,
+            seed=seed,
             vlm=True,
+            to_quant_block_names=to_quant_block_names,
             **kwargs,
         )
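With this refactor the MLLM entry point mirrors AutoRound: per-tensor settings come from scheme rather than individual bits/group_size/sym/act_* arguments, device is replaced by device_map, and the tuning options dropped from the explicit signature still reach the base class through **kwargs. A minimal sketch of the new call shape; the model name, device indices, and module paths are illustrative (the module paths follow the Qwen2-VL layout used in the multi-card test further down).

```python
from auto_round import AutoRoundMLLM

# Illustrative values only; module paths follow the Qwen2-VL layout.
ar = AutoRoundMLLM(
    "Qwen/Qwen2-VL-2B-Instruct",   # a string triggers internal model/tokenizer/processor loading
    scheme="W4A16",                # replaces bits/group_size/sym/data_type/act_* arguments
    device_map={"model.language_model.layers": 0, "model.visual.blocks": 1},
    nsamples=128,
    iters=200,
)
ar.quantize_and_save("./tmp_autoround")
```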

auto_round/script/llm.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ def __init__(self, *args, **kwargs):
             "--scheme",
             default="W4A16",
             type=str,
-            # choices=["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FPW8_STATIC"],
+            # choices=["W4A16", "W2A16", "W3A16", "W8A16", "MXFP4", "MXFP8", "NVFP4", "FPW8A16", "FP8_STATIC"],
             help="quantization scheme",
         )

test/test_cpu/test_scheme.py

Lines changed: 9 additions & 3 deletions
@@ -4,11 +4,9 @@
 
 import torch
 
-from auto_round.schemes import QuantizationScheme
-
 sys.path.insert(0, "../..")
-
 from auto_round import AutoRound
+from auto_round.schemes import QuantizationScheme
 
 
 class LLMDataLoader:
@@ -56,6 +54,14 @@ def test_mxfp4(self):
         self.assertEqual(ar.act_data_type, "mx_fp_rceil")
         ar.quantize()
 
+    def test_vllm(self):
+        from auto_round import AutoRoundMLLM
+
+        ar = AutoRoundMLLM("Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16", nsamples=1, iters=1, seqlen=2)
+        self.assertEqual(ar.bits, 2)
+        self.assertEqual(ar.act_bits, 16)
+        ar.quantize()
+
     def test_scheme_in_layer_config(self):
         layer_config = {
             "model.decoder.layers.2.self_attn": {"bits": 2},

test/test_cuda/test_multiple_card.py

Lines changed: 27 additions & 0 deletions
@@ -308,6 +308,33 @@ def test_device_map_for_triton(self):
         del model
         torch.cuda.empty_cache()
 
+    @multi_card
+    def test_mllm_device_map(self):
+        model_name = "/models/Qwen2-VL-2B-Instruct/"
+        from auto_round import AutoRoundMLLM
+
+        device_map = "0,1"
+        ar = AutoRoundMLLM(model_name, device_map=device_map)
+        self.assertEqual(ar.device, "cuda:0")
+        self.assertEqual(ar.device_map, "auto")
+        self.assertEqual(ar.device_list, [0, 1])
+
+        device_map = 1
+        ar = AutoRoundMLLM(ar.model, ar.tokenizer, ar.processor, device_map=device_map)
+        self.assertEqual(ar.device, "cuda:1")
+        self.assertEqual(ar.device_map, None)
+        self.assertFalse(hasattr(ar, "device_list"))
+
+        device_map = "auto"
+        ar = AutoRoundMLLM(ar.model, ar.tokenizer, ar.processor, device_map=device_map)
+        self.assertEqual(ar.device, "cuda")
+        self.assertEqual(ar.device_map, "auto")
+
+        device_map = {"model.language_model.layers": 0, "model.visual.blocks": 1}
+        ar = AutoRoundMLLM(ar.model, ar.tokenizer, ar.processor, device_map=device_map)
+        self.assertEqual(ar.model.model.language_model.layers.tuning_device, "cuda:0")
+        self.assertEqual(ar.model.model.visual.blocks.tuning_device, "cuda:1")
+
 
 if __name__ == "__main__":
     unittest.main()

test/test_cuda/test_scheme.py

Lines changed: 2 additions & 2 deletions
@@ -46,7 +46,7 @@ def test_mxfp4(self):
         ar.quantize()
 
     def test_fp8_static(self):
-        ar = AutoRound(self.model_name, scheme="FPW8_STATIC", nsamples=1, iters=1)
+        ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=1)
         self.assertEqual(ar.bits, 8)
         self.assertEqual(ar.act_bits, 8)
         self.assertEqual(ar.data_type, "fp")
@@ -70,7 +70,7 @@ def test_mxfp4_rtn(self):
         ar.quantize()
 
     def test_fp8_static_rtn(self):
-        ar = AutoRound(self.model_name, scheme="FPW8_STATIC", nsamples=1, iters=0)
+        ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=0)
         self.assertEqual(ar.bits, 8)
         self.assertEqual(ar.act_bits, 8)
         self.assertEqual(ar.data_type, "fp")
