intel · n1ck-guo · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026
diff --git a/auto_round/__main__.py b/auto_round/__main__.py
@@ -678,32 +678,7 @@ def tune(args):
         trust_remote_code=not args.disable_trust_remote_code,
     )
 
-    model_name = args.model.rstrip("/")
-
-    if model_name.split("/")[-1].strip(".") == "" and "gguf" not in args.format:
-        if autoround.group_size <= 0:
-            if "fp" in autoround.act_data_type:
-                suffix = f"afp{autoround.act_bits}"
-            else:
-                suffix = f"a{autoround.act_bits}"
-        else:
-            suffix = f"g{autoround.group_size}"
-        export_dir = os.path.join(args.output_dir, f"w{autoround.bits}{suffix}")
-    elif model_name.split("/")[-1].strip(".") == "" and "gguf" in args.format:
-        export_dir = args.output_dir
-    elif model_name.split("./")[-1].strip("./") != "" and "gguf" in args.format:
-        export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + "-gguf")
-    else:
-        if autoround.group_size <= 0:
-            if "fp" in autoround.act_data_type:
-                suffix = f"afp{autoround.act_bits}"
-            else:
-                suffix = f"a{autoround.act_bits}"
-        else:
-            suffix = f"g{autoround.group_size}"
-        export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{autoround.bits}{suffix}")
-
-    model, folders = autoround.quantize_and_save(export_dir, format=args.format)  # pylint: disable=E1101
+    model, folders = autoround.quantize_and_save(args.output_dir, format=args.format)  # pylint: disable=E1101
     tokenizer = autoround.tokenizer  # pylint: disable=E1101
 
     model.eval()

diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
@@ -280,15 +280,8 @@ def __init__(
         self.shared_cache_keys = get_shared_keys(self.model)
 
         self.layer_config = layer_config
-
-        # should be set after loading model and set layer_config, cause some special scheme need these.
-        self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(scheme, kwargs)
-
-        gguf_scheme_name = get_gguf_scheme(self.scheme)
-        # GGUF uses fp32 scale dtype as default
-        scale_dtype = kwargs.pop("scale_dtype", None)
-        if scale_dtype is None:
-            scale_dtype = "fp32" if gguf_scheme_name else "fp16"
+        self.scheme = scheme
+        self.scale_dtype = kwargs.pop("scale_dtype", None)
 
         # Extra/legacy kwargs for backward compatibility
         # Major version releases may pack them with extra configuration options
@@ -314,21 +307,12 @@ def __init__(
             platform = "model_scope"
         self.platform = platform
         self.quant_lm_head = kwargs.pop("quant_lm_head", False)
-
         self.ignore_layers = kwargs.pop("ignore_layers", "")
-        predefined_ignore_layers = get_predefined_ignore_layers(self.model)
 
-        if predefined_ignore_layers:
-            logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}")
-            tmp_str = ",".join(predefined_ignore_layers)
-            if self.ignore_layers == "":
-                self.ignore_layers = tmp_str
-            else:
-                self.ignore_layers += "," + tmp_str
         self.supported_types = SUPPORTED_LAYER_TYPES
         self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
-        self.scale_dtype = convert_dtype_str2torch(scale_dtype)
         self.low_cpu_mem_usage = low_cpu_mem_usage
+        self.block_forward = block_forward
 
         if kwargs:
             logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.")
@@ -360,16 +344,10 @@ def __init__(
         self.device_map = device_map
         if isinstance(self.device_map, str):
             self.device_map = self.device_map.replace(" ", "")
-
-        self.device_list = parse_available_devices(device_map)
-
-        # Set device, must place after model loading
-        self.device = get_major_device(device_map)
-        set_non_auto_device_map(self.model, self.device_map)
+        self.device = get_major_device(self.device_map)
 
         # Tuning hyperparameters
         self.seed = seed
-        set_seed(self.seed)
         self.amp = amp
         self.enable_quanted_input = enable_quanted_input
         self.enable_minmax_tuning = enable_minmax_tuning
@@ -404,24 +382,7 @@ def __init__(
         if enable_opt_rtn:
             disable_opt_rtn = False
         self.orig_disable_opt_rtn = disable_opt_rtn
-
-        if self.iters != 0 and self.orig_disable_opt_rtn is not None:
-            logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.")
-            disable_opt_rtn = True
-        if (
-            self.bits >= 8
-            and self.act_bits >= 16
-            and self.iters == 0
-            and self.data_type == "int"
-            and disable_opt_rtn is None
-        ):
-            logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.")
-            disable_opt_rtn = True
-        if disable_opt_rtn is None and self.iters == 0:
-            logger.info(
-                "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy."
-            )
-            disable_opt_rtn = False
+        self.disable_opt_rtn = disable_opt_rtn
 
         # Important Note! This is not very robust, do NOT rely on it to do high risky thing
         self.is_moe_model = is_moe_model(self.model)
@@ -432,7 +393,6 @@ def __init__(
         self.dynamic_max_gap = dynamic_max_gap
         self.lr_scheduler = lr_scheduler
         self.optimizer = self._get_optimizer(None)
-        self.disable_opt_rtn = disable_opt_rtn
 
         # Whether to pack the layer immediately after tuning
         self.is_immediate_packing = False
@@ -448,8 +408,74 @@ def __init__(
         if self.static_attention_dtype is not None:
             logger.warning("The static attention dtype is experimental and currently has limited support.")
 
-        self._set_amp_dtype()
         self.cache_device = torch.device("cpu") if self.low_gpu_mem_usage else self.device
+
+        self.batch_dim = None
+        self.infer_bs_coeff = 1
+
+        # after setting iters
+        self.enable_torch_compile = enable_torch_compile
+
+        self.attention_mask = []
+        self.wrapper_block = wrapper_block
+
+        torch.set_printoptions(precision=3, sci_mode=True)
+
+        self._post_inited = False
+
+    def _post_init(self) -> None:
+        """Post-initialization for AutoRound."""
+        if self._post_inited:
+            return
+
+        # Must run after the model is loaded and layer_config is set, because some special schemes need both.
+        self.scheme, self.is_auto_scheme = self._parse_and_set_scheme(self.scheme, {})
+
+        # GGUF uses fp32 scale dtype as default; a user-provided scale_dtype is kept unchanged.
+        scale_dtype = self.scale_dtype
+        if scale_dtype is None:
+            scale_dtype = "fp32" if get_gguf_scheme(self.scheme) else "fp16"
+        self.scale_dtype = convert_dtype_str2torch(scale_dtype)
+
+        predefined_ignore_layers = get_predefined_ignore_layers(self.model)
+
+        if predefined_ignore_layers:
+            logger.info(f"Using predefined ignore_layers: {predefined_ignore_layers}")
+            tmp_str = ",".join(predefined_ignore_layers)
+            if self.ignore_layers == "":
+                self.ignore_layers = tmp_str
+            else:
+                self.ignore_layers += "," + tmp_str
+
+        # Set device, must place after model loading
+        self._set_device(self.device_map)
+        set_non_auto_device_map(self.model, self.device_map)
+        self.device_list = parse_available_devices(self.device_map)
+
+        if self.iters != 0 and self.orig_disable_opt_rtn is not None:
+            logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.")
+            self.disable_opt_rtn = True
+        if (
+            self.bits >= 8
+            and self.act_bits >= 16
+            and self.iters == 0
+            and self.data_type == "int"
+            and self.disable_opt_rtn is None
+        ):
+            logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.")
+            self.disable_opt_rtn = True
+        if self.disable_opt_rtn is None and self.iters == 0:
+            logger.info(
+                "`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy."
+            )
+            self.disable_opt_rtn = False
+
+        set_seed(self.seed)
+        self._set_amp_dtype()
+        self._adjust_torch_compile(self.enable_torch_compile)
+        if self.enable_torch_compile:
+            self.block_forward = compile_func(self.block_forward, self.device)
+
         if self.act_bits <= 8 and self.amp_dtype == torch.float16:
             logger.warning("force to use bf16 to for quantization tuning when enabling activation quantization")
             self.amp_dtype = torch.bfloat16
@@ -461,28 +487,17 @@ def __init__(
         # Some helpers
         if "hpu" in str(self.device):
             self.inner_supported_types = tuple(x for x in INNER_SUPPORTED_LAYER_TYPES if x != "FP8Linear")
-        self.batch_dim = None
-        self.infer_bs_coeff = 1
-
-        # after setting iters
-        self.enable_torch_compile = enable_torch_compile
-        self._adjust_torch_compile(enable_torch_compile)
 
-        self.block_forward = compile_func(block_forward, self.device) if self.enable_torch_compile else block_forward
         self._check_configs()
-        torch.set_printoptions(precision=3, sci_mode=True)
 
-        if isinstance(scheme, AutoScheme):
-            self.layer_config = self._gen_auto_scheme(model, scheme, dataset, self.device_map)
+        if isinstance(self.scheme, AutoScheme):
+            self.layer_config = self._gen_auto_scheme(self.model, self.scheme, self.dataset, self.device_map)
 
         if is_hpex_available():
             logger.info("habana_frameworks is available, import htcore explicitly.")
             import habana_frameworks.torch.core as htcore  # pylint: disable=E0401
             import habana_frameworks.torch.hpu as hthpu  # pylint: disable=E0401]
 
-        self.attention_mask = []
-
-        self.wrapper_block = wrapper_block
         if self.enable_alg_ext:
             try:
                 logger.warning_once("using algorithm extension for quantization.")
@@ -491,6 +506,7 @@ def __init__(
                 wrapper_autoround(self)
             except (ImportError, ModuleNotFoundError):
                 logger.error("algorithm extension import error, fallback to default mode")
+        self._post_inited = True
 
     def _gen_auto_scheme(
         self, model: torch.nn.Module, scheme: AutoScheme, dataset: str, device_map: Union[str, int, dict, torch.device]
@@ -865,6 +881,34 @@ def quantize_and_save(
         Raises:
             ValueError: If an unsupported format is specified.
         """
+        # post init
+        self._post_init()
+
+        # Build the export sub-directory from the model name and the quantization settings.
+        model_name = self.model.name_or_path.rstrip("/")
+        base_name = model_name.split("/")[-1]
+        # Names such as "." or "./" leave no usable last path component to embed.
+        has_base_name = base_name.strip(".") != ""
+
+        if "gguf" in format:
+            # GGUF exports only get a "-gguf" suffix; the quantization settings are not encoded.
+            if has_base_name:
+                export_dir = os.path.join(output_dir, base_name + "-gguf")
+            else:
+                export_dir = output_dir
+        else:
+            # A positive group size tags the directory; otherwise the activation bits are used.
+            if self.group_size > 0:
+                suffix = f"g{self.group_size}"
+            elif "fp" in self.act_data_type:
+                suffix = f"afp{self.act_bits}"
+            else:
+                suffix = f"a{self.act_bits}"
+            quant_tag = f"w{self.bits}{suffix}"
+            leaf = f"{base_name}-{quant_tag}" if has_base_name else quant_tag
+            export_dir = os.path.join(output_dir, leaf)
+
+        output_dir = export_dir
         # Validate and process the specified formats
         self.orig_output_dir = output_dir
 
@@ -3118,6 +3162,9 @@ def save_quantized(
         Returns:
             object: The compressed model object.
         """
+        # post init
+        self._post_init()
+
         self.orig_output_dir = output_dir
         if isinstance(format, str) and getattr(self, "formats", None) is None:
             formats = get_formats(format, self)