Commit 212b855

modified qbits to ark
1 parent e50f850 commit 212b855

File tree

auto_round/inference/backend.py
auto_round/inference/convert_model.py
test/test_cpu/test_torch_backend.py

3 files changed (+47, -56 lines)

auto_round/inference/backend.py

Lines changed: 35 additions & 39 deletions
@@ -47,11 +47,11 @@ class BackendInfo:
 
     Attributes:
         device: A list of strings representing the devices the backend supports
-            (e.g., 'cuda', 'cpu').
+            (e.g., 'cpu', 'xpu', 'cuda').
         sym: A list of booleans indicating whether the backend supports symmetric
             quantization for weights (True if symmetric, False if not).
         packing_format: A list of strings representing the packing formats used by the backend
-            (e.g., 'triton', 'qbits').
+            (e.g., 'ark', 'triton').
         bits: A list of integers specifying the bit-widths supported by the backend
             for weight quantization (e.g., [2, 4, 8]).
         group_size: An optional list of integers specifying the group sizes supported
@@ -430,51 +430,51 @@ def fp8_static_scheme_checker(
     requirements=["autoawq", "transformers<4.57.0"],
 )
 
-BackendInfos["qbits"] = BackendInfo(
-    device=["cpu"],
-    sym=[True, False],
+BackendInfos["auto_round_kernel"] = BackendInfo(
+    device=["cpu", "xpu"],
+    sym=[True],
     packing_format=GPTQ_FORMAT_NO_ZP,
     bits=[2, 4, 8],
     group_size=None,
-    priority=1,
+    priority=0,
     checkers=[],
-    alias=["itrex", "qbits"],
-    compute_dtype=["float16", "bfloat16"],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    requirements=["torch<2.7.0", "intel-extension-for-transformers"],
+    requirements=["torch>=2.9.0"],
 )
 
-BackendInfos["qbits_zp"] = BackendInfo(
-    device=["cpu"],
-    sym=[True, False],
+BackendInfos["auto_round_kernel_zp"] = BackendInfo(
+    device=["cpu", "xpu"],
+    sym=[True],
     packing_format=GPTQ_FORMAT,
     bits=[2, 4, 8],
     group_size=None,
-    compute_dtype=["float16", "bfloat16"],
+    priority=0,
+    checkers=[],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    priority=1,
-    checkers=[],
-    alias=["itrex", "qbits"],
-    requirements=["torch<2.7.0", "intel-extension-for-transformers"],
+    requirements=["torch>=2.9.0"],
 )
 
-
-BackendInfos["qbits_awq"] = BackendInfo(
+BackendInfos["auto_round_kernel_awq"] = BackendInfo(
     device=["cpu"],
     sym=[True, False],
     packing_format=AWQ_FORMAT,
     bits=[2, 4, 8],
     group_size=None,
-    compute_dtype=["float16", "bfloat16"],
+    priority=0,
+    checkers=[],
+    alias=["ark"],
+    compute_dtype=["float32", "float16"],
     data_type=["int"],
     act_bits=WOQ_DEFAULT_ACT_BITS,
-    priority=1,
-    checkers=[],
-    alias=["itrex", "qbits"],
-    requirements=["torch<2.7.0", "intel-extension-for-transformers"],
+    requirements=["torch>=2.9.0"],
 )
+
 BackendInfos["ipex_gptq"] = BackendInfo(
     device=["cpu", "xpu"],
     sym=[True, False],
@@ -601,12 +601,12 @@ def dynamic_import_inference_linear(backend, config):
     """Dynamically imports and returns the appropriate QuantLinear class based on the given backend.
 
     This function dynamically loads the correct `QuantLinear` class based on the backend and quantization
-    configuration (e.g., qbits, marlin, hpu, gptq, awq, auto_round). It imports specific modules or raises
+    configuration (e.g., ark, marlin, hpu, gptq, awq). It imports specific modules or raises
     errors if the required packages are not installed or the environment is not set up.
 
     Args:
         backend (str):
-            The backend to be used for quantization (e.g., 'qbits', 'marlin', 'hpu', 'gptq', 'awq', 'auto_round').
+            The backend to be used for quantization (e.g., 'ark', 'marlin', 'hpu', 'gptq', 'awq').
         config (QuantizationScheme):
             The quantization configuration containing parameters like bits, group_size, and sym.
 
@@ -616,7 +616,7 @@ def dynamic_import_inference_linear(backend, config):
 
     Raises:
         ImportError:
-            If required modules are missing for a backend (e.g., Intel Extension, GPTQ, auto_awq).
+            If required modules are missing for a backend (e.g., ark, GPTQ, auto_awq).
     """
     bits, group_size, sym = config["bits"], config["group_size"], config["sym"]
 
@@ -629,26 +629,22 @@ def dynamic_import_inference_linear(backend, config):
     if "torch_nvfp4" in backend:
         return ar_qmodules.NVFP4QuantLinear
 
-    if "qbits" in backend:
+    if "auto_round_kernel" in backend or 'ark' in backend:
         try:
-            from intel_extension_for_transformers import qbits  # pylint: disable=E0401
+            import auto_round_kernel as ark  # pylint: disable=E0401
         except Exception as e:
             raise ImportError(
-                "Please install Intel Extension for Transformers via 'pip install "
-                "intel-extension-for-transformers' to inference on X86 CPU"
+                "Please install auto_round_kernel version for CPU/XPU"
             )
+        import auto_round_extension.kernel.qlinear as qlinear
+
         if "zp" in backend:
-            import auto_round_extension.qbits.qlinear_qbits_gptq as qlinear_qbits_gptq
-
-            return qlinear_qbits_gptq.QuantLinear
+            return qlinear.QuantLinearGPTQ
         elif "awq" in backend:
-            import auto_round_extension.qbits.qbits_awq as qlinear_qbits_awq
-
-            return qlinear_qbits_awq.QuantLinear
+            return qlinear.QuantLinearAWQ
         else:  # auto_round must be at the end
-            import auto_round_extension.qbits.qlinear_qbits as qlinear_qbits_autoround
+            return qlinear.QuantLinear
 
-            return qlinear_qbits_autoround.QuantLinear
     if "ipex_gptq" in backend:
         from auto_round_extension.ipex.qlinear_ipex_gptq import QuantLinear

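For context on how the renamed backend is reached at inference time: the registration above exposes the alias "ark", and dynamic_import_inference_linear resolves it to the kernel QuantLinear classes. Below is a minimal sketch of loading a quantized checkpoint through this path; it assumes the "ark" alias is accepted by AutoRoundConfig's backend argument, that auto_round_kernel is installed for CPU/XPU, and the checkpoint path is hypothetical.

# Minimal sketch (assumptions: the "ark" alias added above is wired into
# AutoRoundConfig's backend selection and auto_round_kernel is installed).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRoundConfig

quantized_model_path = "./saved"  # hypothetical AutoRound-quantized checkpoint

# Request the CPU/XPU kernel backend by its alias; backend.py matches it against
# the BackendInfos registered above and imports the matching QuantLinear class.
quantization_config = AutoRoundConfig(backend="ark")

model = AutoModelForCausalLM.from_pretrained(
    quantized_model_path,
    dtype=torch.float16,
    device_map="cpu",
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
inputs = tokenizer("There is a girl who likes adventure,", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))
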
auto_round/inference/convert_model.py

Lines changed: 4 additions & 9 deletions
@@ -31,7 +31,6 @@
 from auto_round.inference.utils import _expand_regex_config
 from auto_round.logger import logger
 from auto_round.schemes import QuantizationScheme
-from auto_round.special_model_handler import _handle_moe_model
 from auto_round.utils import (
     SUPPORTED_LAYER_TYPES,
     check_start_with_block_name,
@@ -395,9 +394,9 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features)
     bias = layer.bias is not None
 
     # Special handling for AWQ layers
-    from auto_round_extension.qbits.qbits_awq import QuantLinear as QBitsAWQQuantLinear
+    from auto_round_extension.ark.qlinear import QuantLinearAWQ
 
-    if "awq" in layer_backend and isinstance(QuantLinear, QBitsAWQQuantLinear):
+    if "awq" in layer_backend and isinstance(QuantLinear, QuantLinearAWQ):
         return QuantLinear.from_linear(
             layer, config["bits"], config["group_size"], init_only=True, has_zero_points=not config["sym"]
         )
@@ -474,7 +473,6 @@ def post_init(model: torch.nn.Module, used_backends: list[str]) -> None:
     need_gptqmodel_init = False
     need_ipex_itrex_init = False
     used_gptq_exllamav2 = False
-
     # Determine which backends require post-init
     for backend in used_backends:
         if backend.startswith("auto_gptq"):
@@ -483,7 +481,7 @@ def post_init(model: torch.nn.Module, used_backends: list[str]) -> None:
             used_gptq_exllamav2 = True
         elif backend.startswith("gptqmodel"):
             need_gptqmodel_init = True
-        elif backend.startswith(("ipex", "qbit")):
+        elif backend.startswith(("ipex", "auto_round_kernel")):
             need_ipex_itrex_init = True
 
     # AutoGPTQ post-init
@@ -503,7 +501,7 @@ def post_init(model: torch.nn.Module, used_backends: list[str]) -> None:
         message = "repacking to CPU/XPU format"
         layers = []  ## ipex post_init will add one more layer
         for n, m in model.named_modules():
-            if hasattr(m, "QUANT_TYPE") and ("qbits" in m.QUANT_TYPE or "ipex" in m.QUANT_TYPE):
+            if hasattr(m, "QUANT_TYPE") and ("ark" in m.QUANT_TYPE or "ipex" in m.QUANT_TYPE):
                 layers.append(m)
 
         for layer in tqdm(layers, desc=message, total=len(layers), leave=True):
@@ -583,9 +581,6 @@ def convert_hf_model(model: nn.Module, target_device: str = "cpu") -> tuple[nn.M
     elif packing_format == "auto_round:gptq":
         packing_format = "auto_round:auto_gptq"
 
-    # Preprocess model before replace layers
-    model = _handle_moe_model(model)
-
     # Replace layers with quantized versions
     layer_configs = get_layer_config(model, quantization_config)
     used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, packing_format)

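For reference, a compact illustration of the QUANT_TYPE-based selection that post_init now applies to the CPU/XPU kernels ("ark" or "ipex"). This is a standalone sketch only: DummyQuantLinear is a hypothetical stand-in for a real quantized layer class, and the actual repacking step is elided.

# Sketch of the post_init selection above: collect modules whose QUANT_TYPE marks
# them as CPU/XPU kernel layers, then walk them for repacking.
import torch
from tqdm import tqdm


class DummyQuantLinear(torch.nn.Module):
    QUANT_TYPE = "ark"  # tag checked by post_init after the rename from "qbits"


model = torch.nn.Sequential(DummyQuantLinear(), torch.nn.ReLU(), DummyQuantLinear())

message = "repacking to CPU/XPU format"
layers = [
    m
    for _, m in model.named_modules()
    if hasattr(m, "QUANT_TYPE") and ("ark" in m.QUANT_TYPE or "ipex" in m.QUANT_TYPE)
]

for layer in tqdm(layers, desc=message, total=len(layers), leave=True):
    pass  # the real code repacks each collected layer's weights here
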
test/test_cpu/test_torch_backend.py

Lines changed: 8 additions & 8 deletions
@@ -27,7 +27,7 @@ class TestAutoRoundTorchBackend(unittest.TestCase):
 
     @classmethod
     def setUpClass(self):
-        self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
+        self.model_name = "facebook/opt-125m"
         self.save_folder = "./saved"
         self.llm_dataloader = LLMDataLoader()
 
@@ -63,7 +63,7 @@ def tearDownClass(self):
         shutil.rmtree("runs", ignore_errors=True)
 
     def test_torch_4bits_asym(self):
-        model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
         bits, group_size, sym = 4, 128, False
         autoround = AutoRound(
@@ -81,7 +81,7 @@ def test_torch_4bits_asym(self):
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
-            quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
+            quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
         )
 
         tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
@@ -92,7 +92,7 @@ def test_torch_4bits_asym(self):
         torch.cuda.empty_cache()
 
         model = AutoModelForCausalLM.from_pretrained(
-            self.save_folder, torch_dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config
+            self.save_folder, dtype=torch.bfloat16, device_map="cpu", quantization_config=quantization_config
         )
 
         tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
@@ -104,9 +104,9 @@ def test_torch_4bits_asym(self):
         shutil.rmtree("./saved", ignore_errors=True)
 
     def test_torch_4bits_sym(self):
-        model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
-        bits, group_size, sym = 4, 128, True
+        bits, group_size, sym = 4, 32, True
         autoround = AutoRound(
             model,
             tokenizer,
@@ -122,12 +122,12 @@ def test_torch_4bits_sym(self):
 
         quantization_config = AutoRoundConfig(backend="torch")
         model = AutoModelForCausalLM.from_pretrained(
-            quantized_model_path, torch_dtype=torch.float16, device_map="cpu", quantization_config=quantization_config
+            quantized_model_path, dtype=torch.float16, device_map="auto", quantization_config=quantization_config
         )
 
         tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
         self.model_infer(model, tokenizer)
-        result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10)
+        result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000)
         print(result["results"]["lambada_openai"]["acc,none"])
         self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28)
         torch.cuda.empty_cache()

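To exercise the updated CPU backend tests locally, one option is the sketch below; it assumes pytest is installed and the command is run from the repository root.

# Run only the updated CPU torch-backend tests (assumption: pytest is available
# and the working directory is the repository root).
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pytest", "-q", "test/test_cpu/test_torch_backend.py"],
    check=True,
)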