diff --git a/auto_round/__init__.py b/auto_round/__init__.py
index 87c70e06a..509885b33 100644
--- a/auto_round/__init__.py
+++ b/auto_round/__init__.py
@@ -22,18 +22,4 @@
 
 monkey_patch()
 
-
-def __getattr__(name):
-    if name == "AutoHfQuantizer":
-        from auto_round.inference.auto_quantizer import AutoHfQuantizer
-
-        return AutoHfQuantizer
-    if name == "AutoRoundConfig":
-        from auto_round.inference.auto_quantizer import AutoRoundConfig
-
-        return AutoRoundConfig
-
-    raise AttributeError(f"auto-round has no attribute '{name}'")
-
-
 from .version import __version__
diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index 1540d254d..5b9616f65 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -17,7 +17,8 @@
 
 from auto_round.auto_scheme import AutoScheme
 from auto_round.compressors import BaseCompressor
-from auto_round.eval.eval_cli import EvalArgumentParser, _eval_init, eval, eval_task_by_task
+from auto_round.eval.eval_cli import EvalArgumentParser, eval, eval_task_by_task
+from auto_round.eval.evaluation import run_model_evaluation
 from auto_round.schemes import PRESET_SCHEMES
 from auto_round.utils import (
     clear_memory,
@@ -383,6 +384,20 @@ def __init__(self, *args, **kwargs):
         eval_args.add_argument(
             "--eval_task_by_task", action="store_true", help="Evaluate tasks sequentially instead of batching. "
         )
+        eval_args.add_argument(
+            "--eval_backend",
+            default="hf",
+            type=str,
+            choices=["hf", "vllm"],
+            help="Backend to use for model evaluation. Use hf backend for evaluation by default.",
+        )
+        eval_args.add_argument(
+            "--vllm_args",
+            default=None,
+            type=str,
+            help="(for vllm) Custom vllm arguments in format: '--arg1=value1,--arg2=value2'. "
+            "Example: '--tensor_parallel_size=2,--gpu_memory_utilization=0.9'",
+        )
         eval_args.add_argument(
             "--eval_model_dtype",
             default=None,
@@ -703,185 +718,15 @@ def tune(args):
             suffix = f"g{autoround.group_size}"
         export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{autoround.bits}{suffix}")
 
+    # ======================= Quantize and save model =======================
     model, folders = autoround.quantize_and_save(export_dir, format=args.format)  # pylint: disable=E1101
     tokenizer = autoround.tokenizer  # pylint: disable=E1101
 
     model.eval()
     clear_memory()
 
-    eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto")
-
-    # diffusion model has different evaluation path
-    if getattr(autoround, "diffusion", False):
-        pipe = autoround.pipe
-        pipe.to(model.dtype)
-        pipe.transformer = model
-        device_str = detect_device(device_str)
-        pipe = pipe.to(device_str)
-        if pipe.dtype != eval_model_dtype and eval_model_dtype != "auto":
-            pipe.to(getattr(torch, eval_model_dtype))
-
-        gen_kwargs = {
-            "guidance_scale": args.guidance_scale,
-            "output_type": "pil",
-            "num_inference_steps": args.num_inference_steps,
-            "generator": (
-                None
-                if args.generator_seed is None
-                else torch.Generator(device=pipe.device).manual_seed(args.generator_seed)
-            ),
-        }
-        if not os.path.exists(args.image_save_dir):
-            os.makedirs(args.image_save_dir)
-
-        if args.prompt is not None:
-            outputs = pipe(prompt=args.prompt, **gen_kwargs)
-            outputs.images[0].save(os.path.join(args.image_save_dir, "img.png"))
-            logger.info(
-                f"Image generated with prompt {args.prompt} is saved as {os.path.join(args.image_save_dir, 'img.png')}"
-            )
-
-        if args.prompt_file is not None:
-            from auto_round.compressors.diffusion import diffusion_eval
-
-            metrics = args.metrics.split(",")
-            diffusion_eval(pipe, args.prompt_file, metrics, args.image_save_dir, 1, gen_kwargs)
-        return
-
-    lm_eval_version = get_library_version("lm-eval")
-
-    eval_folder = folders[-1]
-    if args.tasks is None or args.tasks == "" or eval_folder is None:
-        return
-
-    tasks = args.tasks
-    if isinstance(tasks, str):
-        tasks = tasks.split(",")
-
-    from lm_eval.utils import make_table  # pylint: disable=E0401
-
-    logger.info(f"Using lm-eval version {lm_eval_version}")
-    eval_gguf_model = False
-    for file in os.listdir(eval_folder):
-        if file.endswith("gguf"):
-            eval_gguf_model = True
-            break
-
-    import time
-
-    if "llama" in args.model.lower() and not args.add_bos_token:
-        logger.warning("set add_bos_token=True for llama model.")
-        args.add_bos_token = True
-    if (autoround.act_bits <= 8 and formats[-1] == "fake") or eval_gguf_model:
-        if eval_gguf_model:
-            # for file in os.listdir(eval_folder):
-            #     gguf_file = file
-            gguf_file = None
-            gguf_format = None  # Initialize gguf_format to None
-            # gguf folder only contains one file
-            for format in formats:
-                if format.startswith("gguf"):
-                    gguf_format = format.split(":")[-1].upper()
-            if gguf_format is None:  # Validate gguf_format after the loop
-                logger.error("No valid gguf format found in formats. Please check the input.")
-                sys.exit(-1)
-            for file in os.listdir(eval_folder):
-                if gguf_format in file:
-                    gguf_file = file
-
-            logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.")
-            if eval_model_dtype == "float32" or eval_model_dtype == "auto":
-                logger.warning(
-                    "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
-                    " but may affect accuracy."
-                )
-            if gguf_file is None:
-                logger.error("Cannot find correct gguf file for evaluation, please check.")
-                sys.exit(-1)
-            model = AutoModelForCausalLM.from_pretrained(
-                eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
-            )
-            model.eval()
-            tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file)
-        else:
-            if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1:
-                from accelerate.big_modeling import dispatch_model
-
-                dispatch_model(model, model.hf_device_map)
-            else:
-                device_str = detect_device(device_str)
-                model = model.to(device_str)
-            if model.dtype != eval_model_dtype and eval_model_dtype != "auto":
-                model.to(getattr(torch, eval_model_dtype))
-
-        if args.eval_task_by_task:
-            eval_task_by_task(
-                model,
-                tokenizer=tokenizer,
-                device=device_str,
-                tasks=args.tasks,
-                limit=args.limit,
-                batch_size=args.eval_bs,
-                eval_model_dtype=eval_model_dtype,
-                add_bos_token=args.add_bos_token,
-            )
-        else:
-            if args.eval_bs is None or args.eval_bs == "auto":
-                logger.warning("This API does not support auto currently, reset eval_bs to 16")
-                args.eval_bs = 16
-            from auto_round.eval.evaluation import simple_evaluate_user_model
-
-            st = time.time()
-
-            res = simple_evaluate_user_model(
-                model,
-                tokenizer,
-                tasks=tasks,
-                batch_size=args.eval_bs,
-                limit=args.limit,
-                device=device_str,
-                eval_model_dtype=eval_model_dtype,
-                add_bos_token=args.add_bos_token,
-            )
-            print(make_table(res))
-            print("evaluation running time=%ds" % (time.time() - st))
-    else:
-        if args.eval_task_by_task:
-            eval_task_by_task(
-                eval_folder,
-                device=device_str,
-                tasks=args.tasks,
-                batch_size=args.eval_bs,
-                limit=args.limit,
-                eval_model_dtype=eval_model_dtype,
-                mllm=autoround.mllm,  # pylint: disable=E1101
-                add_bos_token=args.add_bos_token,
-            )
-        else:
-            from auto_round.eval.evaluation import simple_evaluate
-
-            tasks, model_args, device_str = _eval_init(
-                args.tasks, eval_folder, args.device_map, args.disable_trust_remote_code, dtype=eval_model_dtype
-            )
-            st = time.time()
-            model_args += f",add_bos_token={args.add_bos_token}"
-            if autoround.mllm:  # pylint: disable=E1101
-                model_type = "hf-multimodal"
-                if args.eval_bs is None or args.eval_bs == "auto":
-                    logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16")
-                    args.eval_bs = 16
-            else:
-                model_type = "hf"
-            res = simple_evaluate(
-                model=model_type,
-                model_args=model_args,
-                tasks=tasks,
-                device=device_str,
-                batch_size=args.eval_bs,
-                limit=args.limit,
-            )
-            print(make_table(res))
-            print("evaluation running time=%ds" % (time.time() - st))
+    # ======================= Model evaluation =======================
+    run_model_evaluation(model, tokenizer, autoround, folders, formats, device_str, args)
 
 
 def setup_eval_parser():
@@ -909,6 +754,7 @@ def run_eval():
         eval_task_by_task(
             model=args.model,
             device=args.device_map,
+            limit=args.limit,
             tasks=args.tasks,
             batch_size=args.eval_bs,
             trust_remote_code=not args.disable_trust_remote_code,
diff --git a/auto_round/compressors/mllm/eval.py b/auto_round/compressors/mllm/eval.py
index 339bb0fd7..527ea2be4 100644
--- a/auto_round/compressors/mllm/eval.py
+++ b/auto_round/compressors/mllm/eval.py
@@ -100,11 +100,6 @@ def mllm_eval(
     mode: str = "all",
     ignore: bool = False,
 ):
-    try:
-        from transformers import AutoRoundConfig
-    except:
-        from auto_round.inference.auto_quantizer import AutoHfQuantizer
-
     model = None
     if data_store_dir is not None:
         if not os.path.exists(data_store_dir):
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 0e31209ad..863a9110c 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -19,8 +19,8 @@
 from transformers.utils.versions import require_version
 
 from auto_round.utils import (
-    clear_memory,
     get_device_and_parallelism,
+    get_device_str,
     get_model_dtype,
     set_cuda_visible_devices,
 )
@@ -52,10 +52,9 @@ def __init__(self, *args, **kwargs):
             "--devices",
             default="0",
             type=str,
-            help="the device to be used for tuning. "
-            "Currently, device settings support CPU, GPU, and HPU."
-            "The default is set to cuda:0,"
-            "allowing for automatic detection and switch to HPU or CPU."
+            help="the device to be used for evaluation. "
+            "The default is set to 0, "
+            "allowing for automatic detection and switch to any devices. "
             "set --device 0,1,2 to use multiple cards.",
         )
 
@@ -105,28 +104,13 @@ def __init__(self, *args, **kwargs):
             help="Backend to use for model evaluation. Use hf backend for evaluation by default.",
         )
         self.add_argument("--add_bos_token", action="store_true", help="add BOS token")
-
-        # vllm related arguments
-        vllm_args = self.add_argument_group("vllm backend arguments")
-        vllm_args.add_argument("--revision", default=None, type=str, help="model revision for vllm")
-        vllm_args.add_argument("--tokenizer", default=None, type=str, help="tokenizer to use with vllm")
-        vllm_args.add_argument(
-            "--tokenizer_mode", default="auto", type=str, help="tokenizer mode for vllm (e.g. auto/fast/slow)"
-        )
-        vllm_args.add_argument("--tokenizer_revision", default=None, type=str, help="tokenizer revision for vllm")
-        vllm_args.add_argument("--prefix_token_id", default=None, type=int, help="prefix token id for vllm")
-        vllm_args.add_argument("--tensor_parallel_size", default=1, type=int, help="tensor parallel size for vllm")
-        vllm_args.add_argument("--data_parallel_size", default=1, type=int, help="data parallel size for vllm")
-        vllm_args.add_argument("--quantization", default=None, type=str, help="quantization setting for vllm")
-        vllm_args.add_argument("--max_gen_toks", default=256, type=int, help="max generation tokens for vllm")
-        vllm_args.add_argument("--swap_space", default=4, type=float, help="swap space (GB) for vllm")
-        vllm_args.add_argument("--max_batch_size", default=None, type=int, help="max batch size for vllm")
-        vllm_args.add_argument("--max_length", default=None, type=int, help="max generation length for vllm")
-        vllm_args.add_argument("--max_model_len", default=None, type=int, help="maximum model sequence length for vllm")
-        vllm_args.add_argument(
-            "--gpu_memory_utilization", default=0.9, type=float, help="target GPU memory utilization for vllm"
+        self.add_argument(
+            "--vllm_args",
+            default=None,
+            type=str,
+            help="(for vllm) Custom vllm arguments in format: 'arg1=value1,arg2=value2'. "
+            "Example: 'tensor_parallel_size=2,gpu_memory_utilization=0.9'",
         )
-        vllm_args.add_argument("--lora_local_path", default=None, type=str, help="local LoRA path for vllm")
 
 
 def _eval_init(tasks, model_path, device, disable_trust_remote_code=False, dtype="auto"):
@@ -162,36 +146,12 @@ def eval(args):
 
     if (batch_size := args.eval_bs) is None:
         batch_size = "auto:8"
-    is_gguf_file = False
-    if os.path.exists(args.model):
-        if os.path.isfile(args.model) and args.model.endswith(".gguf"):
-            is_gguf_file = True
-            gguf_file = os.path.basename(args.model)
-            model = os.path.dirname(args.model)
-        else:
-            for file in os.listdir(args.model):
-                if file.endswith(".gguf"):
-                    is_gguf_file = True
-                    gguf_file = file
-            model = args.model
-    eval_model_dtype = get_model_dtype(args.eval_model_dtype)
+
+    model, tokenizer, is_gguf_file, gguf_file = _load_gguf_model_if_needed(args.model, args.eval_model_dtype)
+
     if is_gguf_file:
-        import torch
         from lm_eval.utils import make_table  # pylint: disable=E0401
-        from transformers import AutoModelForCausalLM, AutoTokenizer
 
-        tokenizer = AutoTokenizer.from_pretrained(model, gguf_file=gguf_file)
-
-        logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.")
-        if eval_model_dtype == "float32" or eval_model_dtype == "auto":
-            logger.warning(
-                "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
-                " but may affect accuracy."
-            )
-        model = AutoModelForCausalLM.from_pretrained(
-            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
-        )
-        model.eval()
         st = time.time()
         res = simple_evaluate_user_model(
             model,
@@ -224,6 +184,79 @@ def eval(args):
         print("evaluation running time=%ds" % (time.time() - st))
 
 
+def eval_with_vllm(args):
+    """Evaluate ``args.model`` on ``args.tasks`` with the lm-eval vLLM backend and print the result table.
+
+    Entries parsed from ``args.vllm_args`` override the base keyword arguments. Unless they set
+    ``tensor_parallel_size``, it defaults to the number of numeric ids listed in ``args.device_map``.
+    """
+    import time
+
+    from lm_eval import evaluator  # pylint: disable=E0401
+    from lm_eval.models.vllm_causallms import VLLM  # pylint: disable=E0401
+    from lm_eval.models.vllm_vlms import VLLM_VLM  # pylint: disable=E0401
+    from lm_eval.utils import make_table  # pylint: disable=E0401
+
+    from auto_round.logger import logger
+
+    st = time.time()
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    device_str, _ = get_device_and_parallelism(args.device_map)
+    eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto")
+    if (batch_size := args.eval_bs) is None:
+        batch_size = "auto:8"
+    # Always bind `tasks`: a non-string value (e.g. an already split list) is forwarded unchanged.
+    tasks = args.tasks.split(",") if isinstance(args.tasks, str) else args.tasks
+
+    # Parse custom vllm_args if provided
+    custom_vllm_kwargs = parse_vllm_args(getattr(args, "vllm_args", None))
+
+    # Build vllm kwargs with base parameters
+    vllm_kwargs = {
+        "pretrained": args.model,
+        "dtype": eval_model_dtype,
+        "trust_remote_code": not args.disable_trust_remote_code,
+        "add_bos_token": args.add_bos_token,
+        "device": device_str,
+        "batch_size": batch_size,
+        "allow_deprecated_quantization": True,  # for vLLM==0.14.0
+    }
+
+    # Override with custom vllm_args if provided
+    if custom_vllm_kwargs:
+        logger.info(f"Overriding VLLM parameters with custom args: {custom_vllm_kwargs}")
+        vllm_kwargs.update(custom_vllm_kwargs)
+
+    device = get_device_str()
+    environ_mapping = {
+        "cuda": "CUDA_VISIBLE_DEVICES",
+        "xpu": "ZE_AFFINITY_MASK",
+        "hpu": "HABANA_VISIBLE_MODULES",
+    }
+    if "tensor_parallel_size" not in vllm_kwargs:
+        # Derive tensor_parallel_size from device_map; only formats like "0" or "0,1,2" are accepted
+        assert device in environ_mapping, f"Device {device} not supported for vllm tensor parallelism."
+        environ_name = environ_mapping[device]
+        device_ids = [d.strip() for d in str(args.device_map).split(",") if d.strip().isdigit()]
+        if device_ids:
+            os.environ[environ_name] = ",".join(device_ids)
+            tensor_parallel_size = len(device_ids)
+            vllm_kwargs["tensor_parallel_size"] = tensor_parallel_size
+            logger.info(
+                f"Set {environ_name}={os.environ[environ_name]}, " f"tensor_parallel_size={tensor_parallel_size}"
+            )
+
+    vllm_lm = VLLM_VLM(**vllm_kwargs) if args.mllm else VLLM(**vllm_kwargs)
+    res = evaluator.simple_evaluate(
+        model=vllm_lm,
+        tasks=tasks,
+        limit=args.limit,
+    )
+
+    print(make_table(res))
+    print("evaluation running time=%ds" % (time.time() - st))
+
+
 def eval_task_by_task(
     model,
     device=None,
@@ -256,34 +289,17 @@ def eval_task_by_task(
 
     if batch_size is None:
         batch_size = "auto:8"
-    is_gguf_file = False
+
     if not isinstance(model, str):
         parallelism = False
+        is_gguf_file = False
+        gguf_file = None
     else:
-        if os.path.isfile(model) and model.endswith(".gguf"):
-            is_gguf_file = True
-            gguf_file = os.path.basename(model)
-            model = os.path.dirname(model)
-        else:
-            for file in os.listdir(model):
-                if file.endswith(".gguf"):
-                    is_gguf_file = True
-                    gguf_file = file
-    eval_model_dtype = get_model_dtype(eval_model_dtype)
-    if is_gguf_file:
-        tokenizer = AutoTokenizer.from_pretrained(model, gguf_file=gguf_file)
-        logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.")
-        if eval_model_dtype == "float32" or eval_model_dtype == "auto":
-            logger.warning(
-                "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
-                " but may affect accuracy."
-            )
+        model, tokenizer, is_gguf_file, gguf_file = _load_gguf_model_if_needed(model, eval_model_dtype)
+        if is_gguf_file:
+            parallelism = False
 
-        model = AutoModelForCausalLM.from_pretrained(
-            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
-        )
-        model.eval()
-        parallelism = False
+    eval_model_dtype = get_model_dtype(eval_model_dtype)
     if mllm:
         if batch_size is None or batch_size == "auto":
             logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16")
@@ -314,18 +330,93 @@ def eval_task_by_task(
             add_bos_token=add_bos_token,
         )
 
-    if isinstance(tasks, str):
-        tasks = tasks.replace(" ", "").split(",")
+    _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry_times)
 
+
+def _load_gguf_model_if_needed(model_path, eval_model_dtype=None):
+    """Load the model and tokenizer when ``model_path`` is a GGUF file or a directory containing one.
+
+    Args:
+        model_path: Path to a ``.gguf`` file or a model directory; any other value is passed through
+        eval_model_dtype: Data type for model evaluation, resolved with ``get_model_dtype``
+
+    Returns:
+        Tuple of (model, tokenizer, is_gguf_file, gguf_file_name)
+        If not a GGUF file, returns (model_path, None, False, None)
+    """
+    from auto_round.utils import logger
+
+    is_gguf_file = False
+    gguf_file = None
+    tokenizer = None
+    model = model_path
+
+    if isinstance(model_path, str):
+        if os.path.isfile(model_path) and model_path.endswith(".gguf"):
+            is_gguf_file = True
+            gguf_file = os.path.basename(model_path)
+            # A bare file name has an empty dirname; use "." so it is looked up in the current directory.
+            model = os.path.dirname(model_path) or "."
+        elif os.path.isdir(model_path):
+            for file in os.listdir(model_path):
+                if file.endswith(".gguf"):
+                    is_gguf_file = True
+                    gguf_file = file
+                    break
+
+    if is_gguf_file:
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        eval_model_dtype = get_model_dtype(eval_model_dtype)
+        tokenizer = AutoTokenizer.from_pretrained(model, gguf_file=gguf_file)
+
+        logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.")
+        if eval_model_dtype == "float32" or eval_model_dtype == "auto":
+            logger.warning(
+                "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
+                " but may affect accuracy."
+            )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+        )
+        model.eval()
+
+    return model, tokenizer, is_gguf_file, gguf_file
+
+
+def _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry_times):
+    """Evaluate tasks with automatic retry on OOM errors.
+
+    Args:
+        tasks: List of task names to evaluate
+        hflm: HuggingFace LM model instance
+        device_str: Device string for evaluation
+        batch_size: Batch size for evaluation
+        limit: Limit number of examples per task
+        retry_times: Number of retry attempts on failure
+
+    Returns:
+        Aggregated results dictionary containing results, versions, n-shot, and higher_is_better
+    """
+    import time
+    import traceback
+
+    from lm_eval import simple_evaluate as lm_simple_evaluate  # pylint: disable=E0401
     from lm_eval.utils import make_table  # pylint: disable=E0401
 
+    from auto_round.utils import logger
+
+    if isinstance(tasks, str):
+        tasks = tasks.replace(" ", "").split(",")
+
     res_all = {}
     res_keys = ["results", "versions", "n-shot", "higher_is_better"]
-    import time
-
     st = time.time()
+
     for task in tasks:
-        while retry_times:
+        current_retry_times = retry_times
+        while current_retry_times:
             try:
                 res = lm_simple_evaluate(
                     model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit
@@ -350,7 +441,7 @@ def eval_task_by_task(
                     logger.error(cuda_error_msg)
                     traceback.print_exc()
                     break
-            retry_times -= 1
+            current_retry_times -= 1
 
         if not res_all:
             res_all = res
@@ -365,49 +456,63 @@ def eval_task_by_task(
     print("total eval time:", time.time() - st)
 
 
-def eval_with_vllm(args):
-    import time
+def parse_vllm_args(vllm_args_str):
+    """Parse custom vllm arguments from string format.
 
-    from lm_eval import evaluator  # pylint: disable=E0401
-    from lm_eval.models.vllm_causallms import VLLM  # pylint: disable=E0401
-    from lm_eval.utils import make_table  # pylint: disable=E0401
+    Args:
+        vllm_args_str: Comma-separated "key=value" pairs; a leading "--" and dashes in keys are accepted,
+                      e.g. "--arg1=value1,--arg2=value2" or "arg1=value1,arg2=value2"
 
-    st = time.time()
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    device_str, _ = get_device_and_parallelism(args.device_map)
-    eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto")
-    if (batch_size := args.eval_bs) is None:
-        batch_size = "auto:8"
+    Returns:
+        Dictionary of parsed arguments typed as bool, int, float or str; malformed entries are skipped
 
-    vllm_lm = VLLM(
-        pretrained=args.model,
-        dtype=eval_model_dtype,
-        revision=args.revision,
-        trust_remote_code=not args.disable_trust_remote_code,
-        tokenizer=args.tokenizer,
-        tokenizer_mode=args.tokenizer_mode,
-        tokenizer_revision=args.tokenizer_revision,
-        add_bos_token=args.add_bos_token,
-        prefix_token_id=args.prefix_token_id,
-        tensor_parallel_size=args.tensor_parallel_size,
-        quantization=args.quantization,
-        max_gen_toks=args.max_gen_toks,
-        swap_space=args.swap_space,
-        batch_size=batch_size,
-        max_batch_size=args.max_batch_size,
-        max_length=args.max_length,
-        max_model_len=args.max_model_len,
-        seed=args.seed,
-        gpu_memory_utilization=args.gpu_memory_utilization,
-        device=device_str,
-        data_parallel_size=args.data_parallel_size,
-        lora_local_path=args.lora_local_path,
-    )
-    res = evaluator.simple_evaluate(
-        model=vllm_lm,
-        tasks=args.tasks,
-        limit=args.limit,
-    )
+    Example:
+        >>> parse_vllm_args("--tensor_parallel_size=2,--gpu_memory_utilization=0.9")
+        {'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.9}
+    """
+    from auto_round.logger import logger
 
-    print(make_table(res))
-    print("evaluation running time=%ds" % (time.time() - st))
+    custom_vllm_kwargs = {}
+
+    if not vllm_args_str:
+        return custom_vllm_kwargs
+
+    logger.info(f"Parsing custom vllm arguments: {vllm_args_str}")
+
+    for arg_pair in vllm_args_str.split(","):
+        arg_pair = arg_pair.strip()
+        if not arg_pair:
+            # Tolerate empty segments such as a trailing comma.
+            continue
+
+        # Normalize: replace space separator with '=' (e.g., "--arg value" -> "--arg=value")
+        if "=" not in arg_pair and " " in arg_pair:
+            arg_pair = "=".join(arg_pair.split(None, 1))
+
+        key, sep, value = arg_pair.partition("=")
+        # Accept CLI spelling: drop leading dashes and map "some-arg" to the keyword name "some_arg".
+        key = key.strip().lstrip("-").replace("-", "_")
+        value = value.strip()
+        if not sep or not key:
+            # Report entries without a key or value instead of dropping them silently.
+            logger.warning(f"  Ignoring malformed vllm arg '{arg_pair}', expected 'key=value'")
+            continue
+
+        # Pick the narrowest matching type: bool, then int, then float; otherwise keep the string.
+        parsed = value
+        if value.lower() in ("true", "false"):
+            parsed = value.lower() == "true"
+        elif any(ch.isdigit() for ch in value):
+            # Requiring a digit keeps words such as "nan" or "inf" as plain strings.
+            # int() goes first so "2" stays an integer; float() then covers "0.9" and "1e-3".
+            for caster in (int, float):
+                try:
+                    parsed = caster(value)
+                    break
+                except ValueError:
+                    continue
+
+        custom_vllm_kwargs[key] = parsed
+        logger.info(f"  Parsed vllm arg: {key}={parsed} (type: {type(parsed).__name__})")
+
+    return custom_vllm_kwargs
diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py
index 90f489ccf..1f4df7b69 100644
--- a/auto_round/eval/evaluation.py
+++ b/auto_round/eval/evaluation.py
@@ -15,14 +15,10 @@
 import os
 from typing import Optional, Union
 
-from lm_eval import simple_evaluate as lm_simple_evaluate  # pylint: disable=E0401
-
 from auto_round.logger import logger
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-from lm_eval.models.huggingface import HFLM  # pylint: disable=E0401
-
 
 def simple_evaluate_user_model(
     user_model,
@@ -33,8 +29,11 @@ def simple_evaluate_user_model(
     eval_model_dtype="auto",
     add_bos_token: bool = False,
     mllm: bool = False,
-    **kwargs
+    **kwargs,
 ):
+    from lm_eval import simple_evaluate as lm_simple_evaluate  # pylint: disable=E0401
+    from lm_eval.models.huggingface import HFLM  # pylint: disable=E0401
+
     if mllm:
         from lm_eval.models.hf_vlms import HFMultimodalLM  # pylint: disable=E0401
 
@@ -70,12 +69,9 @@ def simple_evaluate(
     limit: Optional[Union[int, float]] = None,
     max_batch_size: Optional[int] = None,
     device: Optional[str] = None,
-    **kwargs
+    **kwargs,
 ):
-    try:
-        from transformers import AutoRoundConfig
-    except:
-        from auto_round.inference.auto_quantizer import AutoHfQuantizer
+    from lm_eval import simple_evaluate as lm_simple_evaluate  # pylint: disable=E0401
 
     return lm_simple_evaluate(
         model=model,
@@ -84,5 +80,357 @@ def simple_evaluate(
         limit=limit,
         max_batch_size=max_batch_size,
         device=device,
-        **kwargs
+        **kwargs,
+    )
+
+
+def evaluate_diffusion_model(autoround, model, args):
+    """
+    Generate images with the quantized diffusion pipeline and optionally score them.
+
+    Args:
+        autoround: AutoRound instance; its ``pipe`` attribute is the pipeline that is reused
+        model: Quantized model, installed as ``pipe.transformer``
+        args: Command line arguments (``eval_model_dtype``, ``guidance_scale``, ``num_inference_steps``,
+            ``generator_seed``, ``image_save_dir``, ``prompt``, ``prompt_file`` and ``metrics`` are read)
+    """
+    import torch
+
+    from auto_round.utils import detect_device, get_model_dtype, logger
+
+    # Prepare inference pipeline
+    pipe = autoround.pipe
+    pipe.to(model.dtype)
+    pipe.transformer = model
+    device_str = detect_device(getattr(args, "device_map", "0"))
+    pipe = pipe.to(device_str)
+
+    # Set evaluation dtype ("auto" leaves the pipeline dtype untouched)
+    eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto")
+    if pipe.dtype != eval_model_dtype and eval_model_dtype != "auto":
+        pipe.to(getattr(torch, eval_model_dtype))
+
+    # Prepare generation kwargs; a seeded generator is only built when a seed was supplied
+    generator = None
+    if args.generator_seed is not None:
+        generator = torch.Generator(device=pipe.device).manual_seed(args.generator_seed)
+    gen_kwargs = {
+        "guidance_scale": args.guidance_scale,
+        "output_type": "pil",
+        "num_inference_steps": args.num_inference_steps,
+        "generator": generator,
+    }
+
+    # Create image save directory. ``exist_ok=True`` replaces the separate existence check, so a
+    # directory created by another process between check and creation no longer raises.
+    os.makedirs(args.image_save_dir, exist_ok=True)
+
+    # Single prompt generation
+    if args.prompt is not None:
+        outputs = pipe(prompt=args.prompt, **gen_kwargs)
+        save_path = os.path.join(args.image_save_dir, "img.png")
+        outputs.images[0].save(save_path)
+        logger.info(f"Image generated with prompt {args.prompt} is saved as {save_path}")
+
+    # Batch prompt evaluation
+    if args.prompt_file is not None:
+        from auto_round.compressors.diffusion import diffusion_eval
+
+        metrics = args.metrics.split(",")
+        diffusion_eval(pipe, args.prompt_file, metrics, args.image_save_dir, 1, gen_kwargs)
+
+
+def load_gguf_model_for_eval(eval_folder, formats, args):
+    """
+    Load the exported GGUF model and its tokenizer for evaluation.
+
+    Args:
+        eval_folder: Path to saved model
+        formats: List of export formats; the first entry starting with "gguf" selects the file
+        args: Command line arguments (only ``eval_model_dtype`` is read)
+
+    Returns:
+        model, tokenizer: Loaded model and tokenizer
+
+    The process exits with status -1 when no gguf format or no matching gguf file is found.
+    """
+    import sys
+
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    from auto_round.utils import get_model_dtype, logger
+
+    # Find corresponding GGUF format: the part after the last ":" in upper case
+    gguf_format = None
+    for fmt in formats:
+        if fmt.startswith("gguf"):
+            gguf_format = fmt.split(":")[-1].upper()
+            break
+
+    if gguf_format is None:
+        logger.error("No valid gguf format found in formats. Please check the input.")
+        sys.exit(-1)
+
+    # Find matching GGUF file. Only names ending in "gguf" qualify, so an unrelated file whose
+    # name merely contains the format tag is never passed as ``gguf_file``; sorting makes the
+    # pick deterministic because ``os.listdir`` returns entries in arbitrary order.
+    gguf_files = sorted(name for name in os.listdir(eval_folder) if name.endswith("gguf"))
+    gguf_file = next((name for name in gguf_files if gguf_format in name), None)
+    if gguf_file is None:
+        logger.error("Cannot find correct gguf file for evaluation, please check.")
+        sys.exit(-1)
+
+    # Load model and tokenizer
+    logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.")
+    eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto")
+
+    if eval_model_dtype in ["float32", "auto"]:
+        logger.warning(
+            "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model,"
+            " but may affect accuracy."
+        )
+
+    model = AutoModelForCausalLM.from_pretrained(
+        eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
     )
+    model.eval()
+    tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file)
+
+    return model, tokenizer
+
+
+def prepare_model_for_eval(model, device_str, eval_model_dtype):
+    """
+    Place the model on its evaluation device(s) and cast it to the evaluation dtype.
+
+    Args:
+        model: Quantized model
+        device_str: Device string, used only when the model is not spread over several devices
+        eval_model_dtype: Name of a ``torch`` dtype attribute, or "auto" to skip the cast
+
+    Returns:
+        model: Prepared model
+    """
+    import torch
+
+    from auto_round.utils import detect_device
+
+    spans_devices = hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1
+    if not spans_devices:
+        # Single device model: resolve the target device and move everything there
+        model = model.to(detect_device(device_str))
+    else:
+        # Multi-device model: dispatch according to the device map it already carries
+        from accelerate.big_modeling import dispatch_model
+
+        dispatch_model(model, model.hf_device_map)
+
+    # Convert dtype
+    if model.dtype != eval_model_dtype and eval_model_dtype != "auto":
+        model.to(getattr(torch, eval_model_dtype))
+
+    return model
+
+
+def evaluate_with_model_instance(model, tokenizer, device_str, args):
+    """
+    Run lm-eval on an in-memory model object.
+    Used for fake quantization and GGUF models.
+
+    Args:
+        model: Model instance
+        tokenizer: Tokenizer
+        device_str: Device string
+        args: Command line arguments
+    """
+    import time
+
+    from lm_eval.utils import make_table  # pylint: disable=E0401
+
+    from auto_round.eval.eval_cli import eval_task_by_task
+    from auto_round.utils import get_model_dtype, logger
+
+    task_list = args.tasks.split(",") if isinstance(args.tasks, str) else args.tasks
+    dtype_name = get_model_dtype(args.eval_model_dtype, "auto")
+
+    # Task-by-task evaluation receives the raw ``args.tasks`` value
+    if args.eval_task_by_task:
+        eval_task_by_task(
+            model,
+            tokenizer=tokenizer,
+            device=device_str,
+            tasks=args.tasks,
+            limit=args.limit,
+            batch_size=args.eval_bs,
+            eval_model_dtype=dtype_name,
+            add_bos_token=args.add_bos_token,
+        )
+        return
+
+    # Batch evaluation
+    batch_size = args.eval_bs
+    if batch_size is None or batch_size == "auto":
+        logger.warning("This API does not support auto currently, reset eval_bs to 16")
+        batch_size = 16
+
+    start = time.time()
+    results = simple_evaluate_user_model(
+        model,
+        tokenizer,
+        tasks=task_list,
+        batch_size=batch_size,
+        limit=args.limit,
+        device=device_str,
+        eval_model_dtype=dtype_name,
+        add_bos_token=args.add_bos_token,
+    )
+    print(make_table(results))
+    print("evaluation running time=%ds" % (time.time() - start))
+
+
+def evaluate_with_model_path(eval_folder, device_str, autoround, args):
+    """
+    Run lm-eval on a model that is loaded from its export folder.
+    Applicable to the quantization formats that are not evaluated from a model instance.
+
+    Args:
+        eval_folder: Path to saved model
+        device_str: Device string
+        autoround: AutoRound instance; its ``mllm`` flag selects the multimodal path
+        args: Command line arguments
+    """
+    import time
+
+    from lm_eval.utils import make_table  # pylint: disable=E0401
+
+    from auto_round.eval.eval_cli import _eval_init, eval_task_by_task
+    from auto_round.utils import get_model_dtype, logger
+
+    # Both paths need the resolved evaluation dtype, so compute it once
+    eval_dtype = get_model_dtype(args.eval_model_dtype, "auto")
+
+    # Task-by-task evaluation
+    if args.eval_task_by_task:
+        eval_task_by_task(
+            eval_folder,
+            device=device_str,
+            tasks=args.tasks,
+            batch_size=args.eval_bs,
+            limit=args.limit,
+            eval_model_dtype=eval_dtype,
+            mllm=autoround.mllm,
+            add_bos_token=args.add_bos_token,
+        )
+        return
+
+    # Batch evaluation: _eval_init returns the tasks, a model_args string and the device string
+    tasks, model_args, device_str = _eval_init(
+        args.tasks,
+        eval_folder,
+        args.device_map,
+        args.disable_trust_remote_code,
+        dtype=eval_dtype,
+    )
+
+    st = time.time()
+    model_args += f",add_bos_token={args.add_bos_token}"
+
+    # Choose evaluation method based on model type
+    eval_bs = args.eval_bs
+    if autoround.mllm:
+        model_type = "hf-multimodal"
+        # The multimodal backend gets a fixed batch size when none or "auto" was requested
+        if eval_bs is None or eval_bs == "auto":
+            logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16")
+            eval_bs = 16
+    else:
+        model_type = "hf"
+
+    res = simple_evaluate(
+        model=model_type,
+        model_args=model_args,
+        tasks=tasks,
+        device=device_str,
+        batch_size=eval_bs,
+        limit=args.limit,
+    )
+    print(make_table(res))
+    print("evaluation running time=%ds" % (time.time() - st))
+
+
+def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_str, args):
+    """
+    Unified evaluation entry point.
+    Routes to the diffusion, vllm, model-instance or model-path evaluation depending on the inputs.
+
+    Args:
+        model: Quantized model
+        tokenizer: Tokenizer
+        autoround: AutoRound instance
+        folders: List of export folders; the last one is the folder that gets evaluated
+        formats: List of export formats
+        device_str: Device string
+        args: Command line arguments
+    """
+    from auto_round.utils import get_library_version, get_model_dtype, logger
+
+    # Handle diffusion models separately
+    if getattr(autoround, "diffusion", False):
+        evaluate_diffusion_model(autoround, model, args)
+        return
+
+    # Language models: nothing to do without tasks or without an export folder
+    eval_folder = folders[-1] if folders else None
+    if args.tasks is None or args.tasks == "" or eval_folder is None:
+        return
+
+    # Handle vllm backend evaluation
+    if getattr(args, "eval_backend", None) == "vllm":
+        from auto_round.eval.eval_cli import eval_with_vllm
+
+        # Create a minimal args object with essential parameters
+        vllm_args = type("Args", (), {})()
+        vllm_args.model = eval_folder
+        vllm_args.tasks = args.tasks
+        vllm_args.device_map = getattr(args, "device_map", device_str)
+        # Remaining parameters are copied from ``args`` with a fallback for missing attributes
+        fallbacks = (
+            ("eval_bs", None),
+            ("mllm", None),
+            ("limit", None),
+            ("eval_model_dtype", None),
+            ("disable_trust_remote_code", False),
+            ("add_bos_token", False),
+            ("seed", 42),
+            ("vllm_args", None),
+        )
+        for attr_name, fallback in fallbacks:
+            setattr(vllm_args, attr_name, getattr(args, attr_name, fallback))
+        eval_with_vllm(vllm_args)
+        return
+
+    logger.info(f"Using lm-eval version {get_library_version('lm-eval')}")
+
+    # Handle Llama model special case
+    if "llama" in args.model.lower() and not args.add_bos_token:
+        logger.warning("set add_bos_token=True for llama model.")
+        args.add_bos_token = True
+
+    # Check if GGUF model
+    eval_gguf_model = any(name.endswith("gguf") for name in os.listdir(eval_folder))
+
+    # Determine if model instance evaluation is needed
+    need_model_instance = (autoround.act_bits <= 8 and formats[-1] == "fake") or eval_gguf_model
+
+    if not need_model_instance:
+        evaluate_with_model_path(eval_folder, device_str, autoround, args)
+        return
+
+    # Load or prepare model instance
+    if eval_gguf_model:
+        model, tokenizer = load_gguf_model_for_eval(eval_folder, formats, args)
+    else:
+        eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto")
+        model = prepare_model_for_eval(model, device_str, eval_model_dtype)
+
+    evaluate_with_model_instance(model, tokenizer, device_str, args)
diff --git a/auto_round/inference/auto_quantizer.py b/auto_round/inference/auto_quantizer.py
deleted file mode 100644
index 33ab74d8d..000000000
--- a/auto_round/inference/auto_quantizer.py
+++ /dev/null
@@ -1,379 +0,0 @@
-# Copyright (c) 2024 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc. team and GPTQ and AutoGPTQ authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import importlib.util
-import warnings
-from dataclasses import dataclass
-from enum import Enum
-from logging import getLogger
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import torch
-import torch.nn as nn
-from packaging import version
-from transformers.modeling_utils import PreTrainedModel
-from transformers.quantizers import AutoQuantizationConfig, HfQuantizer
-from transformers.quantizers.auto import AUTO_QUANTIZER_MAPPING
-from transformers.utils.quantization_config import AwqConfig, GPTQConfig, QuantizationConfigMixin, QuantizationMethod
-
-from auto_round.inference.convert_model import convert_hf_model, infer_target_device, post_init
-from auto_round.utils import is_hpex_available
-
-logger = getLogger(__name__)
-import sys
-
-if sys.version_info < (3, 8):
-    import importlib_metadata
-else:
-    import importlib.metadata as importlib_metadata
-
-AUTOROUND_MINIMUM_VERSION = version.parse("0.2")
-
-
-def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]:
-    # Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version
-    try:  # TODO remove it later
-        import auto_round
-
-        return True, auto_round.__version__
-    except:
-        pass
-
-    package_exists = importlib.util.find_spec(pkg_name) is not None
-    package_version = "N/A"
-    if package_exists:
-        try:
-            package_version = importlib.metadata.version(pkg_name)
-            package_exists = True
-        except importlib.metadata.PackageNotFoundError:
-            package_exists = False
-    if return_version:
-        return package_exists, package_version
-    else:
-        return package_exists
-
-
-_auto_round_available = _is_package_available("auto_round")
-
-
-def is_auto_round_available():
-    try:
-        import auto_round
-
-        return True
-    except:
-        pass
-    if _auto_round_available:
-        version_autoround = version.parse(importlib_metadata.version("auto_round"))
-        if AUTOROUND_MINIMUM_VERSION < version_autoround:
-            return True
-        else:
-            raise ImportError(
-                f"Found an incompatible version of auto-round. Found version {version_autoround},"
-                f" but only version above {AUTOROUND_MINIMUM_VERSION} are supported"
-            )
-
-
-class AutoHfQuantizer:
-    """The Auto-HF quantizer class that takes care of automatically instantiating to the correct
-    `HfQuantizer` given the `QuantizationConfig`."""
-
-    @classmethod
-    def from_config(cls, quantization_config: Union[QuantizationConfigMixin, Dict], **kwargs):
-        # Convert it to a QuantizationConfig if the q_config is a dict
-        if isinstance(quantization_config, dict):
-            if "auto-round" in quantization_config["quant_method"]:
-                quantization_config = AutoRoundConfig.from_dict(quantization_config)
-            else:
-                quantization_config = AutoQuantizationConfig.from_dict(quantization_config)  # pylint: disable=E1101
-        quant_method = quantization_config.quant_method
-
-        # Again, we need a special care for bnb as we have a single quantization config
-        # class for both 4-bit and 8-bit quantization
-        if quant_method == QuantizationMethod.BITS_AND_BYTES:
-            if quantization_config.load_in_8bit:
-                quant_method += "_8bit"
-            else:
-                quant_method += "_4bit"
-
-        if quant_method not in AUTO_QUANTIZER_MAPPING.keys() and "auto-round" not in quant_method:
-            raise ValueError(
-                f"Unknown quantization type, got {quant_method} - supported types are:"
-                f" {list(AUTO_QUANTIZER_MAPPING.keys())}"
-            )
-        if "auto-round" in quant_method or is_hpex_available():  # pragma: no cover
-            target_cls = AutoRoundQuantizer
-        else:
-            target_cls = AUTO_QUANTIZER_MAPPING[quant_method]
-
-        return target_cls(quantization_config, **kwargs)
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        quantization_config = AutoQuantizationConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        return cls.from_config(quantization_config)
-
-    @classmethod
-    def merge_quantization_configs(
-        cls,
-        quantization_config: Union[dict, QuantizationConfigMixin],
-        quantization_config_from_args: Optional[QuantizationConfigMixin],
-    ):
-        """Handles situations where both quantization_config
-        from args and quantization_config from model config are present."""
-        if quantization_config_from_args is not None:
-            warning_msg = (
-                "You passed `quantization_config` or equivalent parameters to "
-                "`from_pretrained` but the model you're loading"
-                " already has a `quantization_config` attribute. The `quantization_config` from the model will be used."
-            )
-        else:
-            warning_msg = ""
-        if quantization_config_from_args is None or not hasattr(
-            quantization_config_from_args, "get_loading_attributes"
-        ):
-            # If the quantization_config_from_args is None or does not have get_loading_attributes method,
-            # we will not use it to load the model.
-            quantization_config_from_args = None
-        else:
-            loading_attr_dict = quantization_config_from_args.get_loading_attributes()
-
-        if isinstance(quantization_config, dict):
-            if (
-                "auto-round" in quantization_config["quant_method"]
-                or quantization_config_from_args.__class__.__name__ == "AutoRoundConfig"
-            ):
-                quantization_config = AutoRoundConfig.from_dict(quantization_config)
-            else:
-                quantization_config = AutoQuantizationConfig.from_dict(quantization_config)  # pylint: disable=E1101
-
-        if (
-            isinstance(quantization_config, (GPTQConfig, AwqConfig, AutoRoundConfig))
-            and quantization_config_from_args is not None
-        ):
-            # special case for GPTQ / AWQ config collision
-
-            for attr, val in loading_attr_dict.items():
-                setattr(quantization_config, attr, val)
-            warning_msg += (
-                f"However, loading attributes (e.g. {list(loading_attr_dict.keys())}) "
-                f"will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored."
-            )
-
-        if warning_msg != "":
-            warnings.warn(warning_msg)
-
-        return quantization_config
-
-    @staticmethod
-    def supports_quant_method(quantization_config_dict):
-        from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING
-
-        AUTO_QUANTIZATION_CONFIG_MAPPING["auto-round"] = AutoRoundConfig
-        AUTO_QUANTIZATION_CONFIG_MAPPING["auto_round"] = AutoRoundConfig
-        quant_method = quantization_config_dict.get("quant_method", None)
-        if quantization_config_dict.get("load_in_8bit", False) or quantization_config_dict.get("load_in_4bit", False):
-            suffix = "_4bit" if quantization_config_dict.get("load_in_4bit", False) else "_8bit"
-            quant_method = QuantizationMethod.BITS_AND_BYTES + suffix
-        elif quant_method is None:
-            raise ValueError(
-                "The model's quantization config from the arguments has no `quant_method` attribute."
-                "Make sure that the model has been correctly quantized"
-            )
-
-        if quant_method not in AUTO_QUANTIZATION_CONFIG_MAPPING.keys():
-            logger.warning(
-                f"Unknown quantization type, got {quant_method} - supported types are:"
-                f" {list(AUTO_QUANTIZER_MAPPING.keys())}. Hence, we will skip the quantization. "
-                "To remove the warning, you can delete the quantization_config attribute in config.json"
-            )
-            return False
-        return True
-
-
-class AutoRoundQuantizationMethod(str, Enum):
-    AutoRound = "auto-round"
-
-
-@dataclass
-class AutoRoundConfig(QuantizationConfigMixin):
-    """This is a wrapper class about all possible attributes and features that you can play with a model that has been
-    loaded AutoRound quantization.
-
-    Args:
-        bits (`int`):
-            The number of bits to quantize to, supported numbers are (2, 3, 4, 8).
-        tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*):
-            The tokenizer used to process the dataset. You can pass either:
-                - A custom tokenizer object.
-                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
-                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
-                    using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
-    """
-
-    def __init__(
-        self,
-        bits: int = 4,
-        tokenizer: Any = None,
-        dataset: str = None,
-        group_size: int = 128,
-        sym: bool = False,
-        backend="auto",
-        layer_config: dict = None,
-        **kwargs,
-    ):
-
-        self.bits = bits
-        self.tokenizer = tokenizer
-        self.dataset = dataset
-        self.group_size = group_size
-        self.sym = sym
-        self.packing_format = "auto_round:auto_gptq"
-        self.backend = backend
-        self.layer_config = layer_config
-        if kwargs is not None:
-            for key in kwargs.keys():
-                setattr(self, key, kwargs[key])
-        self.quant_method = AutoRoundQuantizationMethod.AutoRound
-        self.post_init()
-
-    def post_init(self):
-        r"""Safety checker that arguments are correct."""
-        if self.bits not in [2, 3, 4, 8]:
-            raise ValueError(f"Only support quantization to [2,3,4,8] bits but found {self.bits}")
-        if self.group_size != -1 and self.group_size <= 0:
-            raise ValueError("group_size must be greater than 0 or equal to -1")
-
-    def get_loading_attributes(self):
-        loading_attributes_dict = {"backend": self.backend}
-        return loading_attributes_dict
-
-    def to_dict(self):
-        config_dict = super().to_dict()
-        return config_dict
-
-    @classmethod
-    def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs):
-        quant_method = config_dict["quant_method"]
-        if "auto-round" not in quant_method and "gptq" not in quant_method and "awq" not in quant_method:
-            raise NotImplementedError(
-                "Failed to convert to auto_round format. Only `gptqv1`, `awq`, and `auto-round` formats are supported."
-            )
-
-        if "gptq" in quant_method and "meta" in config_dict:
-            raise NotImplementedError("Failed to convert gptq format to auto_round format. Only supports `gptqv1`")
-
-        if "awq" in quant_method and config_dict.get("version", "gemm") != "gemm":
-            raise NotImplementedError(
-                "Failed to convert awq format to auto_round format. Only supports  awq format with gemm version"
-            )
-
-        if "auto-round" not in quant_method:
-            config_dict["packing_format"] = f"auto_round:{quant_method}"
-
-        return super().from_dict(config_dict, return_unused_kwargs=return_unused_kwargs, **kwargs)
-
-
-class AutoRoundQuantizer(HfQuantizer):
-    """Quantizer of the AutoRound method, currently only triton and exllamav2 backend has been supported."""
-
-    requires_calibration = False
-    required_packages = ["auto_round"]
-    optimum_quantizer = None
-
-    def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
-        self.device_map = None
-        super().__init__(quantization_config, **kwargs)
-
-    def validate_environment(self, *args, **kwargs):
-        self.device_map = kwargs.get("device_map", None)
-        if not is_auto_round_available():
-            raise ImportError(
-                "Loading a AutoRound quantized model requires auto-round library (`pip install " "auto-round`)"
-            )
-        else:
-            try:
-                import auto_round
-
-                autoround_version = version.parse(auto_round.__version__)
-            except:
-                autoround_version = version.parse(importlib.metadata.version("auto_round"))
-            if autoround_version < version.parse("0.2.0"):
-                raise ImportError(
-                    "You need a version of auto_round > 0.2.0 to use AutoRound: `pip install --upgrade "
-                    "auto-round` or install from source"
-                )
-
-    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
-        if torch_dtype is None:
-            torch_dtype = torch.bfloat16
-        return torch_dtype
-
-    def post_init_model(self, model):
-        """Post-initialization that require device information, for example buffers initialization on device.
-
-        Args:
-            model (`nn.Module`):
-                The input model
-        """
-
-        class StoreAttr(object):
-            pass
-
-        model.quantize_config = StoreAttr()
-
-        post_init(model, self.used_backends)
-
-    def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
-        if self.pre_quantized:
-            target_device = infer_target_device(self.device_map)
-            model, used_backends = convert_hf_model(model, target_device)
-            self.used_backends = used_backends
-
-    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
-        if self.pre_quantized:
-            self.post_init_model(model)
-        else:
-            raise NotImplementedError
-
-    @property
-    def is_trainable(self, model: Optional["PreTrainedModel"] = None):
-        return True
-
-    @property
-    def is_serializable(self):
-        return True
-
-
-import transformers
-
-if version.parse(transformers.__version__) < version.parse("4.38.0"):
-    logger.error("Please upgrade transformers>=4.38.0 to support lm-head quantization")
-
-transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer
-transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index de9ea903f..c3dcbf59c 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -31,7 +31,7 @@
 import cpuinfo
 
 if TYPE_CHECKING:
-    from auto_quantizer import AutoRoundConfig
+    from transformers import AutoRoundConfig
 
 
 def get_cpu_manufacturer():
diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py
index 55e64565c..b25a6888e 100644
--- a/auto_round/utils/device.py
+++ b/auto_round/utils/device.py
@@ -1459,6 +1459,18 @@ def log_summary(self, msg: str = "", level: str = "info"):
         return summary
 
 
+def get_device_str():
+    """Return the auto-detected device type: "cuda", "xpu", "hpu" or "cpu", checked in that order."""
+    if torch.cuda.is_available():
+        return "cuda"
+    # ``torch.xpu`` is missing on some torch builds; probe for it instead of raising AttributeError
+    elif hasattr(torch, "xpu") and torch.xpu.is_available():  # pragma: no cover
+        return "xpu"
+    elif is_hpex_available():  # pragma: no cover
+        return "hpu"
+    return "cpu"  # pragma: no cover
+
+
 # Global singleton instance
 memory_monitor = MemoryMonitor()
 
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py
index cc505e976..eb168774e 100644
--- a/auto_round/utils/model.py
+++ b/auto_round/utils/model.py
@@ -725,7 +725,14 @@ def module_match_name_list(module, name_list):
         return any(name.lower() in type(module).__name__.lower() for name in name_list)
 
     if module_match_name_list(
-        module, ["Qwen2MoeSparseMoeBlock", "Qwen3MoeSparseMoeBlock", "DeepseekMoE", "DeepseekV2MoE", "DeepseekV3MoE"]
+        module,
+        [
+            "Qwen2MoeSparseMoeBlock",
+            "Qwen3MoeSparseMoeBlock",
+            "DeepseekMoE",
+            "DeepseekV2MoE",
+            "DeepseekV3MoE",
+        ],
     ):
         return ["gate_proj", "down_proj", "up_proj"]
     elif module_match_name_list(module, ["MixtralMoeSparseMoeBlock"]):
diff --git a/docs/step_by_step.md b/docs/step_by_step.md
index 54c47a2f2..40fff76d9 100644
--- a/docs/step_by_step.md
+++ b/docs/step_by_step.md
@@ -9,7 +9,7 @@ This document presents step-by-step instructions for auto-round llm quantization
   + [Customized Dataset](#customized-dataset)
   + [Dataset operations](#dataset-operations)
 * [3 Quantization](#3-quantization)
-  + [Supported Quantization Configurations](#supported-quantization-configurations)
+  + [Supported Quantization Schemes](#supported-quantization-schemes)
   + [Supported Export Formats](#supported-export-formats)
   + [Hardware Compatibility](#hardware-compatibility)
   + [Environment Configuration](#environment-configuration)
@@ -39,8 +39,9 @@ This document presents step-by-step instructions for auto-round llm quantization
   + [Specify Inference Backend](#specify-inference-backend)
   + [Convert GPTQ/AWQ to AutoRound](#convert-gptq-awq-to-autoround)
 * [5 Evaluation](#5-evaluation)
-  + [Combine evaluation with tuning](#combine-evaluation-with-tuning)
-  + [Eval the Quantized model](#eval-the-quantized-model)
+  + [Single GPU Evaluation](#single-gpu-evaluation)
+  + [Multi-GPU Evaluation](#multi-gpu-evaluation)
+  + [Important Notes](#important-notes)
 * [6 Known Issues](#6-known-issues)
 
 ## 1 Prerequisite
@@ -129,7 +130,7 @@ AutoRound supports several Schemes:
 
 Besides, you could modify the `group_size`, `bits`, `sym` and many other configs you want, though there are maybe no real kernels.
 
-### Supported export Formats
+### Supported Export Formats
 You can use command `auto_round list format` to show all supported formats with support scheme.
 
 **AutoRound Format**: This format is well-suited for CPU, Intel GPU, CUDA and HPU devices, 2 bits, as well as mixed-precision
@@ -744,47 +745,43 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=Fal
 
 ## 5 Evaluation
 
-### Combine evaluation with tuning
-
-- We leverage lm-eval-harnessing for the evaluation. 
-If not explicitly specify '--task', the default value will be used (typically covering 10+ common tasks).
-  ~~~bash
-   auto-round --model Qwen/Qwen3-0.6B  --bits 4 --format "auto_round,auto_gptq" --tasks mmlu
-  ~~~
-  The last format will be used in evaluation if multiple formats have been exported.
-
-
-###  Eval the Quantized model
-
-- AutoRound format
-  For lm-eval-harness, you could just call
-  ~~~bash
-  auto-round --model="your_model_path" --eval  --tasks lambada_openai --eval_bs 16
-  ~~~
-  > Note: To use the vllm backend, add `--eval_backend vllm` to the command above. Common vllm parameters are already supported, such as `--tensor_parallel_size`.
-
-  Multiple gpu evaluation
-  ~~~bash
-  auto-round --model="your_model_path" --eval  --device 0,1 --tasks lambada_openai --eval_bs 16
-  ~~~
-  For other evaluation framework, if the framework could support Huggingface models, typically it could support
-  AutoRound format, only you need to do is import the following in the beginning of your code
-  ~~~python
-  from auto_round import AutoRoundConfig
-  ~~~  
-
-- AutoGPTQ/AutoAWQ format
-
-  Please refer to their repo and check the evaluation framework's compatibility.
-  For lm-eval-harness, you could just call
-  ~~~bash
-  lm_eval --model hf --model_args pretrained="your_model_path" --device cuda:0 --tasks lambada_openai --batch_size 16
-  ~~~
-  Multiple gpu evaluation
-  ~~~bash
-  CUDA_VISIBLE_DEVICES=0,1 lm_eval --model hf --model_args pretrained="your_model_path",parallelize=True --tasks lambada_openai --batch_size 16
-  ~~~
+AutoRound leverages [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) for evaluation. If `--tasks` is not specified, a set of default tasks (typically 10+ common benchmarks) will be automatically used.
 
+### Single GPU Evaluation
+
+**HF Backend (default):**
+```bash
+auto-round --model Qwen/Qwen3-0.6B --bits 4 --format "auto_round,auto_gptq" --tasks mmlu
+```
+
+**vLLM Backend:**
+```bash
+auto-round --model Qwen/Qwen3-0.6B --bits 4 --format "auto_round,auto_gptq" --tasks mmlu --eval_backend vllm
+```
+
+### Multi-GPU Evaluation
+
+**HF Backend:**
+```bash
+auto-round --model="your_model_path" --eval --device_map 0,1 --tasks lambada_openai --eval_bs 16
+```
+
+**vLLM Backend (Option 1 - using --device_map):**
+```bash
+auto-round "your_model_path" --eval --device_map 0,1 --tasks lambada_openai --eval_backend vllm
+```
+
+**vLLM Backend (Option 2 - manual configuration):**
+```bash
+CUDA_VISIBLE_DEVICES=0,1 auto-round "your_model_path" --eval --tasks lambada_openai --eval_backend vllm --vllm_args="tensor_parallel_size=2,gpu_memory_utilization=0.8"
+```
+
+### Important Notes
+
+- Use the `--eval` flag to evaluate models directly. This supports both original and quantized models.
+- The `--eval_task_by_task` option helps handle task failures by evaluating tasks sequentially. This only applies to the HF backend.
+- When multiple formats are exported, the last format in the list will be used for evaluation.
+- For the vLLM backend, you can use `--device_map 0,1` to specify GPU devices. This automatically sets `CUDA_VISIBLE_DEVICES` and configures `tensor_parallel_size` based on the number of devices. Alternatively, you can set these manually via the `CUDA_VISIBLE_DEVICES` environment variable and `--vllm_args`.
 
 
 ## 6 Known Issues
diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py
index 361f1bdf9..c54a57bd3 100644
--- a/test/test_ark/test_model.py
+++ b/test/test_ark/test_model.py
@@ -3,9 +3,9 @@
 
 import pytest
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 from auto_round.eval.evaluation import simple_evaluate_user_model
 
 from ..helpers import get_model_path, model_infer
diff --git a/test/test_cpu/advanced/test_evaluation_functions.py b/test/test_cpu/advanced/test_evaluation_functions.py
new file mode 100644
index 000000000..9cc99311d
--- /dev/null
+++ b/test/test_cpu/advanced/test_evaluation_functions.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+CPU tests for evaluation utility functions.
+Lightweight tests focusing on key utility functions without heavy model loading.
+
+Run with: pytest test/test_cpu/advanced/test_evaluation_functions.py
+"""
+
+import os
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+
+class TestParseVllmArgs:
+    """Test parse_vllm_args function for parsing custom vllm arguments."""
+
+    def test_parse_vllm_args_empty(self):
+        """Test parsing empty vllm_args."""
+        from auto_round.eval.eval_cli import parse_vllm_args
+
+        result = parse_vllm_args(None)
+        assert result == {}
+
+        result = parse_vllm_args("")
+        assert result == {}
+
+    def test_parse_vllm_args_integers(self):
+        """Test parsing integer arguments."""
+        from auto_round.eval.eval_cli import parse_vllm_args
+
+        result = parse_vllm_args("--tensor_parallel_size=2,--max_model_len=4096")
+        assert result == {"tensor_parallel_size": 2, "max_model_len": 4096}
+        assert isinstance(result["tensor_parallel_size"], int)
+        assert isinstance(result["max_model_len"], int)
+
+    def test_parse_vllm_args_floats(self):
+        """Test parsing float arguments."""
+        from auto_round.eval.eval_cli import parse_vllm_args
+
+        result = parse_vllm_args("--gpu_memory_utilization=0.9,--swap_space=4.5")
+        assert result == {"gpu_memory_utilization": 0.9, "swap_space": 4.5}
+        assert isinstance(result["gpu_memory_utilization"], float)
+        assert isinstance(result["swap_space"], float)
+
+    def test_parse_vllm_args_booleans(self):
+        """Test parsing boolean arguments."""
+        from auto_round.eval.eval_cli import parse_vllm_args
+
+        result = parse_vllm_args("--trust_remote_code=true,--enable_lora=false")
+        assert result == {"trust_remote_code": True, "enable_lora": False}
+        assert isinstance(result["trust_remote_code"], bool)
+        assert isinstance(result["enable_lora"], bool)
+
+    def test_parse_vllm_args_strings(self):
+        """Test parsing string arguments."""
+        from auto_round.eval.eval_cli import parse_vllm_args
+
+        result = parse_vllm_args("--tokenizer_mode=auto,--quantization=awq")
+        assert result == {"tokenizer_mode": "auto", "quantization": "awq"}
+        assert isinstance(result["tokenizer_mode"], str)
+        assert isinstance(result["quantization"], str)
+
+    def test_parse_vllm_args_mixed_types(self):
+        """Test parsing mixed type arguments."""
+        from auto_round.eval.eval_cli import parse_vllm_args
+
+        result = parse_vllm_args(
+            "--tensor_parallel_size=2,--gpu_memory_utilization=0.9,--trust_remote_code=true,--tokenizer_mode=auto"
+        )
+        assert result == {
+            "tensor_parallel_size": 2,
+            "gpu_memory_utilization": 0.9,
+            "trust_remote_code": True,
+            "tokenizer_mode": "auto",
+        }
+
+    def test_parse_vllm_args_without_double_dash(self):
+        """Test parsing arguments without leading '--'."""
+        from auto_round.eval.eval_cli import parse_vllm_args
+
+        result = parse_vllm_args("tensor_parallel_size=2,max_model_len=4096")
+        assert result == {"tensor_parallel_size": 2, "max_model_len": 4096}
+
+
+class TestLoadGgufModelIfNeeded:
+    """Test _load_gguf_model_if_needed function for GGUF model detection and loading."""
+
+    def test_load_gguf_model_non_gguf_string_path(self):
+        """Test with non-GGUF model path (string)."""
+        from auto_round.eval.eval_cli import _load_gguf_model_if_needed
+
+        model_path = "/path/to/regular/model"
+        model, tokenizer, is_gguf, gguf_file = _load_gguf_model_if_needed(model_path)
+
+        assert model == model_path
+        assert tokenizer is None
+        assert is_gguf is False
+        assert gguf_file is None
+
+    def test_load_gguf_model_non_string_model(self, tiny_opt_model_path):
+        """Test with the tiny OPT model path fixture, which is not a GGUF model."""
+        from auto_round.eval.eval_cli import _load_gguf_model_if_needed
+
+        model, tokenizer, is_gguf, gguf_file = _load_gguf_model_if_needed(tiny_opt_model_path)
+        assert tokenizer is None
+        assert is_gguf is False
+        assert gguf_file is None
diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py
index 5c70f7e99..ce4bc0049 100644
--- a/test/test_cpu/backends/test_torch_backend.py
+++ b/test/test_cpu/backends/test_torch_backend.py
@@ -2,9 +2,9 @@
 
 import pytest
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 from auto_round.eval.evaluation import simple_evaluate_user_model
 from auto_round.testing_utils import require_autogptq, require_gptqmodel
 
diff --git a/test/test_cpu/schemes/test_auto_scheme.py b/test/test_cpu/schemes/test_auto_scheme.py
index 9d549076f..9bd362bf3 100644
--- a/test/test_cpu/schemes/test_auto_scheme.py
+++ b/test/test_cpu/schemes/test_auto_scheme.py
@@ -1,8 +1,9 @@
 import shutil
 
 import pytest
+from transformers import AutoRoundConfig
 
-from auto_round import AutoRound, AutoRoundConfig, AutoScheme
+from auto_round import AutoRound, AutoScheme
 
 
 class TestAutoScheme:
diff --git a/test/test_cuda/advanced/test_evaluation.py b/test/test_cuda/advanced/test_evaluation.py
new file mode 100644
index 000000000..8f67633ab
--- /dev/null
+++ b/test/test_cuda/advanced/test_evaluation.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+GPU tests for the auto-round CLI evaluation flow with the vLLM and HF backends.
+Each test invokes `python -m auto_round` via os.system, including custom --vllm_args.
+Only the process return code is checked; accuracy values are not validated.
+
+Run with: pytest test/test_cuda/advanced/test_evaluation.py -v
+"""
+
+import os
+import sys
+
+import pytest
+
+from ...helpers import opt_name_or_path
+
+# Test models for vllm evaluation
+VLLM_EVAL_MODELS = [
+    "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc",  # auto_round:auto_gptq format
+]
+
+
+@pytest.mark.skipif(
+    not os.path.exists("/usr/bin/nvidia-smi") and not os.path.exists("/usr/local/cuda"), reason="CUDA not available"
+)
+class TestVllmEvaluation:
+    """Test VLLM backend evaluation functionality."""
+
+    @pytest.mark.parametrize("model", VLLM_EVAL_MODELS)
+    def test_vllm_backend_with_custom_args(self, model):
+        """Test vllm backend evaluation with custom vllm_args parameter."""
+        python_path = sys.executable
+
+        os.environ["VLLM_SKIP_WARMUP"] = "true"
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+        # Test with custom vllm_args
+        cmd = f"{python_path} -m auto_round --model {model} --eval --tasks lambada_openai --eval_bs 128 --eval_backend vllm --limit 100 --vllm_args tensor_parallel_size=1,gpu_memory_utilization=0.6,max_model_len=2048"
+
+        ret = os.system(cmd)
+
+        assert ret == 0, f"vllm evaluation with custom args failed (rc={ret})"
+
+    def test_vllm_backend_with_quantization_iters_0(self):
+        """Test vllm evaluation with iters=0 (quantization without fine-tuning)."""
+        python_path = sys.executable
+
+        os.environ["VLLM_SKIP_WARMUP"] = "true"
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+        cmd = f"{python_path} -m auto_round --model {opt_name_or_path} --iters 0 --disable_opt_rtn --tasks lambada_openai --eval_bs 8 --eval_backend vllm --limit 100"
+
+        ret = os.system(cmd)
+
+        assert ret == 0, f"vllm evaluation with iters=0 failed (rc={ret})"
+
+
+@pytest.mark.skipif(
+    not os.path.exists("/usr/bin/nvidia-smi") and not os.path.exists("/usr/local/cuda"), reason="CUDA not available"
+)
+class TestHFEvaluation:
+    """Test default HF-backend evaluation: --eval, iters=0 quantization, and --eval_task_by_task."""
+
+    @pytest.mark.parametrize("model", VLLM_EVAL_MODELS)
+    def test_eval_mode_hf_backend(self, model):
+        """Test --eval flag: evaluate model without quantization (HF backend default)."""
+        python_path = sys.executable
+
+        cmd = f"{python_path} -m auto_round --model {model} --eval --tasks lambada_openai --limit 100"
+
+        ret = os.system(cmd)
+
+        assert ret == 0, f"HF backend evaluation failed (rc={ret})"
+
+    def test_iters_0_hf_backend(self, tiny_opt_model_path):
+        """Test quantization with iters=0 and HF backend evaluation."""
+        python_path = sys.executable
+
+        cmd = f"{python_path} -m auto_round --model {tiny_opt_model_path} --iters 0 --disable_opt_rtn --tasks lambada_openai --limit 10"
+
+        ret = os.system(cmd)
+
+        assert ret == 0, f"HF backend with iters=0 failed (rc={ret})"
+
+    def test_iters_0_task_by_task(self, tiny_opt_model_path):
+        """Test quantization with iters=0 and task-by-task evaluation."""
+        python_path = sys.executable
+
+        cmd = f"{python_path} -m auto_round --model {tiny_opt_model_path} --iters 0 --disable_opt_rtn --eval_task_by_task --tasks lambada_openai,piqa --limit 10"
+
+        ret = os.system(cmd)
+
+        assert ret == 0, f"Task-by-task with iters=0 failed (rc={ret})"
diff --git a/test/test_cuda/advanced/test_multiple_card.py b/test/test_cuda/advanced/test_multiple_card.py
index 00863c2eb..a8c008a73 100644
--- a/test/test_cuda/advanced/test_multiple_card.py
+++ b/test/test_cuda/advanced/test_multiple_card.py
@@ -297,7 +297,7 @@ def test_device_map_for_triton(self):
         device_map1["model.norm"] = "cuda"
         device_map1["model.rotary_emb"] = "cuda"
         device_map1["model.embed_tokens"] = "cuda"
-        from auto_round import AutoRoundConfig
+        from transformers import AutoRoundConfig
 
         quantization_config = AutoRoundConfig(backend="tritonv2")
 
diff --git a/test/test_cuda/backends/test_exllamav2_backend.py b/test/test_cuda/backends/test_exllamav2_backend.py
index 8d20af99d..d31ac4a2f 100644
--- a/test/test_cuda/backends/test_exllamav2_backend.py
+++ b/test/test_cuda/backends/test_exllamav2_backend.py
@@ -2,9 +2,11 @@
 
 import pytest
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import (
+    AutoRound,
+)
 from auto_round.eval.evaluation import simple_evaluate_user_model
 from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut
 
diff --git a/test/test_cuda/backends/test_marlin_backend.py b/test/test_cuda/backends/test_marlin_backend.py
index 793fe3bca..0fcce5e22 100644
--- a/test/test_cuda/backends/test_marlin_backend.py
+++ b/test/test_cuda/backends/test_marlin_backend.py
@@ -2,9 +2,9 @@
 
 import pytest
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 from auto_round.eval.evaluation import simple_evaluate_user_model
 
 from ...helpers import get_model_path, model_infer
diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py
index 4594667d9..28df641a3 100644
--- a/test/test_cuda/backends/test_torch_backend.py
+++ b/test/test_cuda/backends/test_torch_backend.py
@@ -2,9 +2,9 @@
 
 import pytest
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 from auto_round.eval.evaluation import simple_evaluate_user_model
 from auto_round.testing_utils import require_autogptq, require_gptqmodel
 
diff --git a/test/test_cuda/backends/test_triton_backend.py b/test/test_cuda/backends/test_triton_backend.py
index f51e8aeba..fb67ad049 100644
--- a/test/test_cuda/backends/test_triton_backend.py
+++ b/test/test_cuda/backends/test_triton_backend.py
@@ -2,9 +2,9 @@
 
 import pytest
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 from auto_round.eval.evaluation import simple_evaluate_user_model
 from auto_round.testing_utils import require_greater_than_050
 
diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py
index a2753605b..945a3d653 100644
--- a/test/test_cuda/export/test_auto_round_format.py
+++ b/test/test_cuda/export/test_auto_round_format.py
@@ -4,9 +4,9 @@
 import pytest
 import torch
 import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 from auto_round.eval.evaluation import simple_evaluate_user_model
 from auto_round.testing_utils import (
     require_autogptq,
diff --git a/test/test_cuda/export/test_export.py b/test/test_cuda/export/test_export.py
index efd519a51..acbe19482 100644
--- a/test/test_cuda/export/test_export.py
+++ b/test/test_cuda/export/test_export.py
@@ -124,7 +124,7 @@ def test_autogptq_format_qsave_ignore_layers(self, dataloader):
         inputs = tokenizer(text, return_tensors="pt").to(model.device)
         res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
         print(res)
-        from auto_round import AutoRoundConfig
+        from transformers import AutoRoundConfig
 
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", trust_remote_code=True, quantization_config=AutoRoundConfig()
@@ -219,7 +219,7 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader):
         )
         quantized_model_path = "./saved/test_export"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq")
-        from auto_round import AutoRoundConfig
+        from transformers import AutoRoundConfig
 
         model = AutoModelForCausalLM.from_pretrained(
             quantized_model_path, device_map="auto", quantization_config=AutoRoundConfig()
@@ -256,7 +256,7 @@ def test_autoround_3bit_asym_torch_format(self, tiny_opt_model_path, dataloader)
         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round:gptqmodel")
 
         device = "auto"  ##cpu, hpu, cuda
-        from auto_round import AutoRoundConfig
+        from transformers import AutoRoundConfig
 
         model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map=device)
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
@@ -282,7 +282,7 @@ def test_autoround_3bit_sym_torch_format(self, tiny_opt_model_path, dataloader):
         autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round")
 
         device = "auto"  ##cpu, hpu, cuda
-        from auto_round import AutoRoundConfig
+        from transformers import AutoRoundConfig
 
         quantization_config = AutoRoundConfig(backend=device)
         model = AutoModelForCausalLM.from_pretrained(
diff --git a/test/test_cuda/integrations/test_vllm.py b/test/test_cuda/integrations/test_vllm.py
index a653ced16..686640573 100644
--- a/test/test_cuda/integrations/test_vllm.py
+++ b/test/test_cuda/integrations/test_vllm.py
@@ -7,10 +7,6 @@
 Run `pytest test/test_cuda/test_vllm.py`.
 """
 
-import os
-import shutil
-import subprocess
-
 import pytest
 from vllm import LLM, SamplingParams
 from vllm.platforms import current_platform
@@ -36,7 +32,13 @@ def test_auto_round(model):
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
     # Create an LLM.
     QUANTIZATION = "auto-round"
-    llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
+    llm = LLM(
+        model=model,
+        quantization=QUANTIZATION,
+        trust_remote_code=True,
+        tensor_parallel_size=1,
+        allow_deprecated_quantization=True,
+    )
     # Generate texts from the prompts.
     # The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.
@@ -47,31 +49,3 @@ def test_auto_round(model):
         generated_text = output.outputs[0].text
         if "France" in prompt:
             assert "Paris" in generated_text
-
-
-@pytest.mark.parametrize("model", MODELS)
-def test_vllm_lm_eval(model):
-    if shutil.which("auto-round") is None:
-        pytest.skip("auto-round CLI not available")
-
-    env = os.environ.copy()
-    env["VLLM_SKIP_WARMUP"] = "true"
-    env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-
-    cmd = [
-        "auto-round",
-        "--model",
-        model,
-        "--eval",
-        "--tasks",
-        "lambada_openai",
-        "--eval_bs",
-        "8",
-        "--eval_backend",
-        "vllm",
-        "--limit",
-        "10",
-    ]
-
-    proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
-    assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}"
diff --git a/test/test_cuda/models/test_conv1d.py b/test/test_cuda/models/test_conv1d.py
index 89b82a319..8c1f654a3 100644
--- a/test/test_cuda/models/test_conv1d.py
+++ b/test/test_cuda/models/test_conv1d.py
@@ -36,7 +36,7 @@ def test_quant(self, dataloader):
         model = get_tiny_model(model_name)
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         bits, group_size, sym = 4, 128, True
-        from auto_round import AutoRoundConfig
+        from transformers import AutoRoundConfig
 
         autoround = AutoRound(
             model,
diff --git a/test/test_cuda/models/test_support_vlms.py b/test/test_cuda/models/test_support_vlms.py
index a339ba69a..cbe0ca19b 100644
--- a/test/test_cuda/models/test_support_vlms.py
+++ b/test/test_cuda/models/test_support_vlms.py
@@ -5,8 +5,8 @@
 import pytest
 import requests
 from PIL import Image
+from transformers import AutoRoundConfig  # # must import for auto-round format
 
-from auto_round import AutoRoundConfig  # # must import for auto-round format
 from auto_round.testing_utils import require_gptqmodel, require_package_version_ut, require_vlm_env
 
 AUTO_ROUND_PATH = __file__.split("/")
diff --git a/test/test_cuda/models/test_vlms.py b/test/test_cuda/models/test_vlms.py
index c8a4adb53..087102da0 100644
--- a/test/test_cuda/models/test_vlms.py
+++ b/test/test_cuda/models/test_vlms.py
@@ -6,8 +6,8 @@
 import pytest
 import requests
 from PIL import Image
+from transformers import AutoRoundConfig
 
-from auto_round import AutoRoundConfig
 from auto_round.testing_utils import require_gptqmodel, require_optimum, require_vlm_env
 
 
@@ -23,7 +23,7 @@ def teardown_class(self):
 
     # def test_vision_generation(self):
     #     quantized_model_path = "OPEA/Phi-3.5-vision-instruct-qvision-int4-sym-inc"
-    #     from auto_round import AutoRoundConfig
+    #     from transformers import AutoRoundConfig
     #     device = "auto"  ##cpu, hpu, cuda
     #     quantization_config = AutoRoundConfig(
     #         backend=device
diff --git a/test/test_cuda/quantization/test_2_3bits.py b/test/test_cuda/quantization/test_2_3bits.py
index 12ed75faa..c03d5e593 100644
--- a/test/test_cuda/quantization/test_2_3bits.py
+++ b/test/test_cuda/quantization/test_2_3bits.py
@@ -6,9 +6,9 @@
 import torch
 import transformers
 from lm_eval.utils import make_table  # pylint: disable=E0401
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model
 from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051
 
diff --git a/test/test_cuda/schemes/test_auto_scheme.py b/test/test_cuda/schemes/test_auto_scheme.py
index c7aafe8cc..b91c98428 100644
--- a/test/test_cuda/schemes/test_auto_scheme.py
+++ b/test/test_cuda/schemes/test_auto_scheme.py
@@ -4,8 +4,9 @@
 
 import pytest
 import transformers
+from transformers import AutoRoundConfig
 
-from auto_round import AutoRound, AutoRoundConfig, AutoScheme
+from auto_round import AutoRound, AutoScheme
 from auto_round.auto_scheme.utils import compute_avg_bits_for_model
 from auto_round.eval.evaluation import simple_evaluate
 from auto_round.testing_utils import multi_card
diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py
index 16f42bda5..a29bffdac 100644
--- a/test/test_cuda/utils/test_alg_ext.py
+++ b/test/test_cuda/utils/test_alg_ext.py
@@ -3,9 +3,9 @@
 
 import pytest
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 from auto_round.eval.evaluation import simple_evaluate_user_model
 
 from ...helpers import get_model_path
diff --git a/test/test_cuda/utils/test_customized_data.py b/test/test_cuda/utils/test_customized_data.py
index d2264ae56..e9ce2207a 100644
--- a/test/test_cuda/utils/test_customized_data.py
+++ b/test/test_cuda/utils/test_customized_data.py
@@ -5,9 +5,9 @@
 import unittest
 
 sys.path.insert(0, "../..")
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 
 
 class TestCustomizedData(unittest.TestCase):
diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py
index 6c9d65c83..fb0dbe1ae 100644
--- a/test/test_xpu/test_autoround.py
+++ b/test/test_xpu/test_autoround.py
@@ -4,9 +4,9 @@
 import pytest
 import torch
 import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
-from auto_round import AutoRound, AutoRoundConfig
+from auto_round import AutoRound
 
 from ..helpers import get_model_path