diff --git a/auto_round/__init__.py b/auto_round/__init__.py index 87c70e06a..509885b33 100644 --- a/auto_round/__init__.py +++ b/auto_round/__init__.py @@ -22,18 +22,4 @@ monkey_patch() - -def __getattr__(name): - if name == "AutoHfQuantizer": - from auto_round.inference.auto_quantizer import AutoHfQuantizer - - return AutoHfQuantizer - if name == "AutoRoundConfig": - from auto_round.inference.auto_quantizer import AutoRoundConfig - - return AutoRoundConfig - - raise AttributeError(f"auto-round has no attribute '{name}'") - - from .version import __version__ diff --git a/auto_round/__main__.py b/auto_round/__main__.py index 1540d254d..5b9616f65 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -17,7 +17,8 @@ from auto_round.auto_scheme import AutoScheme from auto_round.compressors import BaseCompressor -from auto_round.eval.eval_cli import EvalArgumentParser, _eval_init, eval, eval_task_by_task +from auto_round.eval.eval_cli import EvalArgumentParser, eval, eval_task_by_task +from auto_round.eval.evaluation import run_model_evaluation from auto_round.schemes import PRESET_SCHEMES from auto_round.utils import ( clear_memory, @@ -383,6 +384,20 @@ def __init__(self, *args, **kwargs): eval_args.add_argument( "--eval_task_by_task", action="store_true", help="Evaluate tasks sequentially instead of batching. " ) + eval_args.add_argument( + "--eval_backend", + default="hf", + type=str, + choices=["hf", "vllm"], + help="Backend to use for model evaluation. Use hf backend for evaluation by default.", + ) + eval_args.add_argument( + "--vllm_args", + default=None, + type=str, + help="(for vllm) Custom vllm arguments in format: '--arg1=value1,--arg2=value2'. 
" + "Example: '--tensor_parallel_size=2,--gpu_memory_utilization=0.9'", + ) eval_args.add_argument( "--eval_model_dtype", default=None, @@ -703,185 +718,15 @@ def tune(args): suffix = f"g{autoround.group_size}" export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{autoround.bits}{suffix}") + # ======================= Quantize and save model ======================= model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101 tokenizer = autoround.tokenizer # pylint: disable=E1101 model.eval() clear_memory() - eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") - - # diffusion model has different evaluation path - if getattr(autoround, "diffusion", False): - pipe = autoround.pipe - pipe.to(model.dtype) - pipe.transformer = model - device_str = detect_device(device_str) - pipe = pipe.to(device_str) - if pipe.dtype != eval_model_dtype and eval_model_dtype != "auto": - pipe.to(getattr(torch, eval_model_dtype)) - - gen_kwargs = { - "guidance_scale": args.guidance_scale, - "output_type": "pil", - "num_inference_steps": args.num_inference_steps, - "generator": ( - None - if args.generator_seed is None - else torch.Generator(device=pipe.device).manual_seed(args.generator_seed) - ), - } - if not os.path.exists(args.image_save_dir): - os.makedirs(args.image_save_dir) - - if args.prompt is not None: - outputs = pipe(prompt=args.prompt, **gen_kwargs) - outputs.images[0].save(os.path.join(args.image_save_dir, "img.png")) - logger.info( - f"Image generated with prompt {args.prompt} is saved as {os.path.join(args.image_save_dir, 'img.png')}" - ) - - if args.prompt_file is not None: - from auto_round.compressors.diffusion import diffusion_eval - - metrics = args.metrics.split(",") - diffusion_eval(pipe, args.prompt_file, metrics, args.image_save_dir, 1, gen_kwargs) - return - - lm_eval_version = get_library_version("lm-eval") - - eval_folder = folders[-1] - if args.tasks is None or args.tasks == "" or 
eval_folder is None: - return - - tasks = args.tasks - if isinstance(tasks, str): - tasks = tasks.split(",") - - from lm_eval.utils import make_table # pylint: disable=E0401 - - logger.info(f"Using lm-eval version {lm_eval_version}") - eval_gguf_model = False - for file in os.listdir(eval_folder): - if file.endswith("gguf"): - eval_gguf_model = True - break - - import time - - if "llama" in args.model.lower() and not args.add_bos_token: - logger.warning("set add_bos_token=True for llama model.") - args.add_bos_token = True - if (autoround.act_bits <= 8 and formats[-1] == "fake") or eval_gguf_model: - if eval_gguf_model: - # for file in os.listdir(eval_folder): - # gguf_file = file - gguf_file = None - gguf_format = None # Initialize gguf_format to None - # gguf folder only contains one file - for format in formats: - if format.startswith("gguf"): - gguf_format = format.split(":")[-1].upper() - if gguf_format is None: # Validate gguf_format after the loop - logger.error("No valid gguf format found in formats. Please check the input.") - sys.exit(-1) - for file in os.listdir(eval_folder): - if gguf_format in file: - gguf_file = file - - logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.") - if eval_model_dtype == "float32" or eval_model_dtype == "auto": - logger.warning( - "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model," - " but may affect accuracy." 
- ) - if gguf_file is None: - logger.error("Cannot find correct gguf file for evaluation, please check.") - sys.exit(-1) - model = AutoModelForCausalLM.from_pretrained( - eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype - ) - model.eval() - tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file) - else: - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: - from accelerate.big_modeling import dispatch_model - - dispatch_model(model, model.hf_device_map) - else: - device_str = detect_device(device_str) - model = model.to(device_str) - if model.dtype != eval_model_dtype and eval_model_dtype != "auto": - model.to(getattr(torch, eval_model_dtype)) - - if args.eval_task_by_task: - eval_task_by_task( - model, - tokenizer=tokenizer, - device=device_str, - tasks=args.tasks, - limit=args.limit, - batch_size=args.eval_bs, - eval_model_dtype=eval_model_dtype, - add_bos_token=args.add_bos_token, - ) - else: - if args.eval_bs is None or args.eval_bs == "auto": - logger.warning("This API does not support auto currently, reset eval_bs to 16") - args.eval_bs = 16 - from auto_round.eval.evaluation import simple_evaluate_user_model - - st = time.time() - - res = simple_evaluate_user_model( - model, - tokenizer, - tasks=tasks, - batch_size=args.eval_bs, - limit=args.limit, - device=device_str, - eval_model_dtype=eval_model_dtype, - add_bos_token=args.add_bos_token, - ) - print(make_table(res)) - print("evaluation running time=%ds" % (time.time() - st)) - else: - if args.eval_task_by_task: - eval_task_by_task( - eval_folder, - device=device_str, - tasks=args.tasks, - batch_size=args.eval_bs, - limit=args.limit, - eval_model_dtype=eval_model_dtype, - mllm=autoround.mllm, # pylint: disable=E1101 - add_bos_token=args.add_bos_token, - ) - else: - from auto_round.eval.evaluation import simple_evaluate - - tasks, model_args, device_str = _eval_init( - args.tasks, eval_folder, args.device_map, 
args.disable_trust_remote_code, dtype=eval_model_dtype - ) - st = time.time() - model_args += f",add_bos_token={args.add_bos_token}" - if autoround.mllm: # pylint: disable=E1101 - model_type = "hf-multimodal" - if args.eval_bs is None or args.eval_bs == "auto": - logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16") - args.eval_bs = 16 - else: - model_type = "hf" - res = simple_evaluate( - model=model_type, - model_args=model_args, - tasks=tasks, - device=device_str, - batch_size=args.eval_bs, - limit=args.limit, - ) - print(make_table(res)) - print("evaluation running time=%ds" % (time.time() - st)) + # ======================= Model evaluation ======================= + run_model_evaluation(model, tokenizer, autoround, folders, formats, device_str, args) def setup_eval_parser(): @@ -909,6 +754,7 @@ def run_eval(): eval_task_by_task( model=args.model, device=args.device_map, + limit=args.limit, tasks=args.tasks, batch_size=args.eval_bs, trust_remote_code=not args.disable_trust_remote_code, diff --git a/auto_round/compressors/mllm/eval.py b/auto_round/compressors/mllm/eval.py index 339bb0fd7..527ea2be4 100644 --- a/auto_round/compressors/mllm/eval.py +++ b/auto_round/compressors/mllm/eval.py @@ -100,11 +100,6 @@ def mllm_eval( mode: str = "all", ignore: bool = False, ): - try: - from transformers import AutoRoundConfig - except: - from auto_round.inference.auto_quantizer import AutoHfQuantizer - model = None if data_store_dir is not None: if not os.path.exists(data_store_dir): diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py index 0e31209ad..863a9110c 100644 --- a/auto_round/eval/eval_cli.py +++ b/auto_round/eval/eval_cli.py @@ -19,8 +19,8 @@ from transformers.utils.versions import require_version from auto_round.utils import ( - clear_memory, get_device_and_parallelism, + get_device_str, get_model_dtype, set_cuda_visible_devices, ) @@ -52,10 +52,9 @@ def __init__(self, *args, **kwargs): "--devices", 
default="0", type=str, - help="the device to be used for tuning. " - "Currently, device settings support CPU, GPU, and HPU." - "The default is set to cuda:0," - "allowing for automatic detection and switch to HPU or CPU." + help="the device to be used for evaluation. " + "The default is set to 0," + "allowing for automatic detection and switch to any devices." "set --device 0,1,2 to use multiple cards.", ) @@ -105,28 +104,13 @@ def __init__(self, *args, **kwargs): help="Backend to use for model evaluation. Use hf backend for evaluation by default.", ) self.add_argument("--add_bos_token", action="store_true", help="add BOS token") - - # vllm related arguments - vllm_args = self.add_argument_group("vllm backend arguments") - vllm_args.add_argument("--revision", default=None, type=str, help="model revision for vllm") - vllm_args.add_argument("--tokenizer", default=None, type=str, help="tokenizer to use with vllm") - vllm_args.add_argument( - "--tokenizer_mode", default="auto", type=str, help="tokenizer mode for vllm (e.g. 
auto/fast/slow)" - ) - vllm_args.add_argument("--tokenizer_revision", default=None, type=str, help="tokenizer revision for vllm") - vllm_args.add_argument("--prefix_token_id", default=None, type=int, help="prefix token id for vllm") - vllm_args.add_argument("--tensor_parallel_size", default=1, type=int, help="tensor parallel size for vllm") - vllm_args.add_argument("--data_parallel_size", default=1, type=int, help="data parallel size for vllm") - vllm_args.add_argument("--quantization", default=None, type=str, help="quantization setting for vllm") - vllm_args.add_argument("--max_gen_toks", default=256, type=int, help="max generation tokens for vllm") - vllm_args.add_argument("--swap_space", default=4, type=float, help="swap space (GB) for vllm") - vllm_args.add_argument("--max_batch_size", default=None, type=int, help="max batch size for vllm") - vllm_args.add_argument("--max_length", default=None, type=int, help="max generation length for vllm") - vllm_args.add_argument("--max_model_len", default=None, type=int, help="maximum model sequence length for vllm") - vllm_args.add_argument( - "--gpu_memory_utilization", default=0.9, type=float, help="target GPU memory utilization for vllm" + self.add_argument( + "--vllm_args", + default=None, + type=str, + help="(for vllm) Custom vllm arguments in format: 'arg1=value1,arg2=value2'. 
" + "Example: 'tensor_parallel_size=2,gpu_memory_utilization=0.9'", ) - vllm_args.add_argument("--lora_local_path", default=None, type=str, help="local LoRA path for vllm") def _eval_init(tasks, model_path, device, disable_trust_remote_code=False, dtype="auto"): @@ -162,36 +146,12 @@ def eval(args): if (batch_size := args.eval_bs) is None: batch_size = "auto:8" - is_gguf_file = False - if os.path.exists(args.model): - if os.path.isfile(args.model) and args.model.endswith(".gguf"): - is_gguf_file = True - gguf_file = os.path.basename(args.model) - model = os.path.dirname(args.model) - else: - for file in os.listdir(args.model): - if file.endswith(".gguf"): - is_gguf_file = True - gguf_file = file - model = args.model - eval_model_dtype = get_model_dtype(args.eval_model_dtype) + + model, tokenizer, is_gguf_file, gguf_file = _load_gguf_model_if_needed(args.model, args.eval_model_dtype) + if is_gguf_file: - import torch from lm_eval.utils import make_table # pylint: disable=E0401 - from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model, gguf_file=gguf_file) - - logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.") - if eval_model_dtype == "float32" or eval_model_dtype == "auto": - logger.warning( - "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model," - " but may affect accuracy." 
- ) - model = AutoModelForCausalLM.from_pretrained( - model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype - ) - model.eval() st = time.time() res = simple_evaluate_user_model( model, @@ -224,6 +184,79 @@ def eval(args): print("evaluation running time=%ds" % (time.time() - st)) +def eval_with_vllm(args): + import time + + from lm_eval import evaluator # pylint: disable=E0401 + from lm_eval.models.vllm_causallms import VLLM # pylint: disable=E0401 + from lm_eval.models.vllm_vlms import VLLM_VLM # pylint: disable=E0401 + from lm_eval.utils import make_table # pylint: disable=E0401 + + st = time.time() + os.environ["TOKENIZERS_PARALLELISM"] = "false" + device_str, _ = get_device_and_parallelism(args.device_map) + eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") + if (batch_size := args.eval_bs) is None: + batch_size = "auto:8" + if isinstance(args.tasks, str): + tasks = args.tasks.split(",") + + # Parse custom vllm_args if provided + custom_vllm_kwargs = parse_vllm_args(getattr(args, "vllm_args", None)) + + # Build vllm kwargs with base parameters + vllm_kwargs = { + "pretrained": args.model, + "dtype": eval_model_dtype, + "trust_remote_code": not args.disable_trust_remote_code, + "add_bos_token": args.add_bos_token, + "device": device_str, + "batch_size": batch_size, + "allow_deprecated_quantization": True, # for vLLM==0.14.0 + } + + # Override with custom vllm_args if provided + if custom_vllm_kwargs: + from auto_round.logger import logger + + logger.info(f"Overriding VLLM parameters with custom args: {custom_vllm_kwargs}") + vllm_kwargs.update(custom_vllm_kwargs) + + device = get_device_str() + environ_mapping = { + "cuda": "CUDA_VISIBLE_DEVICES", + "xpu": "ZE_AFFINITY_MASK", + "hpu": "HABANA_VISIBLE_MODULES", + } + if "tensor_parallel_size" not in vllm_kwargs: + # Parse device_map to determine tensor_parallel_size and set CUDA_VISIBLE_DEVICES + # Only accept formats like "0" or "0,1,2" + assert device in environ_mapping, 
f"Device {device} not supported for vllm tensor parallelism." + environ_name = environ_mapping[device] + device_map = args.device_map + device_ids = [d.strip() for d in str(device_map).split(",") if d.strip().isdigit()] + if device_ids: + device_id_str = ",".join(device_ids) + os.environ[environ_name] = device_id_str + tensor_parallel_size = len(device_ids) + vllm_kwargs["tensor_parallel_size"] = tensor_parallel_size + from auto_round.logger import logger + + logger.info( + f"Set {environ_name}={os.environ[environ_name]}, " f"tensor_parallel_size={tensor_parallel_size}" + ) + + vllm_lm = VLLM_VLM(**vllm_kwargs) if args.mllm else VLLM(**vllm_kwargs) + res = evaluator.simple_evaluate( + model=vllm_lm, + tasks=tasks, + limit=args.limit, + ) + + print(make_table(res)) + print("evaluation running time=%ds" % (time.time() - st)) + + def eval_task_by_task( model, device=None, @@ -256,34 +289,17 @@ def eval_task_by_task( if batch_size is None: batch_size = "auto:8" - is_gguf_file = False + if not isinstance(model, str): parallelism = False + is_gguf_file = False + gguf_file = None else: - if os.path.isfile(model) and model.endswith(".gguf"): - is_gguf_file = True - gguf_file = os.path.basename(model) - model = os.path.dirname(model) - else: - for file in os.listdir(model): - if file.endswith(".gguf"): - is_gguf_file = True - gguf_file = file - eval_model_dtype = get_model_dtype(eval_model_dtype) - if is_gguf_file: - tokenizer = AutoTokenizer.from_pretrained(model, gguf_file=gguf_file) - logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.") - if eval_model_dtype == "float32" or eval_model_dtype == "auto": - logger.warning( - "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model," - " but may affect accuracy." 
- ) + model, tokenizer, is_gguf_file, gguf_file = _load_gguf_model_if_needed(model, eval_model_dtype) + if is_gguf_file: + parallelism = False - model = AutoModelForCausalLM.from_pretrained( - model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype - ) - model.eval() - parallelism = False + eval_model_dtype = get_model_dtype(eval_model_dtype) if mllm: if batch_size is None or batch_size == "auto": logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16") @@ -314,18 +330,93 @@ def eval_task_by_task( add_bos_token=add_bos_token, ) - if isinstance(tasks, str): - tasks = tasks.replace(" ", "").split(",") + _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry_times) + +def _load_gguf_model_if_needed(model_path, eval_model_dtype=None): + """Detect and load GGUF model if the path points to a GGUF file. + + Args: + model_path: Path to model or GGUF file + eval_model_dtype: Data type for model evaluation + + Returns: + Tuple of (model, tokenizer, is_gguf_file, gguf_file_name) + If not a GGUF file, returns (model_path, None, False, None) + """ + from auto_round.utils import logger + + is_gguf_file = False + gguf_file = None + tokenizer = None + model = model_path + + # Check if model_path is a string before processing + if isinstance(model_path, str): + if os.path.isfile(model_path) and model_path.endswith(".gguf"): + is_gguf_file = True + gguf_file = os.path.basename(model_path) + model = os.path.dirname(model_path) + elif os.path.exists(model_path): + for file in os.listdir(model_path): + if file.endswith(".gguf"): + is_gguf_file = True + gguf_file = file + break + + if is_gguf_file: + from transformers import AutoModelForCausalLM, AutoTokenizer + + eval_model_dtype = get_model_dtype(eval_model_dtype) + tokenizer = AutoTokenizer.from_pretrained(model, gguf_file=gguf_file) + + logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.") + if eval_model_dtype 
== "float32" or eval_model_dtype == "auto": + logger.warning( + "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model," + " but may affect accuracy." + ) + + model = AutoModelForCausalLM.from_pretrained( + model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype + ) + model.eval() + + return model, tokenizer, is_gguf_file, gguf_file + + +def _evaluate_tasks_with_retry(tasks, hflm, device_str, batch_size, limit, retry_times): + """Evaluate tasks with automatic retry on OOM errors. + + Args: + tasks: List of task names to evaluate + hflm: HuggingFace LM model instance + device_str: Device string for evaluation + batch_size: Batch size for evaluation + limit: Limit number of examples per task + retry_times: Number of retry attempts on failure + + Returns: + Aggregated results dictionary containing results, versions, n-shot, and higher_is_better + """ + import time + import traceback + + from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401 from lm_eval.utils import make_table # pylint: disable=E0401 + from auto_round.utils import logger + + if isinstance(tasks, str): + tasks = tasks.replace(" ", "").split(",") + res_all = {} res_keys = ["results", "versions", "n-shot", "higher_is_better"] - import time - st = time.time() + for task in tasks: - while retry_times: + current_retry_times = retry_times + while current_retry_times: try: res = lm_simple_evaluate( model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit @@ -350,7 +441,7 @@ def eval_task_by_task( logger.error(cuda_error_msg) traceback.print_exc() break - retry_times -= 1 + current_retry_times -= 1 if not res_all: res_all = res @@ -365,49 +456,63 @@ def eval_task_by_task( print("total eval time:", time.time() - st) -def eval_with_vllm(args): - import time +def parse_vllm_args(vllm_args_str): + """Parse custom vllm arguments from string format. 
- from lm_eval import evaluator # pylint: disable=E0401 - from lm_eval.models.vllm_causallms import VLLM # pylint: disable=E0401 - from lm_eval.utils import make_table # pylint: disable=E0401 + Args: + vllm_args_str: String containing vllm arguments in format: + "--arg1=value1,--arg2=value2" or "arg1=value1,arg2=value2" - st = time.time() - os.environ["TOKENIZERS_PARALLELISM"] = "false" - device_str, _ = get_device_and_parallelism(args.device_map) - eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") - if (batch_size := args.eval_bs) is None: - batch_size = "auto:8" + Returns: + Dictionary of parsed arguments with appropriate types (int, float, bool, or string) - vllm_lm = VLLM( - pretrained=args.model, - dtype=eval_model_dtype, - revision=args.revision, - trust_remote_code=not args.disable_trust_remote_code, - tokenizer=args.tokenizer, - tokenizer_mode=args.tokenizer_mode, - tokenizer_revision=args.tokenizer_revision, - add_bos_token=args.add_bos_token, - prefix_token_id=args.prefix_token_id, - tensor_parallel_size=args.tensor_parallel_size, - quantization=args.quantization, - max_gen_toks=args.max_gen_toks, - swap_space=args.swap_space, - batch_size=batch_size, - max_batch_size=args.max_batch_size, - max_length=args.max_length, - max_model_len=args.max_model_len, - seed=args.seed, - gpu_memory_utilization=args.gpu_memory_utilization, - device=device_str, - data_parallel_size=args.data_parallel_size, - lora_local_path=args.lora_local_path, - ) - res = evaluator.simple_evaluate( - model=vllm_lm, - tasks=args.tasks, - limit=args.limit, - ) + Example: + >>> parse_vllm_args("--tensor_parallel_size=2,--gpu_memory_utilization=0.9") + {'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.9} + """ + from auto_round.logger import logger - print(make_table(res)) - print("evaluation running time=%ds" % (time.time() - st)) + custom_vllm_kwargs = {} + + if not vllm_args_str: + return custom_vllm_kwargs + + logger.info(f"Parsing custom vllm arguments: 
{vllm_args_str}") + + for arg_pair in vllm_args_str.split(","): + arg_pair = arg_pair.strip() + # Normalize: replace space separator with '=' (e.g., "--arg value" -> "--arg=value") + if "=" not in arg_pair and " " in arg_pair: + parts = arg_pair.split(None, 1) # Split on whitespace, max 2 parts + if len(parts) == 2: + arg_pair = f"{parts[0]}={parts[1]}" + if "=" in arg_pair: + # Remove leading '--' if present + arg_pair = arg_pair.removeprefix("--") + key, value = arg_pair.split("=", 1) + key = key.strip() + value = value.strip() + + # Try to convert value to appropriate type + try: + # Try int first + if value.isdigit() or (value.startswith("-") and value[1:].isdigit()): + custom_vllm_kwargs[key] = int(value) + # Try float + elif "." in value: + custom_vllm_kwargs[key] = float(value) + # Try boolean + elif value.lower() in ("true", "false"): + custom_vllm_kwargs[key] = value.lower() == "true" + # Keep as string + else: + custom_vllm_kwargs[key] = value + logger.info( + f" Parsed vllm arg: {key}={custom_vllm_kwargs[key]}" + + f" (type: {type(custom_vllm_kwargs[key]).__name__})" + ) + except Exception as e: + logger.warning(f" Failed to parse vllm arg '{key}={value}': {e}, keeping as string") + custom_vllm_kwargs[key] = value + + return custom_vllm_kwargs diff --git a/auto_round/eval/evaluation.py b/auto_round/eval/evaluation.py index 90f489ccf..1f4df7b69 100644 --- a/auto_round/eval/evaluation.py +++ b/auto_round/eval/evaluation.py @@ -15,14 +15,10 @@ import os from typing import Optional, Union -from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401 - from auto_round.logger import logger os.environ["TOKENIZERS_PARALLELISM"] = "false" -from lm_eval.models.huggingface import HFLM # pylint: disable=E0401 - def simple_evaluate_user_model( user_model, @@ -33,8 +29,11 @@ def simple_evaluate_user_model( eval_model_dtype="auto", add_bos_token: bool = False, mllm: bool = False, - **kwargs + **kwargs, ): + from lm_eval import simple_evaluate as 
lm_simple_evaluate # pylint: disable=E0401 + from lm_eval.models.huggingface import HFLM # pylint: disable=E0401 + if mllm: from lm_eval.models.hf_vlms import HFMultimodalLM # pylint: disable=E0401 @@ -70,12 +69,9 @@ def simple_evaluate( limit: Optional[Union[int, float]] = None, max_batch_size: Optional[int] = None, device: Optional[str] = None, - **kwargs + **kwargs, ): - try: - from transformers import AutoRoundConfig - except: - from auto_round.inference.auto_quantizer import AutoHfQuantizer + from lm_eval import simple_evaluate as lm_simple_evaluate # pylint: disable=E0401 return lm_simple_evaluate( model=model, @@ -84,5 +80,357 @@ def simple_evaluate( limit=limit, max_batch_size=max_batch_size, device=device, - **kwargs + **kwargs, + ) + + +def evaluate_diffusion_model(autoround, model, args): + """ + Evaluate diffusion models. + + Args: + autoround: AutoRound instance + model: Quantized model + args: Command line arguments + """ + import torch + + from auto_round.utils import detect_device, get_model_dtype, logger + + # Prepare inference pipeline + pipe = autoround.pipe + pipe.to(model.dtype) + pipe.transformer = model + device_str = detect_device(args.device_map if hasattr(args, "device_map") else "0") + pipe = pipe.to(device_str) + + # Set evaluation dtype + eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") + if pipe.dtype != eval_model_dtype and eval_model_dtype != "auto": + pipe.to(getattr(torch, eval_model_dtype)) + + # Prepare generation kwargs + gen_kwargs = { + "guidance_scale": args.guidance_scale, + "output_type": "pil", + "num_inference_steps": args.num_inference_steps, + "generator": ( + None + if args.generator_seed is None + else torch.Generator(device=pipe.device).manual_seed(args.generator_seed) + ), + } + + # Create image save directory + if not os.path.exists(args.image_save_dir): + os.makedirs(args.image_save_dir) + + # Single prompt generation + if args.prompt is not None: + outputs = pipe(prompt=args.prompt, **gen_kwargs) 
+ save_path = os.path.join(args.image_save_dir, "img.png") + outputs.images[0].save(save_path) + logger.info(f"Image generated with prompt {args.prompt} is saved as {save_path}") + + # Batch prompt evaluation + if args.prompt_file is not None: + from auto_round.compressors.diffusion import diffusion_eval + + metrics = args.metrics.split(",") + diffusion_eval(pipe, args.prompt_file, metrics, args.image_save_dir, 1, gen_kwargs) + + +def load_gguf_model_for_eval(eval_folder, formats, args): + """ + Load GGUF model for evaluation. + + Args: + eval_folder: Path to saved model + formats: List of export formats + args: Command line arguments + + Returns: + model, tokenizer: Loaded model and tokenizer + """ + import sys + + from transformers import AutoModelForCausalLM, AutoTokenizer + + from auto_round.utils import get_model_dtype, logger + + # Find corresponding GGUF format + gguf_format = None + for format in formats: + if format.startswith("gguf"): + gguf_format = format.split(":")[-1].upper() + break + + if gguf_format is None: + logger.error("No valid gguf format found in formats. Please check the input.") + sys.exit(-1) + + # Find matching GGUF file + gguf_file = None + for file in os.listdir(eval_folder): + if gguf_format in file: + gguf_file = file + break + + if gguf_file is None: + logger.error("Cannot find correct gguf file for evaluation, please check.") + sys.exit(-1) + + # Load model and tokenizer + logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.") + eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") + + if eval_model_dtype in ["float32", "auto"]: + logger.warning( + "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model," + " but may affect accuracy." 
+ ) + + model = AutoModelForCausalLM.from_pretrained( + eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype ) + model.eval() + tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file) + + return model, tokenizer + + +def prepare_model_for_eval(model, device_str, eval_model_dtype): + """ + Prepare model for evaluation. + + Args: + model: Quantized model + device_str: Device string + eval_model_dtype: Evaluation data type + + Returns: + model: Prepared model + """ + import torch + + from auto_round.utils import detect_device + + # Handle multi-device model + if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: + from accelerate.big_modeling import dispatch_model + + dispatch_model(model, model.hf_device_map) + else: + # Single device model + device_str = detect_device(device_str) + model = model.to(device_str) + + # Convert dtype + if model.dtype != eval_model_dtype and eval_model_dtype != "auto": + model.to(getattr(torch, eval_model_dtype)) + + return model + + +def evaluate_with_model_instance(model, tokenizer, device_str, args): + """ + Evaluate with model instance. + Applicable to fake quantization and GGUF models. 
+ + Args: + model: Model instance + tokenizer: Tokenizer + device_str: Device string + args: Command line arguments + """ + import time + + from lm_eval.utils import make_table # pylint: disable=E0401 + + from auto_round.eval.eval_cli import eval_task_by_task + from auto_round.utils import get_model_dtype, logger + + tasks = args.tasks + if isinstance(tasks, str): + tasks = tasks.split(",") + + # Task-by-task evaluation + if args.eval_task_by_task: + eval_task_by_task( + model, + tokenizer=tokenizer, + device=device_str, + tasks=args.tasks, + limit=args.limit, + batch_size=args.eval_bs, + eval_model_dtype=get_model_dtype(args.eval_model_dtype, "auto"), + add_bos_token=args.add_bos_token, + ) + else: + # Batch evaluation + eval_bs = args.eval_bs + if eval_bs is None or eval_bs == "auto": + logger.warning("This API does not support auto currently, reset eval_bs to 16") + eval_bs = 16 + + st = time.time() + res = simple_evaluate_user_model( + model, + tokenizer, + tasks=tasks, + batch_size=eval_bs, + limit=args.limit, + device=device_str, + eval_model_dtype=get_model_dtype(args.eval_model_dtype, "auto"), + add_bos_token=args.add_bos_token, + ) + print(make_table(res)) + print("evaluation running time=%ds" % (time.time() - st)) + + +def evaluate_with_model_path(eval_folder, device_str, autoround, args): + """ + Evaluate with model path. + Applicable to other quantization formats. 
+ + Args: + eval_folder: Path to saved model + device_str: Device string + autoround: AutoRound instance + args: Command line arguments + """ + import time + + from lm_eval.utils import make_table # pylint: disable=E0401 + + from auto_round.eval.eval_cli import _eval_init, eval_task_by_task + from auto_round.utils import get_model_dtype, logger + + tasks = args.tasks + if isinstance(tasks, str): + tasks = tasks.split(",") + + # Task-by-task evaluation + if args.eval_task_by_task: + eval_task_by_task( + eval_folder, + device=device_str, + tasks=args.tasks, + batch_size=args.eval_bs, + limit=args.limit, + eval_model_dtype=get_model_dtype(args.eval_model_dtype, "auto"), + mllm=autoround.mllm, + add_bos_token=args.add_bos_token, + ) + else: + # Batch evaluation + tasks, model_args, device_str = _eval_init( + args.tasks, + eval_folder, + args.device_map, + args.disable_trust_remote_code, + dtype=get_model_dtype(args.eval_model_dtype, "auto"), + ) + + st = time.time() + model_args += f",add_bos_token={args.add_bos_token}" + + # Choose evaluation method based on model type + if autoround.mllm: + model_type = "hf-multimodal" + eval_bs = args.eval_bs + if eval_bs is None or eval_bs == "auto": + logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16") + eval_bs = 16 + else: + model_type = "hf" + eval_bs = args.eval_bs + + res = simple_evaluate( + model=model_type, + model_args=model_args, + tasks=tasks, + device=device_str, + batch_size=eval_bs, + limit=args.limit, + ) + print(make_table(res)) + print("evaluation running time=%ds" % (time.time() - st)) + + +def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_str, args): + """ + Run model evaluation. + Unified evaluation entry point that dispatches to different evaluation logic based on model type. 
+ + Args: + model: Quantized model + tokenizer: Tokenizer + autoround: AutoRound instance + folders: List of export folders + formats: List of export formats + device_str: Device string + args: Command line arguments + """ + from auto_round.utils import get_library_version, get_model_dtype, logger + + # Handle diffusion models separately + if getattr(autoround, "diffusion", False): + evaluate_diffusion_model(autoround, model, args) + return + + # Check if evaluation is needed for language models + eval_folder = folders[-1] if folders else None + if args.tasks is None or args.tasks == "" or eval_folder is None: + return + + # Handle vllm backend evaluation + if hasattr(args, "eval_backend") and args.eval_backend == "vllm": + from auto_round.eval.eval_cli import eval_with_vllm + + # Create a minimal args object with essential parameters + vllm_args = type("Args", (), {})() + # Required parameters + vllm_args.model = eval_folder + vllm_args.tasks = args.tasks + vllm_args.device_map = getattr(args, "device_map", device_str) + # Optional common parameters + vllm_args.eval_bs = getattr(args, "eval_bs", None) + vllm_args.mllm = getattr(args, "mllm", None) + vllm_args.limit = getattr(args, "limit", None) + vllm_args.eval_model_dtype = getattr(args, "eval_model_dtype", None) + vllm_args.disable_trust_remote_code = getattr(args, "disable_trust_remote_code", False) + vllm_args.add_bos_token = getattr(args, "add_bos_token", False) + vllm_args.seed = getattr(args, "seed", 42) + # VLLM-specific parameters + vllm_args.vllm_args = getattr(args, "vllm_args", None) + eval_with_vllm(vllm_args) + return + + lm_eval_version = get_library_version("lm-eval") + logger.info(f"Using lm-eval version {lm_eval_version}") + + # Handle Llama model special case + if "llama" in args.model.lower() and not args.add_bos_token: + logger.warning("set add_bos_token=True for llama model.") + args.add_bos_token = True + + # Check if GGUF model + eval_gguf_model = any(file.endswith("gguf") for file in 
os.listdir(eval_folder)) + + # Determine if model instance evaluation is needed + need_model_instance = (autoround.act_bits <= 8 and formats[-1] == "fake") or eval_gguf_model + + if need_model_instance: + # Load or prepare model instance + if eval_gguf_model: + model, tokenizer = load_gguf_model_for_eval(eval_folder, formats, args) + else: + eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") + model = prepare_model_for_eval(model, device_str, eval_model_dtype) + + # Evaluate with model instance + evaluate_with_model_instance(model, tokenizer, device_str, args) + else: + # Evaluate with model path + evaluate_with_model_path(eval_folder, device_str, autoround, args) diff --git a/auto_round/inference/auto_quantizer.py b/auto_round/inference/auto_quantizer.py deleted file mode 100644 index 33ab74d8d..000000000 --- a/auto_round/inference/auto_quantizer.py +++ /dev/null @@ -1,379 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. team and GPTQ and AutoGPTQ authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import importlib.util -import warnings -from dataclasses import dataclass -from enum import Enum -from logging import getLogger -from typing import Any, Dict, List, Optional, Tuple, Union - -import torch -import torch.nn as nn -from packaging import version -from transformers.modeling_utils import PreTrainedModel -from transformers.quantizers import AutoQuantizationConfig, HfQuantizer -from transformers.quantizers.auto import AUTO_QUANTIZER_MAPPING -from transformers.utils.quantization_config import AwqConfig, GPTQConfig, QuantizationConfigMixin, QuantizationMethod - -from auto_round.inference.convert_model import convert_hf_model, infer_target_device, post_init -from auto_round.utils import is_hpex_available - -logger = getLogger(__name__) -import sys - -if sys.version_info < (3, 8): - import importlib_metadata -else: - import importlib.metadata as importlib_metadata - -AUTOROUND_MINIMUM_VERSION = version.parse("0.2") - - -def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]: - # Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version - try: # TODO remove it later - import auto_round - - return True, auto_round.__version__ - except: - pass - - package_exists = importlib.util.find_spec(pkg_name) is not None - package_version = "N/A" - if package_exists: - try: - package_version = importlib.metadata.version(pkg_name) - package_exists = True - except importlib.metadata.PackageNotFoundError: - package_exists = False - if return_version: - return 
package_exists, package_version - else: - return package_exists - - -_auto_round_available = _is_package_available("auto_round") - - -def is_auto_round_available(): - try: - import auto_round - - return True - except: - pass - if _auto_round_available: - version_autoround = version.parse(importlib_metadata.version("auto_round")) - if AUTOROUND_MINIMUM_VERSION < version_autoround: - return True - else: - raise ImportError( - f"Found an incompatible version of auto-round. Found version {version_autoround}," - f" but only version above {AUTOROUND_MINIMUM_VERSION} are supported" - ) - - -class AutoHfQuantizer: - """The Auto-HF quantizer class that takes care of automatically instantiating to the correct - `HfQuantizer` given the `QuantizationConfig`.""" - - @classmethod - def from_config(cls, quantization_config: Union[QuantizationConfigMixin, Dict], **kwargs): - # Convert it to a QuantizationConfig if the q_config is a dict - if isinstance(quantization_config, dict): - if "auto-round" in quantization_config["quant_method"]: - quantization_config = AutoRoundConfig.from_dict(quantization_config) - else: - quantization_config = AutoQuantizationConfig.from_dict(quantization_config) # pylint: disable=E1101 - quant_method = quantization_config.quant_method - - # Again, we need a special care for bnb as we have a single quantization config - # class for both 4-bit and 8-bit quantization - if quant_method == QuantizationMethod.BITS_AND_BYTES: - if quantization_config.load_in_8bit: - quant_method += "_8bit" - else: - quant_method += "_4bit" - - if quant_method not in AUTO_QUANTIZER_MAPPING.keys() and "auto-round" not in quant_method: - raise ValueError( - f"Unknown quantization type, got {quant_method} - supported types are:" - f" {list(AUTO_QUANTIZER_MAPPING.keys())}" - ) - if "auto-round" in quant_method or is_hpex_available(): # pragma: no cover - target_cls = AutoRoundQuantizer - else: - target_cls = AUTO_QUANTIZER_MAPPING[quant_method] - - return 
target_cls(quantization_config, **kwargs) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - quantization_config = AutoQuantizationConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - return cls.from_config(quantization_config) - - @classmethod - def merge_quantization_configs( - cls, - quantization_config: Union[dict, QuantizationConfigMixin], - quantization_config_from_args: Optional[QuantizationConfigMixin], - ): - """Handles situations where both quantization_config - from args and quantization_config from model config are present.""" - if quantization_config_from_args is not None: - warning_msg = ( - "You passed `quantization_config` or equivalent parameters to " - "`from_pretrained` but the model you're loading" - " already has a `quantization_config` attribute. The `quantization_config` from the model will be used." - ) - else: - warning_msg = "" - if quantization_config_from_args is None or not hasattr( - quantization_config_from_args, "get_loading_attributes" - ): - # If the quantization_config_from_args is None or does not have get_loading_attributes method, - # we will not use it to load the model. 
- quantization_config_from_args = None - else: - loading_attr_dict = quantization_config_from_args.get_loading_attributes() - - if isinstance(quantization_config, dict): - if ( - "auto-round" in quantization_config["quant_method"] - or quantization_config_from_args.__class__.__name__ == "AutoRoundConfig" - ): - quantization_config = AutoRoundConfig.from_dict(quantization_config) - else: - quantization_config = AutoQuantizationConfig.from_dict(quantization_config) # pylint: disable=E1101 - - if ( - isinstance(quantization_config, (GPTQConfig, AwqConfig, AutoRoundConfig)) - and quantization_config_from_args is not None - ): - # special case for GPTQ / AWQ config collision - - for attr, val in loading_attr_dict.items(): - setattr(quantization_config, attr, val) - warning_msg += ( - f"However, loading attributes (e.g. {list(loading_attr_dict.keys())}) " - f"will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored." - ) - - if warning_msg != "": - warnings.warn(warning_msg) - - return quantization_config - - @staticmethod - def supports_quant_method(quantization_config_dict): - from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING - - AUTO_QUANTIZATION_CONFIG_MAPPING["auto-round"] = AutoRoundConfig - AUTO_QUANTIZATION_CONFIG_MAPPING["auto_round"] = AutoRoundConfig - quant_method = quantization_config_dict.get("quant_method", None) - if quantization_config_dict.get("load_in_8bit", False) or quantization_config_dict.get("load_in_4bit", False): - suffix = "_4bit" if quantization_config_dict.get("load_in_4bit", False) else "_8bit" - quant_method = QuantizationMethod.BITS_AND_BYTES + suffix - elif quant_method is None: - raise ValueError( - "The model's quantization config from the arguments has no `quant_method` attribute." 
- "Make sure that the model has been correctly quantized" - ) - - if quant_method not in AUTO_QUANTIZATION_CONFIG_MAPPING.keys(): - logger.warning( - f"Unknown quantization type, got {quant_method} - supported types are:" - f" {list(AUTO_QUANTIZER_MAPPING.keys())}. Hence, we will skip the quantization. " - "To remove the warning, you can delete the quantization_config attribute in config.json" - ) - return False - return True - - -class AutoRoundQuantizationMethod(str, Enum): - AutoRound = "auto-round" - - -@dataclass -class AutoRoundConfig(QuantizationConfigMixin): - """This is a wrapper class about all possible attributes and features that you can play with a model that has been - loaded AutoRound quantization. - - Args: - bits (`int`): - The number of bits to quantize to, supported numbers are (2, 3, 4, 8). - tokenizer (`str` or `PreTrainedTokenizerBase`, *optional*): - The tokenizer used to process the dataset. You can pass either: - - A custom tokenizer object. - - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. - - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved - using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. 
- """ - - def __init__( - self, - bits: int = 4, - tokenizer: Any = None, - dataset: str = None, - group_size: int = 128, - sym: bool = False, - backend="auto", - layer_config: dict = None, - **kwargs, - ): - - self.bits = bits - self.tokenizer = tokenizer - self.dataset = dataset - self.group_size = group_size - self.sym = sym - self.packing_format = "auto_round:auto_gptq" - self.backend = backend - self.layer_config = layer_config - if kwargs is not None: - for key in kwargs.keys(): - setattr(self, key, kwargs[key]) - self.quant_method = AutoRoundQuantizationMethod.AutoRound - self.post_init() - - def post_init(self): - r"""Safety checker that arguments are correct.""" - if self.bits not in [2, 3, 4, 8]: - raise ValueError(f"Only support quantization to [2,3,4,8] bits but found {self.bits}") - if self.group_size != -1 and self.group_size <= 0: - raise ValueError("group_size must be greater than 0 or equal to -1") - - def get_loading_attributes(self): - loading_attributes_dict = {"backend": self.backend} - return loading_attributes_dict - - def to_dict(self): - config_dict = super().to_dict() - return config_dict - - @classmethod - def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs): - quant_method = config_dict["quant_method"] - if "auto-round" not in quant_method and "gptq" not in quant_method and "awq" not in quant_method: - raise NotImplementedError( - "Failed to convert to auto_round format. Only `gptqv1`, `awq`, and `auto-round` formats are supported." - ) - - if "gptq" in quant_method and "meta" in config_dict: - raise NotImplementedError("Failed to convert gptq format to auto_round format. Only supports `gptqv1`") - - if "awq" in quant_method and config_dict.get("version", "gemm") != "gemm": - raise NotImplementedError( - "Failed to convert awq format to auto_round format. 
Only supports awq format with gemm version" - ) - - if "auto-round" not in quant_method: - config_dict["packing_format"] = f"auto_round:{quant_method}" - - return super().from_dict(config_dict, return_unused_kwargs=return_unused_kwargs, **kwargs) - - -class AutoRoundQuantizer(HfQuantizer): - """Quantizer of the AutoRound method, currently only triton and exllamav2 backend has been supported.""" - - requires_calibration = False - required_packages = ["auto_round"] - optimum_quantizer = None - - def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): - self.device_map = None - super().__init__(quantization_config, **kwargs) - - def validate_environment(self, *args, **kwargs): - self.device_map = kwargs.get("device_map", None) - if not is_auto_round_available(): - raise ImportError( - "Loading a AutoRound quantized model requires auto-round library (`pip install " "auto-round`)" - ) - else: - try: - import auto_round - - autoround_version = version.parse(auto_round.__version__) - except: - autoround_version = version.parse(importlib.metadata.version("auto_round")) - if autoround_version < version.parse("0.2.0"): - raise ImportError( - "You need a version of auto_round > 0.2.0 to use AutoRound: `pip install --upgrade " - "auto-round` or install from source" - ) - - def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": - if torch_dtype is None: - torch_dtype = torch.bfloat16 - return torch_dtype - - def post_init_model(self, model): - """Post-initialization that require device information, for example buffers initialization on device. 
- - Args: - model (`nn.Module`): - The input model - """ - - class StoreAttr(object): - pass - - model.quantize_config = StoreAttr() - - post_init(model, self.used_backends) - - def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs): - if self.pre_quantized: - target_device = infer_target_device(self.device_map) - model, used_backends = convert_hf_model(model, target_device) - self.used_backends = used_backends - - def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): - if self.pre_quantized: - self.post_init_model(model) - else: - raise NotImplementedError - - @property - def is_trainable(self, model: Optional["PreTrainedModel"] = None): - return True - - @property - def is_serializable(self): - return True - - -import transformers - -if version.parse(transformers.__version__) < version.parse("4.38.0"): - logger.error("Please upgrade transformers>=4.38.0 to support lm-head quantization") - -transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer -transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index de9ea903f..c3dcbf59c 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -31,7 +31,7 @@ import cpuinfo if TYPE_CHECKING: - from auto_quantizer import AutoRoundConfig + from transformers import AutoRoundConfig def get_cpu_manufacturer(): diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 55e64565c..b25a6888e 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -1459,6 +1459,18 @@ def log_summary(self, msg: str = "", level: str = "info"): return summary +def get_device_str(): + """Get a string representation of the automatically detected device.""" + if torch.cuda.is_available(): + return "cuda" + elif torch.xpu.is_available(): # pragma: no cover + return "xpu" + elif is_hpex_available(): # pragma: no cover + return "hpu" + else: # 
pragma: no cover + return "cpu" + + # Global singleton instance memory_monitor = MemoryMonitor() diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index cc505e976..eb168774e 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -725,7 +725,14 @@ def module_match_name_list(module, name_list): return any(name.lower() in type(module).__name__.lower() for name in name_list) if module_match_name_list( - module, ["Qwen2MoeSparseMoeBlock", "Qwen3MoeSparseMoeBlock", "DeepseekMoE", "DeepseekV2MoE", "DeepseekV3MoE"] + module, + [ + "Qwen2MoeSparseMoeBlock", + "Qwen3MoeSparseMoeBlock", + "DeepseekMoE", + "DeepseekV2MoE", + "DeepseekV3MoE", + ], ): return ["gate_proj", "down_proj", "up_proj"] elif module_match_name_list(module, ["MixtralMoeSparseMoeBlock"]): diff --git a/docs/step_by_step.md b/docs/step_by_step.md index 54c47a2f2..40fff76d9 100644 --- a/docs/step_by_step.md +++ b/docs/step_by_step.md @@ -9,7 +9,7 @@ This document presents step-by-step instructions for auto-round llm quantization + [Customized Dataset](#customized-dataset) + [Dataset operations](#dataset-operations) * [3 Quantization](#3-quantization) - + [Supported Quantization Configurations](#supported-quantization-configurations) + + [Supported Quantization Schemes](#supported-quantization-schemes) + [Supported Export Formats](#supported-export-formats) + [Hardware Compatibility](#hardware-compatibility) + [Environment Configuration](#environment-configuration) @@ -39,8 +39,9 @@ This document presents step-by-step instructions for auto-round llm quantization + [Specify Inference Backend](#specify-inference-backend) + [Convert GPTQ/AWQ to AutoRound](#convert-gptq-awq-to-autoround) * [5 Evaluation](#5-evaluation) - + [Combine evaluation with tuning](#combine-evaluation-with-tuning) - + [Eval the Quantized model](#eval-the-quantized-model) + + [Single GPU Evaluation](#single-gpu-evaluation) + + [Multi-GPU Evaluation](#multi-gpu-evaluation) + + [Important 
Notes](#important-notes) * [6 Known Issues](#6-known-issues) ## 1 Prerequisite @@ -129,7 +130,7 @@ AutoRound supports several Schemes: Besides, you could modify the `group_size`, `bits`, `sym` and many other configs you want, though there are maybe no real kernels. -### Supported export Formats +### Supported Export Formats You can use command `auto_round list format` to show all supported formats with support scheme. **AutoRound Format**: This format is well-suited for CPU, Intel GPU, CUDA and HPU devices, 2 bits, as well as mixed-precision @@ -744,47 +745,43 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=Fal ## 5 Evaluation -### Combine evaluation with tuning - -- We leverage lm-eval-harnessing for the evaluation. -If not explicitly specify '--task', the default value will be used (typically covering 10+ common tasks). - ~~~bash - auto-round --model Qwen/Qwen3-0.6B --bits 4 --format "auto_round,auto_gptq" --tasks mmlu - ~~~ - The last format will be used in evaluation if multiple formats have been exported. - - -### Eval the Quantized model - -- AutoRound format - For lm-eval-harness, you could just call - ~~~bash - auto-round --model="your_model_path" --eval --tasks lambada_openai --eval_bs 16 - ~~~ - > Note: To use the vllm backend, add `--eval_backend vllm` to the command above. Common vllm parameters are already supported, such as `--tensor_parallel_size`. - - Multiple gpu evaluation - ~~~bash - auto-round --model="your_model_path" --eval --device 0,1 --tasks lambada_openai --eval_bs 16 - ~~~ - For other evaluation framework, if the framework could support Huggingface models, typically it could support - AutoRound format, only you need to do is import the following in the beginning of your code - ~~~python - from auto_round import AutoRoundConfig - ~~~ - -- AutoGPTQ/AutoAWQ format - - Please refer to their repo and check the evaluation framework's compatibility. 
- For lm-eval-harness, you could just call - ~~~bash - lm_eval --model hf --model_args pretrained="your_model_path" --device cuda:0 --tasks lambada_openai --batch_size 16 - ~~~ - Multiple gpu evaluation - ~~~bash - CUDA_VISIBLE_DEVICES=0,1 lm_eval --model hf --model_args pretrained="your_model_path",parallelize=True --tasks lambada_openai --batch_size 16 - ~~~ +AutoRound leverages [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness) for evaluation. If `--tasks` is not specified, a set of default tasks (typically 10+ common benchmarks) will be automatically used. +### Single GPU Evaluation + +**HF Backend (default):** +```bash +auto-round --model Qwen/Qwen3-0.6B --bits 4 --format "auto_round,auto_gptq" --tasks mmlu +``` + +**vLLM Backend:** +```bash +auto-round --model Qwen/Qwen3-0.6B --bits 4 --format "auto_round,auto_gptq" --tasks mmlu --eval_backend vllm +``` + +### Multi-GPU Evaluation + +**HF Backend:** +```bash +auto-round --model="your_model_path" --eval --device_map 0,1 --tasks lambada_openai --eval_bs 16 +``` + +**vLLM Backend (Option 1 - using --device_map):** +```bash +auto-round "your_model_path" --eval --device_map 0,1 --tasks lambada_openai --eval_backend vllm +``` + +**vLLM Backend (Option 2 - manual configuration):** +```bash +CUDA_VISIBLE_DEVICES=0,1 auto-round "your_model_path" --eval --tasks lambada_openai --eval_backend vllm --vllm_args="tensor_parallel_size=2,gpu_memory_utilization=0.8" +``` + +### Important Notes + +- Use the `--eval` flag to evaluate models directly. This supports both original and quantized models. +- The `--eval_task_by_task` option helps handle task failures by evaluating tasks sequentially. This only applies to the HF backend. +- When multiple formats are exported, the last format in the list will be used for evaluation. +- For vLLM backend, you can use `--device_map 0,1,2` to specify GPU devices.
This will automatically set `CUDA_VISIBLE_DEVICES` and configure `tensor_parallel_size` based on the number of devices. Alternatively, you can manually set these via environment variables and `--vllm_args`. ## 6 Known Issues diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 361f1bdf9..c54a57bd3 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -3,9 +3,9 @@ import pytest import torch -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from ..helpers import get_model_path, model_infer diff --git a/test/test_cpu/advanced/test_evaluation_functions.py b/test/test_cpu/advanced/test_evaluation_functions.py new file mode 100644 index 000000000..9cc99311d --- /dev/null +++ b/test/test_cpu/advanced/test_evaluation_functions.py @@ -0,0 +1,121 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +CPU tests for evaluation utility functions. +Lightweight tests focusing on key utility functions without heavy model loading. 
+ +Run with: pytest test/test_cpu/advanced/test_evaluation_functions.py +""" + +import os +from unittest.mock import MagicMock, patch + +import pytest + + +class TestParseVllmArgs: + """Test parse_vllm_args function for parsing custom vllm arguments.""" + + def test_parse_vllm_args_empty(self): + """Test parsing empty vllm_args.""" + from auto_round.eval.eval_cli import parse_vllm_args + + result = parse_vllm_args(None) + assert result == {} + + result = parse_vllm_args("") + assert result == {} + + def test_parse_vllm_args_integers(self): + """Test parsing integer arguments.""" + from auto_round.eval.eval_cli import parse_vllm_args + + result = parse_vllm_args("--tensor_parallel_size=2,--max_model_len=4096") + assert result == {"tensor_parallel_size": 2, "max_model_len": 4096} + assert isinstance(result["tensor_parallel_size"], int) + assert isinstance(result["max_model_len"], int) + + def test_parse_vllm_args_floats(self): + """Test parsing float arguments.""" + from auto_round.eval.eval_cli import parse_vllm_args + + result = parse_vllm_args("--gpu_memory_utilization=0.9,--swap_space=4.5") + assert result == {"gpu_memory_utilization": 0.9, "swap_space": 4.5} + assert isinstance(result["gpu_memory_utilization"], float) + assert isinstance(result["swap_space"], float) + + def test_parse_vllm_args_booleans(self): + """Test parsing boolean arguments.""" + from auto_round.eval.eval_cli import parse_vllm_args + + result = parse_vllm_args("--trust_remote_code=true,--enable_lora=false") + assert result == {"trust_remote_code": True, "enable_lora": False} + assert isinstance(result["trust_remote_code"], bool) + assert isinstance(result["enable_lora"], bool) + + def test_parse_vllm_args_strings(self): + """Test parsing string arguments.""" + from auto_round.eval.eval_cli import parse_vllm_args + + result = parse_vllm_args("--tokenizer_mode=auto,--quantization=awq") + assert result == {"tokenizer_mode": "auto", "quantization": "awq"} + assert 
isinstance(result["tokenizer_mode"], str) + assert isinstance(result["quantization"], str) + + def test_parse_vllm_args_mixed_types(self): + """Test parsing mixed type arguments.""" + from auto_round.eval.eval_cli import parse_vllm_args + + result = parse_vllm_args( + "--tensor_parallel_size=2,--gpu_memory_utilization=0.9,--trust_remote_code=true,--tokenizer_mode=auto" + ) + assert result == { + "tensor_parallel_size": 2, + "gpu_memory_utilization": 0.9, + "trust_remote_code": True, + "tokenizer_mode": "auto", + } + + def test_parse_vllm_args_without_double_dash(self): + """Test parsing arguments without leading '--'.""" + from auto_round.eval.eval_cli import parse_vllm_args + + result = parse_vllm_args("tensor_parallel_size=2,max_model_len=4096") + assert result == {"tensor_parallel_size": 2, "max_model_len": 4096} + + +class TestLoadGgufModelIfNeeded: + """Test _load_gguf_model_if_needed function for GGUF model detection and loading.""" + + def test_load_gguf_model_non_gguf_string_path(self): + """Test with non-GGUF model path (string).""" + from auto_round.eval.eval_cli import _load_gguf_model_if_needed + + model_path = "/path/to/regular/model" + model, tokenizer, is_gguf, gguf_file = _load_gguf_model_if_needed(model_path) + + assert model == model_path + assert tokenizer is None + assert is_gguf is False + assert gguf_file is None + + def test_load_gguf_model_non_string_model(self, tiny_opt_model_path): + """Test with model object (not a string path).""" + from auto_round.eval.eval_cli import _load_gguf_model_if_needed + + model, tokenizer, is_gguf, gguf_file = _load_gguf_model_if_needed(tiny_opt_model_path) + assert tokenizer is None + assert is_gguf is False + assert gguf_file is None diff --git a/test/test_cpu/backends/test_torch_backend.py b/test/test_cpu/backends/test_torch_backend.py index 5c70f7e99..ce4bc0049 100644 --- a/test/test_cpu/backends/test_torch_backend.py +++ b/test/test_cpu/backends/test_torch_backend.py @@ -2,9 +2,9 @@ import pytest import 
torch -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel diff --git a/test/test_cpu/schemes/test_auto_scheme.py b/test/test_cpu/schemes/test_auto_scheme.py index 9d549076f..9bd362bf3 100644 --- a/test/test_cpu/schemes/test_auto_scheme.py +++ b/test/test_cpu/schemes/test_auto_scheme.py @@ -1,8 +1,9 @@ import shutil import pytest +from transformers import AutoRoundConfig -from auto_round import AutoRound, AutoRoundConfig, AutoScheme +from auto_round import AutoRound, AutoScheme class TestAutoScheme: diff --git a/test/test_cuda/advanced/test_evaluation.py b/test/test_cuda/advanced/test_evaluation.py new file mode 100644 index 000000000..8f67633ab --- /dev/null +++ b/test/test_cuda/advanced/test_evaluation.py @@ -0,0 +1,106 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +GPU tests for VLLM evaluation functionality. +Tests the eval_with_vllm function and custom vllm_args parameter parsing. +Each test only asserts that the auto_round CLI exits with status 0; no accuracy thresholds are checked.
+ +Run with: pytest test/test_cuda/advanced/test_evaluation.py -v +""" + +import os +import sys + +import pytest + +from ...helpers import opt_name_or_path + +# Test models for vllm evaluation +VLLM_EVAL_MODELS = [ + "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc", # auto_round:auto_gptq format +] + + +@pytest.mark.skipif( + not os.path.exists("/usr/bin/nvidia-smi") and not os.path.exists("/usr/local/cuda"), reason="CUDA not available" +) +class TestVllmEvaluation: + """Test VLLM backend evaluation functionality.""" + + @pytest.mark.parametrize("model", VLLM_EVAL_MODELS) + def test_vllm_backend_with_custom_args(self, model): + """Test vllm backend evaluation with custom vllm_args parameter.""" + python_path = sys.executable + + os.environ["VLLM_SKIP_WARMUP"] = "true" + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + # Test with custom vllm_args + cmd = f"{python_path} -m auto_round --model {model} --eval --tasks lambada_openai --eval_bs 128 --eval_backend vllm --limit 100 --vllm_args tensor_parallel_size=1,gpu_memory_utilization=0.6,max_model_len=2048" + + ret = os.system(cmd) + + assert ret == 0, f"vllm evaluation with custom args failed (rc={ret})" + + def test_vllm_backend_with_quantization_iters_0(self): + """Test vllm evaluation with iters=0 (quantization without fine-tuning).""" + python_path = sys.executable + + os.environ["VLLM_SKIP_WARMUP"] = "true" + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + cmd = f"{python_path} -m auto_round --model {opt_name_or_path} --iters 0 --disable_opt_rtn --tasks lambada_openai --eval_bs 8 --eval_backend vllm --limit 100" + + ret = os.system(cmd) + + assert ret == 0, f"vllm evaluation with iters=0 failed (rc={ret})" + + +@pytest.mark.skipif( + not os.path.exists("/usr/bin/nvidia-smi") and not os.path.exists("/usr/local/cuda"), reason="CUDA not available" +) +class TestHFEvaluation: + """Test different evaluation modes: --eval and --eval_backend.""" + + @pytest.mark.parametrize("model", VLLM_EVAL_MODELS) + def 
test_eval_mode_hf_backend(self, model): + """Test --eval flag: evaluate model without quantization (HF backend default).""" + python_path = sys.executable + + cmd = f"{python_path} -m auto_round --model {model} --eval --tasks lambada_openai --limit 100" + + ret = os.system(cmd) + + assert ret == 0, f"HF backend evaluation failed (rc={ret})" + + def test_iters_0_hf_backend(self, tiny_opt_model_path): + """Test quantization with iters=0 and HF backend evaluation.""" + python_path = sys.executable + + cmd = f"{python_path} -m auto_round --model {tiny_opt_model_path} --iters 0 --disable_opt_rtn --tasks lambada_openai --limit 10" + + ret = os.system(cmd) + + assert ret == 0, f"HF backend with iters=0 failed (rc={ret})" + + def test_iters_0_task_by_task(self, tiny_opt_model_path): + """Test quantization with iters=0 and task-by-task evaluation.""" + python_path = sys.executable + + cmd = f"{python_path} -m auto_round --model {tiny_opt_model_path} --iters 0 --disable_opt_rtn --eval_task_by_task --tasks lambada_openai,piqa --limit 10" + + ret = os.system(cmd) + + assert ret == 0, f"Task-by-task with iters=0 failed (rc={ret})" diff --git a/test/test_cuda/advanced/test_multiple_card.py b/test/test_cuda/advanced/test_multiple_card.py index 00863c2eb..a8c008a73 100644 --- a/test/test_cuda/advanced/test_multiple_card.py +++ b/test/test_cuda/advanced/test_multiple_card.py @@ -297,7 +297,7 @@ def test_device_map_for_triton(self): device_map1["model.norm"] = "cuda" device_map1["model.rotary_emb"] = "cuda" device_map1["model.embed_tokens"] = "cuda" - from auto_round import AutoRoundConfig + from transformers import AutoRoundConfig quantization_config = AutoRoundConfig(backend="tritonv2") diff --git a/test/test_cuda/backends/test_exllamav2_backend.py b/test/test_cuda/backends/test_exllamav2_backend.py index 8d20af99d..d31ac4a2f 100644 --- a/test/test_cuda/backends/test_exllamav2_backend.py +++ b/test/test_cuda/backends/test_exllamav2_backend.py @@ -2,9 +2,11 @@ import pytest import 
torch -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import ( + AutoRound, +) from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut diff --git a/test/test_cuda/backends/test_marlin_backend.py b/test/test_cuda/backends/test_marlin_backend.py index 793fe3bca..0fcce5e22 100644 --- a/test/test_cuda/backends/test_marlin_backend.py +++ b/test/test_cuda/backends/test_marlin_backend.py @@ -2,9 +2,9 @@ import pytest import torch -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from ...helpers import get_model_path, model_infer diff --git a/test/test_cuda/backends/test_torch_backend.py b/test/test_cuda/backends/test_torch_backend.py index 4594667d9..28df641a3 100644 --- a/test/test_cuda/backends/test_torch_backend.py +++ b/test/test_cuda/backends/test_torch_backend.py @@ -2,9 +2,9 @@ import pytest import torch -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel diff --git a/test/test_cuda/backends/test_triton_backend.py b/test/test_cuda/backends/test_triton_backend.py index f51e8aeba..fb67ad049 100644 --- a/test/test_cuda/backends/test_triton_backend.py +++ b/test/test_cuda/backends/test_triton_backend.py @@ -2,9 +2,9 @@ import pytest import torch -from 
transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_greater_than_050 diff --git a/test/test_cuda/export/test_auto_round_format.py b/test/test_cuda/export/test_auto_round_format.py index a2753605b..945a3d653 100644 --- a/test/test_cuda/export/test_auto_round_format.py +++ b/test/test_cuda/export/test_auto_round_format.py @@ -4,9 +4,9 @@ import pytest import torch import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import ( require_autogptq, diff --git a/test/test_cuda/export/test_export.py b/test/test_cuda/export/test_export.py index efd519a51..acbe19482 100644 --- a/test/test_cuda/export/test_export.py +++ b/test/test_cuda/export/test_export.py @@ -124,7 +124,7 @@ def test_autogptq_format_qsave_ignore_layers(self, dataloader): inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) - from auto_round import AutoRoundConfig + from transformers import AutoRoundConfig model = AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", trust_remote_code=True, quantization_config=AutoRoundConfig() @@ -219,7 +219,7 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader): ) quantized_model_path = "./saved/test_export" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") - from auto_round import AutoRoundConfig + from transformers import AutoRoundConfig model = 
AutoModelForCausalLM.from_pretrained( quantized_model_path, device_map="auto", quantization_config=AutoRoundConfig() @@ -256,7 +256,7 @@ def test_autoround_3bit_asym_torch_format(self, tiny_opt_model_path, dataloader) autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round:gptqmodel") device = "auto" ##cpu, hpu, cuda - from auto_round import AutoRoundConfig + from transformers import AutoRoundConfig model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map=device) tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) @@ -282,7 +282,7 @@ def test_autoround_3bit_sym_torch_format(self, tiny_opt_model_path, dataloader): autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") device = "auto" ##cpu, hpu, cuda - from auto_round import AutoRoundConfig + from transformers import AutoRoundConfig quantization_config = AutoRoundConfig(backend=device) model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cuda/integrations/test_vllm.py b/test/test_cuda/integrations/test_vllm.py index a653ced16..686640573 100644 --- a/test/test_cuda/integrations/test_vllm.py +++ b/test/test_cuda/integrations/test_vllm.py @@ -7,10 +7,6 @@ Run `pytest test/test_cuda/test_vllm.py`. """ -import os -import shutil -import subprocess - import pytest from vllm import LLM, SamplingParams from vllm.platforms import current_platform @@ -36,7 +32,13 @@ def test_auto_round(model): sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. QUANTIZATION = "auto-round" - llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1) + llm = LLM( + model=model, + quantization=QUANTIZATION, + trust_remote_code=True, + tensor_parallel_size=1, + allow_deprecated_quantization=True, + ) # Generate texts from the prompts. # The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
@@ -47,31 +49,3 @@ def test_auto_round(model): generated_text = output.outputs[0].text if "France" in prompt: assert "Paris" in generated_text - - -@pytest.mark.parametrize("model", MODELS) -def test_vllm_lm_eval(model): - if shutil.which("auto-round") is None: - pytest.skip("auto-round CLI not available") - - env = os.environ.copy() - env["VLLM_SKIP_WARMUP"] = "true" - env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - - cmd = [ - "auto-round", - "--model", - model, - "--eval", - "--tasks", - "lambada_openai", - "--eval_bs", - "8", - "--eval_backend", - "vllm", - "--limit", - "10", - ] - - proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) - assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}" diff --git a/test/test_cuda/models/test_conv1d.py b/test/test_cuda/models/test_conv1d.py index 89b82a319..8c1f654a3 100644 --- a/test/test_cuda/models/test_conv1d.py +++ b/test/test_cuda/models/test_conv1d.py @@ -36,7 +36,7 @@ def test_quant(self, dataloader): model = get_tiny_model(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True - from auto_round import AutoRoundConfig + from transformers import AutoRoundConfig autoround = AutoRound( model, diff --git a/test/test_cuda/models/test_support_vlms.py b/test/test_cuda/models/test_support_vlms.py index a339ba69a..cbe0ca19b 100644 --- a/test/test_cuda/models/test_support_vlms.py +++ b/test/test_cuda/models/test_support_vlms.py @@ -5,8 +5,8 @@ import pytest import requests from PIL import Image +from transformers import AutoRoundConfig # # must import for auto-round format -from auto_round import AutoRoundConfig # # must import for auto-round format from auto_round.testing_utils import require_gptqmodel, require_package_version_ut, require_vlm_env AUTO_ROUND_PATH = __file__.split("/") diff --git a/test/test_cuda/models/test_vlms.py b/test/test_cuda/models/test_vlms.py index 
c8a4adb53..087102da0 100644 --- a/test/test_cuda/models/test_vlms.py +++ b/test/test_cuda/models/test_vlms.py @@ -6,8 +6,8 @@ import pytest import requests from PIL import Image +from transformers import AutoRoundConfig -from auto_round import AutoRoundConfig from auto_round.testing_utils import require_gptqmodel, require_optimum, require_vlm_env @@ -23,7 +23,7 @@ def teardown_class(self): # def test_vision_generation(self): # quantized_model_path = "OPEA/Phi-3.5-vision-instruct-qvision-int4-sym-inc" - # from auto_round import AutoRoundConfig + # from transformers import AutoRoundConfig # device = "auto" ##cpu, hpu, cuda # quantization_config = AutoRoundConfig( # backend=device diff --git a/test/test_cuda/quantization/test_2_3bits.py b/test/test_cuda/quantization/test_2_3bits.py index 12ed75faa..c03d5e593 100644 --- a/test/test_cuda/quantization/test_2_3bits.py +++ b/test/test_cuda/quantization/test_2_3bits.py @@ -6,9 +6,9 @@ import torch import transformers from lm_eval.utils import make_table # pylint: disable=E0401 -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 diff --git a/test/test_cuda/schemes/test_auto_scheme.py b/test/test_cuda/schemes/test_auto_scheme.py index c7aafe8cc..b91c98428 100644 --- a/test/test_cuda/schemes/test_auto_scheme.py +++ b/test/test_cuda/schemes/test_auto_scheme.py @@ -4,8 +4,9 @@ import pytest import transformers +from transformers import AutoRoundConfig -from auto_round import AutoRound, AutoRoundConfig, AutoScheme +from auto_round import AutoRound, AutoScheme from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.eval.evaluation import 
simple_evaluate from auto_round.testing_utils import multi_card diff --git a/test/test_cuda/utils/test_alg_ext.py b/test/test_cuda/utils/test_alg_ext.py index 16f42bda5..a29bffdac 100644 --- a/test/test_cuda/utils/test_alg_ext.py +++ b/test/test_cuda/utils/test_alg_ext.py @@ -3,9 +3,9 @@ import pytest import torch -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from ...helpers import get_model_path diff --git a/test/test_cuda/utils/test_customized_data.py b/test/test_cuda/utils/test_customized_data.py index d2264ae56..e9ce2207a 100644 --- a/test/test_cuda/utils/test_customized_data.py +++ b/test/test_cuda/utils/test_customized_data.py @@ -5,9 +5,9 @@ import unittest sys.path.insert(0, "../..") -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound class TestCustomizedData(unittest.TestCase): diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 6c9d65c83..fb0dbe1ae 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -4,9 +4,9 @@ import pytest import torch import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from auto_round import AutoRound, AutoRoundConfig +from auto_round import AutoRound from ..helpers import get_model_path