|
17 | 17 |
|
18 | 18 | from auto_round.auto_scheme import AutoScheme |
19 | 19 | from auto_round.compressors import BaseCompressor |
20 | | -from auto_round.eval.eval_cli import EvalArgumentParser, _eval_init, eval, eval_task_by_task |
| 20 | +from auto_round.eval.eval_cli import EvalArgumentParser, eval, eval_task_by_task |
| 21 | +from auto_round.eval.evaluation import run_model_evaluation |
21 | 22 | from auto_round.schemes import PRESET_SCHEMES |
22 | 23 | from auto_round.utils import ( |
23 | 24 | clear_memory, |
@@ -383,6 +384,20 @@ def __init__(self, *args, **kwargs): |
383 | 384 | eval_args.add_argument( |
384 | 385 | "--eval_task_by_task", action="store_true", help="Evaluate tasks sequentially instead of batching. " |
385 | 386 | ) |
| 387 | + eval_args.add_argument( |
| 388 | + "--eval_backend", |
| 389 | + default="hf", |
| 390 | + type=str, |
| 391 | + choices=["hf", "vllm"], |
| 392 | +        help="Backend to use for model evaluation. Defaults to the hf backend.",
| 393 | + ) |
| 394 | + eval_args.add_argument( |
| 395 | + "--vllm_args", |
| 396 | + default=None, |
| 397 | + type=str, |
| 398 | +        help="(for vllm) Custom vllm arguments in the format '--arg1=value1,--arg2=value2'. "
| 399 | + "Example: '--tensor_parallel_size=2,--gpu_memory_utilization=0.9'", |
| 400 | + ) |
386 | 401 | eval_args.add_argument( |
387 | 402 | "--eval_model_dtype", |
388 | 403 | default=None, |
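
Note: this hunk only registers the `--vllm_args` flag; the diff does not show how the comma-separated string is consumed downstream. Below is a minimal sketch of one plausible parser, built around a hypothetical helper `parse_vllm_args` that is not part of this PR:

```python
import ast

def parse_vllm_args(vllm_args):
    # Hypothetical helper (not in this PR): turn
    # '--tensor_parallel_size=2,--gpu_memory_utilization=0.9'
    # into {'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.9}.
    kwargs = {}
    if not vllm_args:
        return kwargs
    for item in vllm_args.split(","):
        # strip the leading dashes, then split key from value at the first '='
        key, _, value = item.strip().lstrip("-").partition("=")
        try:
            kwargs[key] = ast.literal_eval(value)  # "2" -> 2, "0.9" -> 0.9
        except (ValueError, SyntaxError):
            kwargs[key] = value  # leave non-literal values as plain strings
    return kwargs
```

The resulting dict could then be forwarded to the vllm engine or to lm-eval's vllm model_args; the exact wiring is an assumption here, not shown in the diff.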
@@ -703,185 +718,15 @@ def tune(args): |
703 | 718 | suffix = f"g{autoround.group_size}" |
704 | 719 | export_dir = os.path.join(args.output_dir, model_name.split("/")[-1] + f"-w{autoround.bits}{suffix}") |
705 | 720 |
|
| 721 | + # ======================= Quantize and save model ======================= |
706 | 722 | model, folders = autoround.quantize_and_save(export_dir, format=args.format) # pylint: disable=E1101 |
707 | 723 | tokenizer = autoround.tokenizer # pylint: disable=E1101 |
708 | 724 |
|
709 | 725 | model.eval() |
710 | 726 | clear_memory() |
711 | 727 |
|
712 | | - eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto") |
713 | | - |
714 | | - # diffusion model has different evaluation path |
715 | | - if getattr(autoround, "diffusion", False): |
716 | | - pipe = autoround.pipe |
717 | | - pipe.to(model.dtype) |
718 | | - pipe.transformer = model |
719 | | - device_str = detect_device(device_str) |
720 | | - pipe = pipe.to(device_str) |
721 | | - if pipe.dtype != eval_model_dtype and eval_model_dtype != "auto": |
722 | | - pipe.to(getattr(torch, eval_model_dtype)) |
723 | | - |
724 | | - gen_kwargs = { |
725 | | - "guidance_scale": args.guidance_scale, |
726 | | - "output_type": "pil", |
727 | | - "num_inference_steps": args.num_inference_steps, |
728 | | - "generator": ( |
729 | | - None |
730 | | - if args.generator_seed is None |
731 | | - else torch.Generator(device=pipe.device).manual_seed(args.generator_seed) |
732 | | - ), |
733 | | - } |
734 | | - if not os.path.exists(args.image_save_dir): |
735 | | - os.makedirs(args.image_save_dir) |
736 | | - |
737 | | - if args.prompt is not None: |
738 | | - outputs = pipe(prompt=args.prompt, **gen_kwargs) |
739 | | - outputs.images[0].save(os.path.join(args.image_save_dir, "img.png")) |
740 | | - logger.info( |
741 | | - f"Image generated with prompt {args.prompt} is saved as {os.path.join(args.image_save_dir, 'img.png')}" |
742 | | - ) |
743 | | - |
744 | | - if args.prompt_file is not None: |
745 | | - from auto_round.compressors.diffusion import diffusion_eval |
746 | | - |
747 | | - metrics = args.metrics.split(",") |
748 | | - diffusion_eval(pipe, args.prompt_file, metrics, args.image_save_dir, 1, gen_kwargs) |
749 | | - return |
750 | | - |
751 | | - lm_eval_version = get_library_version("lm-eval") |
752 | | - |
753 | | - eval_folder = folders[-1] |
754 | | - if args.tasks is None or args.tasks == "" or eval_folder is None: |
755 | | - return |
756 | | - |
757 | | - tasks = args.tasks |
758 | | - if isinstance(tasks, str): |
759 | | - tasks = tasks.split(",") |
760 | | - |
761 | | - from lm_eval.utils import make_table # pylint: disable=E0401 |
762 | | - |
763 | | - logger.info(f"Using lm-eval version {lm_eval_version}") |
764 | | - eval_gguf_model = False |
765 | | - for file in os.listdir(eval_folder): |
766 | | - if file.endswith("gguf"): |
767 | | - eval_gguf_model = True |
768 | | - break |
769 | | - |
770 | | - import time |
771 | | - |
772 | | - if "llama" in args.model.lower() and not args.add_bos_token: |
773 | | - logger.warning("set add_bos_token=True for llama model.") |
774 | | - args.add_bos_token = True |
775 | | - if (autoround.act_bits <= 8 and formats[-1] == "fake") or eval_gguf_model: |
776 | | - if eval_gguf_model: |
777 | | - # for file in os.listdir(eval_folder): |
778 | | - # gguf_file = file |
779 | | - gguf_file = None |
780 | | - gguf_format = None # Initialize gguf_format to None |
781 | | - # gguf folder only contains one file |
782 | | - for format in formats: |
783 | | - if format.startswith("gguf"): |
784 | | - gguf_format = format.split(":")[-1].upper() |
785 | | - if gguf_format is None: # Validate gguf_format after the loop |
786 | | - logger.error("No valid gguf format found in formats. Please check the input.") |
787 | | - sys.exit(-1) |
788 | | - for file in os.listdir(eval_folder): |
789 | | - if gguf_format in file: |
790 | | - gguf_file = file |
791 | | - |
792 | | - logger.warning("evaluating gguf model is an experimental feature, the accuracy may be not correct.") |
793 | | - if eval_model_dtype == "float32" or eval_model_dtype == "auto": |
794 | | - logger.warning( |
795 | | - "set '--eval_model_dtype bf16' can significantly speed up evaluation for gguf model," |
796 | | - " but may affect accuracy." |
797 | | - ) |
798 | | - if gguf_file is None: |
799 | | - logger.error("Cannot find correct gguf file for evaluation, please check.") |
800 | | - sys.exit(-1) |
801 | | - model = AutoModelForCausalLM.from_pretrained( |
802 | | - eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype |
803 | | - ) |
804 | | - model.eval() |
805 | | - tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file) |
806 | | - else: |
807 | | - if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1: |
808 | | - from accelerate.big_modeling import dispatch_model |
809 | | - |
810 | | - dispatch_model(model, model.hf_device_map) |
811 | | - else: |
812 | | - device_str = detect_device(device_str) |
813 | | - model = model.to(device_str) |
814 | | - if model.dtype != eval_model_dtype and eval_model_dtype != "auto": |
815 | | - model.to(getattr(torch, eval_model_dtype)) |
816 | | - |
817 | | - if args.eval_task_by_task: |
818 | | - eval_task_by_task( |
819 | | - model, |
820 | | - tokenizer=tokenizer, |
821 | | - device=device_str, |
822 | | - tasks=args.tasks, |
823 | | - limit=args.limit, |
824 | | - batch_size=args.eval_bs, |
825 | | - eval_model_dtype=eval_model_dtype, |
826 | | - add_bos_token=args.add_bos_token, |
827 | | - ) |
828 | | - else: |
829 | | - if args.eval_bs is None or args.eval_bs == "auto": |
830 | | - logger.warning("This API does not support auto currently, reset eval_bs to 16") |
831 | | - args.eval_bs = 16 |
832 | | - from auto_round.eval.evaluation import simple_evaluate_user_model |
833 | | - |
834 | | - st = time.time() |
835 | | - |
836 | | - res = simple_evaluate_user_model( |
837 | | - model, |
838 | | - tokenizer, |
839 | | - tasks=tasks, |
840 | | - batch_size=args.eval_bs, |
841 | | - limit=args.limit, |
842 | | - device=device_str, |
843 | | - eval_model_dtype=eval_model_dtype, |
844 | | - add_bos_token=args.add_bos_token, |
845 | | - ) |
846 | | - print(make_table(res)) |
847 | | - print("evaluation running time=%ds" % (time.time() - st)) |
848 | | - else: |
849 | | - if args.eval_task_by_task: |
850 | | - eval_task_by_task( |
851 | | - eval_folder, |
852 | | - device=device_str, |
853 | | - tasks=args.tasks, |
854 | | - batch_size=args.eval_bs, |
855 | | - limit=args.limit, |
856 | | - eval_model_dtype=eval_model_dtype, |
857 | | - mllm=autoround.mllm, # pylint: disable=E1101 |
858 | | - add_bos_token=args.add_bos_token, |
859 | | - ) |
860 | | - else: |
861 | | - from auto_round.eval.evaluation import simple_evaluate |
862 | | - |
863 | | - tasks, model_args, device_str = _eval_init( |
864 | | - args.tasks, eval_folder, args.device_map, args.disable_trust_remote_code, dtype=eval_model_dtype |
865 | | - ) |
866 | | - st = time.time() |
867 | | - model_args += f",add_bos_token={args.add_bos_token}" |
868 | | - if autoround.mllm: # pylint: disable=E1101 |
869 | | - model_type = "hf-multimodal" |
870 | | - if args.eval_bs is None or args.eval_bs == "auto": |
871 | | - logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16") |
872 | | - args.eval_bs = 16 |
873 | | - else: |
874 | | - model_type = "hf" |
875 | | - res = simple_evaluate( |
876 | | - model=model_type, |
877 | | - model_args=model_args, |
878 | | - tasks=tasks, |
879 | | - device=device_str, |
880 | | - batch_size=args.eval_bs, |
881 | | - limit=args.limit, |
882 | | - ) |
883 | | - print(make_table(res)) |
884 | | - print("evaluation running time=%ds" % (time.time() - st)) |
| 728 | + # ======================= Model evaluation ======================= |
| 729 | + run_model_evaluation(model, tokenizer, autoround, folders, formats, device_str, args) |
885 | 730 |
|
886 | 731 |
|
887 | 732 | def setup_eval_parser(): |
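
For reference, the roughly 170 removed lines above collapse into the single `run_model_evaluation` call. Its real body lives in `auto_round/eval/evaluation.py` and is not shown in this diff; the skeleton below is only a condensed approximation of the dispatch that moved, reconstructed from the deleted code:

```python
import os

def run_model_evaluation(model, tokenizer, autoround, folders, formats, device_str, args):
    # Diffusion models take an image-generation path (--prompt / --prompt_file)
    # rather than lm-eval text tasks.
    if getattr(autoround, "diffusion", False):
        ...
        return
    eval_folder = folders[-1]
    if args.tasks is None or args.tasks == "" or eval_folder is None:
        return
    eval_gguf_model = any(f.endswith("gguf") for f in os.listdir(eval_folder))
    if (autoround.act_bits <= 8 and formats[-1] == "fake") or eval_gguf_model:
        # Evaluate the in-memory model (reloading from the gguf file if needed),
        # either task by task or via simple_evaluate_user_model.
        ...
    else:
        # Evaluate the exported folder through lm-eval's simple_evaluate
        # (hf or hf-multimodal model type), or eval_task_by_task.
        ...
```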
@@ -909,6 +754,7 @@ def run_eval(): |
909 | 754 | eval_task_by_task( |
910 | 755 | model=args.model, |
911 | 756 | device=args.device_map, |
| 757 | + limit=args.limit, |
912 | 758 | tasks=args.tasks, |
913 | 759 | batch_size=args.eval_bs, |
914 | 760 | trust_remote_code=not args.disable_trust_remote_code, |
|