From 811b547c13d276d29f329dff3466e5b3984e3912 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Tue, 11 Nov 2025 15:21:53 +0800
Subject: [PATCH 1/5] add per-task lm_eval args for experimental usage

Signed-off-by: Zhang, Weiwei1
---
 auto_round/__main__.py      | 30 +++++++++++++
 auto_round/eval/eval_cli.py | 85 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index 76a8f73d1..aea845b43 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -367,6 +367,25 @@ def __init__(self, *args, **kwargs):
             "Options: 'float16', 'bfloat16', 'float32'. "
             "Should match your hardware capabilities for best performance.",
         )
+        eval_args.add_argument(
+            "--task_configs",
+            type=str,
+            default=None,
+            help=(
+                "Optional per-task configuration in JSON or simplified format. "
+                "Example JSON: "
+                "'{\"gsm8k_llama\": {\"apply_chat_template\": true, \"fewshot_as_multiturn\": true}, "
+                " \"hellaswag\": {\"num_fewshot\": 10}}' "
+                "You can also provide a JSON file path like 'task_configs.json'."
+            ),
+        )
+        eval_args.add_argument(
+            "--disable_thinking",
+            action="store_true",
+            help=("wheather to disable thinking mode of chat_template."
+            ),
+        )
+        eval_args.add_argument("--max_length", default=None, type=int, help="Max generation length for eval; defaults to the model's setting.")

         ## ======================= MLLM =======================
         mllm_args = self.add_argument_group("Multimodal Large Language Model(MLLM) arguments")
@@ -735,6 +754,9 @@ def tune(args):
             limit=args.limit,
             batch_size=args.eval_bs,
             eval_model_dtype=eval_model_dtype,
+            task_configs=args.task_configs,
+            disable_thinking=args.disable_thinking,
+            max_length=args.max_length,
         )
     else:
         if args.eval_bs is None or args.eval_bs == "auto":
@@ -763,11 +785,15 @@ def tune(args):
             eval_task_by_task(
                 eval_folder,
                 device=device_str,
+                tokenizer=tokenizer,
                 tasks=args.tasks,
                 batch_size=args.eval_bs,
                 limit=args.limit,
                 eval_model_dtype=eval_model_dtype,
                 mllm=autoround.mllm,  # pylint: disable=E1101
+                task_configs=args.task_configs,
+                disable_thinking=args.disable_thinking,
+                max_length=args.max_length,
             )
         else:
             from auto_round.eval.evaluation import simple_evaluate
@@ -821,6 +847,9 @@ def run_eval():
             batch_size=args.eval_bs,
             trust_remote_code=not args.disable_trust_remote_code,
             eval_model_dtype=args.eval_model_dtype,
+            task_configs=args.task_configs,
+            disable_thinking=args.disable_thinking,
+            max_length=args.max_length,
         )
     else:
         eval(args)
@@ -852,3 +881,4 @@ def run_fast():
 
 if __name__ == "__main__":
     run()
+
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 009b6458d..a25de3983 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -101,6 +101,25 @@ def __init__(self, *args, **kwargs):
             choices=["hf", "vllm"],
             help="Backend to use for model evaluation. Use hf backend for evaluation by default.",
         )
+        self.add_argument(
+            "--task_configs",
+            type=str,
+            default=None,
+            help=(
+                "Optional per-task configuration in JSON or simplified format. "
+                "Example JSON: "
+                "'{\"gsm8k_llama\": {\"apply_chat_template\": true, \"fewshot_as_multiturn\": true}, "
+                " \"hellaswag\": {\"num_fewshot\": 10}}' "
+                "You can also provide a JSON file path like 'task_configs.json'."
+            ),
+        )
+        self.add_argument(
+            "--disable_thinking",
+            action="store_true",
+            help=("wheather to disable thinking mode of chat_template."
+            ),
+        )
+        self.add_argument("--max_length", default=None, type=int, help="max generation length for eval")

         # vllm related arguments
         vllm_args = self.add_argument_group("vllm backend arguments")
@@ -221,7 +240,34 @@ def eval_task_by_task(
     eval_model_dtype=None,
     retry_times=3,
     mllm=False,
+    task_configs=None,  # e.g. {"gsm8k": {"apply_chat_template": True, "fewshot_as_multiturn": True}}
+    disable_thinking=False,
+    max_length=None,  # default to algin with model's original setting
 ):
+    """
+    Evaluate each LM-eval task sequentially, with optional per-task overrides.
+
+    Args:
+        model (str | nn.Module): Model path or loaded model.
+        device (str): Device id (e.g. "0" or "cuda:0").
+        tasks (list[str] | str): Comma-separated tasks to run.
+        tokenizer: HuggingFace tokenizer.
+        batch_size: Eval batch size (default: "auto:8").
+        limit: Number of samples or fraction per task.
+        task_configs (dict): Optional task-specific settings like fewshot/chat.
+    """
+    if isinstance(task_configs, str):
+        if os.path.isfile(task_configs):
+            with open(task_configs, "r") as f:
+                task_configs = json.load(f)
+        else:
+            try:
+                task_configs = json.loads(task_configs)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid --task_configs format: {e}")
+    elif task_configs is None:
+        task_configs = {}
+
     set_cuda_visible_devices(device)
     device_str, parallelism = get_device_and_parallelism(device)
 
@@ -237,6 +283,10 @@ def eval_task_by_task(
 
     if batch_size is None:
         batch_size = "auto:8"
+
+    # -------------------------------
+    # Load model (support gguf)
+    # -------------------------------
     is_gguf_file = False
     if not isinstance(model, str):
         parallelism = False
@@ -265,6 +315,21 @@ def eval_task_by_task(
         )
         model.eval()
     parallelism = False
+
+    # -------------------------------
+    # Build LM-eval model wrapper
+    # -------------------------------
+    if disable_thinking: ## align with fp-quant
+        from functools import partial
+        tokenizer.apply_chat_template = partial(
+            tokenizer.apply_chat_template,
+            enable_thinking=False
+        )
+    # check the max_lentgh
+    init_kwargs = {}
+    if max_length is not None:
+        init_kwargs["max_length"] = max_length
+
     if mllm:
         if batch_size is None or batch_size == "auto":
             logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16")
@@ -278,6 +343,7 @@ def eval_task_by_task(
             parallelize=parallelism,
             trust_remote_code=trust_remote_code,
             dtype=eval_model_dtype,
+            **init_kwargs,
         )
     else:
         hflm = HFLM(
@@ -289,6 +355,7 @@ def eval_task_by_task(
             parallelize=parallelism,
             trust_remote_code=trust_remote_code,
             dtype=eval_model_dtype,
+            **init_kwargs,
         )
 
     if isinstance(tasks, str):
@@ -302,10 +369,21 @@ def eval_task_by_task(
 
     st = time.time()
     for task in tasks:
+        task_cfg = task_configs.get(task, {})
+        num_fewshot = task_cfg.get("num_fewshot")
+        apply_chat_template = task_cfg.get("apply_chat_template", False)
+        batch_size = task_cfg.get("batch_size", batch_size)
+        fewshot_as_multiturn = task_cfg.get("fewshot_as_multiturn", False)
+        logger.info(f"=== Running task: {task} ===")
+        logger.info(f"Task config: fewshot={num_fewshot}, apply_chat_template={apply_chat_template}," \
+            f"fewshot_as_multiturn={fewshot_as_multiturn}, batch_size={batch_size}")
         while retry_times:
             try:
                 res = lm_simple_evaluate(
-                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit
+                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit,
+                    num_fewshot=num_fewshot,
+                    apply_chat_template=apply_chat_template,
+                    fewshot_as_multiturn=fewshot_as_multiturn,
                 )
                 break
             except Exception as e:
@@ -317,7 +395,10 @@ def eval_task_by_task(
                         hflm.batch_sizes[k] = max(v // 2, 1)
                     logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.")
                     res = lm_simple_evaluate(
-                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit
+                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit,
+                        num_fewshot=num_fewshot,
+                        apply_chat_template=apply_chat_template,
+                        fewshot_as_multiturn=fewshot_as_multiturn,
                     )
                     hflm.batch_sizes = ori_batch_sizes
             except Exception as e:
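Patch 1 accepts --task_configs either as inline JSON or as a path to a JSON file, via the branch added at the top of eval_task_by_task. A minimal self-contained sketch of that parsing behavior, for reference — the helper name parse_task_configs and the task values are illustrative only:

    import json
    import os

    def parse_task_configs(task_configs):
        # Same branching as the patch: an existing file path is loaded as JSON,
        # any other string is parsed as inline JSON, and None becomes {}.
        if isinstance(task_configs, str):
            if os.path.isfile(task_configs):
                with open(task_configs, "r") as f:
                    return json.load(f)
            try:
                return json.loads(task_configs)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid --task_configs format: {e}")
        return task_configs or {}

    # Inline JSON, as it would arrive from the command line:
    print(parse_task_configs('{"hellaswag": {"num_fewshot": 10}}'))
    # -> {'hellaswag': {'num_fewshot': 10}}

Since the task loop looks settings up with task_configs.get(task, {}), tasks without an entry simply keep the global defaults, so per-task overrides are strictly opt-in.
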
From 27747bdd410608262192d5ef91a5a7193f4f13c8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 11 Nov 2025 07:23:15 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/__main__.py      |  8 +++-----
 auto_round/eval/eval_cli.py | 41 ++++++++++++++++++++++---------------
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index aea845b43..7a2da1f79 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -374,16 +374,15 @@ def __init__(self, *args, **kwargs):
             help=(
                 "Optional per-task configuration in JSON or simplified format. "
                 "Example JSON: "
-                "'{\"gsm8k_llama\": {\"apply_chat_template\": true, \"fewshot_as_multiturn\": true}, "
-                " \"hellaswag\": {\"num_fewshot\": 10}}' "
+                '\'{"gsm8k_llama": {"apply_chat_template": true, "fewshot_as_multiturn": true}, '
+                ' "hellaswag": {"num_fewshot": 10}}\' '
                 "You can also provide a JSON file path like 'task_configs.json'."
             ),
         )
         eval_args.add_argument(
             "--disable_thinking",
             action="store_true",
-            help=("wheather to disable thinking mode of chat_template."
-            ),
+            help=("wheather to disable thinking mode of chat_template."),
         )
         eval_args.add_argument("--max_length", default=None, type=int, help="Max generation length for eval; defaults to the model's setting.")
 
@@ -881,4 +880,3 @@ def run_fast():
 
 if __name__ == "__main__":
     run()
-
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index a25de3983..38f14a882 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -108,16 +108,15 @@ def __init__(self, *args, **kwargs):
             help=(
                 "Optional per-task configuration in JSON or simplified format. "
                 "Example JSON: "
-                "'{\"gsm8k_llama\": {\"apply_chat_template\": true, \"fewshot_as_multiturn\": true}, "
-                " \"hellaswag\": {\"num_fewshot\": 10}}' "
+                '\'{"gsm8k_llama": {"apply_chat_template": true, "fewshot_as_multiturn": true}, '
+                ' "hellaswag": {"num_fewshot": 10}}\' '
                 "You can also provide a JSON file path like 'task_configs.json'."
             ),
         )
         self.add_argument(
             "--disable_thinking",
             action="store_true",
-            help=("wheather to disable thinking mode of chat_template."
-            ),
+            help=("wheather to disable thinking mode of chat_template."),
         )
         self.add_argument("--max_length", default=None, type=int, help="max generation length for eval")
 
@@ -242,7 +241,7 @@ def eval_task_by_task(
     mllm=False,
     task_configs=None,  # e.g. {"gsm8k": {"apply_chat_template": True, "fewshot_as_multiturn": True}}
     disable_thinking=False,
-    max_length=None,  # default to algin with model's original setting
+    max_length=None,  # default to align with model's original setting
 ):
     """
     Evaluate each LM-eval task sequentially, with optional per-task overrides.
@@ -315,17 +314,15 @@ def eval_task_by_task(
         )
         model.eval()
     parallelism = False
-
+
     # -------------------------------
     # Build LM-eval model wrapper
     # -------------------------------
-    if disable_thinking: ## align with fp-quant
+    if disable_thinking:  ## align with fp-quant
         from functools import partial
-        tokenizer.apply_chat_template = partial(
-            tokenizer.apply_chat_template,
-            enable_thinking=False
-        )
-    # check the max_lentgh
+
+        tokenizer.apply_chat_template = partial(tokenizer.apply_chat_template, enable_thinking=False)
+    # check the max_length
     init_kwargs = {}
     if max_length is not None:
         init_kwargs["max_length"] = max_length
@@ -375,12 +372,19 @@ def eval_task_by_task(
         batch_size = task_cfg.get("batch_size", batch_size)
         fewshot_as_multiturn = task_cfg.get("fewshot_as_multiturn", False)
         logger.info(f"=== Running task: {task} ===")
-        logger.info(f"Task config: fewshot={num_fewshot}, apply_chat_template={apply_chat_template}," \
-            f"fewshot_as_multiturn={fewshot_as_multiturn}, batch_size={batch_size}")
+        logger.info(
+            f"Task config: fewshot={num_fewshot}, apply_chat_template={apply_chat_template},"
+            f"fewshot_as_multiturn={fewshot_as_multiturn}, batch_size={batch_size}"
+        )
         while retry_times:
             try:
                 res = lm_simple_evaluate(
-                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit,
+                    model=hflm,
+                    model_args=None,
+                    device=device_str,
+                    tasks=task,
+                    batch_size=batch_size,
+                    limit=limit,
                     num_fewshot=num_fewshot,
                     apply_chat_template=apply_chat_template,
                     fewshot_as_multiturn=fewshot_as_multiturn,
@@ -395,7 +399,12 @@ def eval_task_by_task(
                         hflm.batch_sizes[k] = max(v // 2, 1)
                     logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.")
                     res = lm_simple_evaluate(
-                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit,
+                        model=hflm,
+                        model_args=None,
+                        device=device_str,
+                        tasks=task,
+                        batch_size=1,
+                        limit=limit,
                         num_fewshot=num_fewshot,
                         apply_chat_template=apply_chat_template,
                         fewshot_as_multiturn=fewshot_as_multiturn,

From 4a7050768d9f039e38af48ffa454edcbe074bd8a Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Wed, 12 Nov 2025 15:24:11 +0800
Subject: [PATCH 3/5] fix missing json import

Signed-off-by: Zhang, Weiwei1
---
 auto_round/eval/eval_cli.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 38f14a882..7d6f8a1cb 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -14,6 +14,7 @@
 import argparse
 import os
 import time
+import json
 
 from auto_round.utils import (
     clear_memory,

From 5a8bf1e74d2617dcf657ec36fc631e9b40c769f9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 12 Nov 2025 07:24:56 +0000
Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/eval/eval_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 7d6f8a1cb..c84f14e16 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
+import json
 import os
 import time
-import json
 
 from auto_round.utils import (
     clear_memory,

From 795e76689365017207cdff4077f471e7d161ee29 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Wed, 12 Nov 2025 15:28:09 +0800
Subject: [PATCH 5/5] fix pylint typo

Signed-off-by: Zhang, Weiwei1
---
 auto_round/__main__.py      | 2 +-
 auto_round/eval/eval_cli.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index 7a2da1f79..97b34864c 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -382,7 +382,7 @@ def __init__(self, *args, **kwargs):
         eval_args.add_argument(
             "--disable_thinking",
             action="store_true",
-            help=("wheather to disable thinking mode of chat_template."),
+            help=("whether to disable thinking mode of chat_template."),
         )
         eval_args.add_argument("--max_length", default=None, type=int, help="Max generation length for eval; defaults to the model's setting.")
 
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 7d6f8a1cb..4bfdd304a 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -117,7 +117,7 @@ def __init__(self, *args, **kwargs):
         self.add_argument(
             "--disable_thinking",
             action="store_true",
-            help=("wheather to disable thinking mode of chat_template."),
+            help=("whether to disable thinking mode of chat_template."),
         )
         self.add_argument("--max_length", default=None, type=int, help="max generation length for eval")
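
Taken together, the series exposes three new evaluation knobs — --task_configs, --disable_thinking, and --max_length — all forwarded into eval_task_by_task. A minimal sketch of exercising them from Python rather than the CLI; the checkpoint name, device id, and override values are illustrative, and this assumes the hf backend loads the tokenizer from the model path as the existing str-model code path does:

    from auto_round.eval.eval_cli import eval_task_by_task

    # Per-task overrides, same shape as the JSON accepted by --task_configs.
    task_configs = {
        "gsm8k_llama": {"apply_chat_template": True, "fewshot_as_multiturn": True},
        "hellaswag": {"num_fewshot": 10},
    }

    eval_task_by_task(
        "Qwen/Qwen3-8B",                # illustrative checkpoint path
        device="0",
        tasks="gsm8k_llama,hellaswag",  # comma-separated; run one task at a time
        batch_size="auto:8",
        task_configs=task_configs,      # per-task fewshot/chat-template overrides
        disable_thinking=True,          # wraps tokenizer.apply_chat_template with enable_thinking=False
        max_length=4096,                # forwarded to HFLM via init_kwargs["max_length"]
    )

One behavior worth noting: a per-task "batch_size" override rebinds the shared batch_size variable inside the task loop (batch_size = task_cfg.get("batch_size", batch_size)), so an override set for one task also becomes the default for the tasks that follow it.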