From 811b547c13d276d29f329dff3466e5b3984e3912 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Tue, 11 Nov 2025 15:21:53 +0800
Subject: [PATCH 1/5] add per-task lm_eval args for experimental usage

Signed-off-by: Zhang, Weiwei1
---
 auto_round/__main__.py      | 30 +++++++++++++
 auto_round/eval/eval_cli.py | 85 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index 76a8f73d1..aea845b43 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -367,6 +367,25 @@ def __init__(self, *args, **kwargs):
             "Options: 'float16', 'bfloat16', 'float32'. "
             "Should match your hardware capabilities for best performance.",
         )
+        eval_args.add_argument(
+            "--task_configs",
+            type=str,
+            default=None,
+            help=(
+                "Optional per-task configuration in JSON or simplified format. "
+                "Example JSON: "
+                "'{\"gsm8k_llama\": {\"apply_chat_template\": true, \"fewshot_as_multiturn\": true}, "
+                " \"hellaswag\": {\"num_fewshot\": 10}}' "
+                "You can also provide a JSON file path like 'task_configs.json'."
+            ),
+        )
+        eval_args.add_argument(
+            "--disable_thinking",
+            action="store_true",
+            help=("wheather to disable thinking mode of chat_template."
+            ),
+        )
+        eval_args.add_argument("--max_length", default=None, type=int, help="Max generation length for eval; defaults to the model's setting.")

         ## ======================= MLLM =======================
         mllm_args = self.add_argument_group("Multimodal Large Language Model(MLLM) arguments")
@@ -735,6 +754,9 @@ def tune(args):
             limit=args.limit,
             batch_size=args.eval_bs,
             eval_model_dtype=eval_model_dtype,
+            task_configs=args.task_configs,
+            disable_thinking=args.disable_thinking,
+            max_length=args.max_length,
         )
     else:
         if args.eval_bs is None or args.eval_bs == "auto":
@@ -763,11 +785,15 @@ def tune(args):
             eval_task_by_task(
                 eval_folder,
                 device=device_str,
+                tokenizer=tokenizer,
                 tasks=args.tasks,
                 batch_size=args.eval_bs,
                 limit=args.limit,
                 eval_model_dtype=eval_model_dtype,
                 mllm=autoround.mllm,  # pylint: disable=E1101
+                task_configs=args.task_configs,
+                disable_thinking=args.disable_thinking,
+                max_length=args.max_length,
             )
         else:
             from auto_round.eval.evaluation import simple_evaluate
@@ -821,6 +847,9 @@ def run_eval():
             batch_size=args.eval_bs,
             trust_remote_code=not args.disable_trust_remote_code,
             eval_model_dtype=args.eval_model_dtype,
+            task_configs=args.task_configs,
+            disable_thinking=args.disable_thinking,
+            max_length=args.max_length,
         )
     else:
         eval(args)
@@ -852,3 +881,4 @@ def run_fast():
 
 if __name__ == "__main__":
     run()
+
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 009b6458d..a25de3983 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -101,6 +101,25 @@ def __init__(self, *args, **kwargs):
             choices=["hf", "vllm"],
             help="Backend to use for model evaluation. Use hf backend for evaluation by default.",
         )
+        self.add_argument(
+            "--task_configs",
+            type=str,
+            default=None,
+            help=(
+                "Optional per-task configuration in JSON or simplified format. "
+                "Example JSON: "
+                "'{\"gsm8k_llama\": {\"apply_chat_template\": true, \"fewshot_as_multiturn\": true}, "
+                " \"hellaswag\": {\"num_fewshot\": 10}}' "
+                "You can also provide a JSON file path like 'task_configs.json'."
+            ),
+        )
+        self.add_argument(
+            "--disable_thinking",
+            action="store_true",
+            help=("wheather to disable thinking mode of chat_template."
+            ),
+        )
+        self.add_argument("--max_length", default=None, type=int, help="max generation length for eval")

         # vllm related arguments
         vllm_args = self.add_argument_group("vllm backend arguments")
@@ -221,7 +240,34 @@ def eval_task_by_task(
     eval_model_dtype=None,
     retry_times=3,
     mllm=False,
+    task_configs=None,  # e.g. {"gsm8k": {"apply_chat_template": True, "fewshot_as_multiturn": True}}
+    disable_thinking=False,
+    max_length=None,  # default to algin with model's original setting
 ):
+    """
+    Evaluate each LM-eval task sequentially, with optional per-task overrides.
+
+    Args:
+        model (str | nn.Module): Model path or loaded model.
+        device (str): Device id (e.g. "0" or "cuda:0").
+        tasks (list[str] | str): Comma-separated tasks to run.
+        tokenizer: HuggingFace tokenizer.
+        batch_size: Eval batch size (default: "auto:8").
+        limit: Number of samples or fraction per task.
+        task_configs (dict): Optional task-specific settings like fewshot/chat.
+    """
+    if isinstance(task_configs, str):
+        if os.path.isfile(task_configs):
+            with open(task_configs, "r") as f:
+                task_configs = json.load(f)
+        else:
+            try:
+                task_configs = json.loads(task_configs)
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Invalid --task_configs format: {e}")
+    elif task_configs is None:
+        task_configs = {}
+
     set_cuda_visible_devices(device)
     device_str, parallelism = get_device_and_parallelism(device)
 
@@ -237,6 +283,10 @@ def eval_task_by_task(
 
     if batch_size is None:
         batch_size = "auto:8"
+
+    # -------------------------------
+    # Load model (support gguf)
+    # -------------------------------
     is_gguf_file = False
     if not isinstance(model, str):
         parallelism = False
@@ -265,6 +315,21 @@ def eval_task_by_task(
         )
         model.eval()
     parallelism = False
+
+    # -------------------------------
+    # Build LM-eval model wrapper
+    # -------------------------------
+    if disable_thinking: ## align with fp-quant
+        from functools import partial
+        tokenizer.apply_chat_template = partial(
+            tokenizer.apply_chat_template,
+            enable_thinking=False
+        )
+    # check the max_lentgh
+    init_kwargs = {}
+    if max_length is not None:
+        init_kwargs["max_length"] = max_length
+
     if mllm:
         if batch_size is None or batch_size == "auto":
             logger.warning("hf-multimodal models does not support auto currently, reset eval_bs to 16")
@@ -278,6 +343,7 @@ def eval_task_by_task(
             parallelize=parallelism,
             trust_remote_code=trust_remote_code,
             dtype=eval_model_dtype,
+            **init_kwargs,
         )
     else:
         hflm = HFLM(
@@ -289,6 +355,7 @@ def eval_task_by_task(
             parallelize=parallelism,
             trust_remote_code=trust_remote_code,
             dtype=eval_model_dtype,
+            **init_kwargs,
         )
 
     if isinstance(tasks, str):
@@ -302,10 +369,21 @@ def eval_task_by_task(
 
     st = time.time()
     for task in tasks:
+        task_cfg = task_configs.get(task, {})
+        num_fewshot = task_cfg.get("num_fewshot")
+        apply_chat_template = task_cfg.get("apply_chat_template", False)
+        batch_size = task_cfg.get("batch_size", batch_size)
+        fewshot_as_multiturn = task_cfg.get("fewshot_as_multiturn", False)
+        logger.info(f"=== Running task: {task} ===")
+        logger.info(f"Task config: fewshot={num_fewshot}, apply_chat_template={apply_chat_template}," \
+            f"fewshot_as_multiturn={fewshot_as_multiturn}, batch_size={batch_size}")
         while retry_times:
             try:
                 res = lm_simple_evaluate(
-                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit
+                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit,
+                    num_fewshot=num_fewshot,
+                    apply_chat_template=apply_chat_template,
+                    fewshot_as_multiturn=fewshot_as_multiturn,
                 )
                 break
             except Exception as e:
@@ -317,7 +395,10 @@ def eval_task_by_task(
                         hflm.batch_sizes[k] = max(v // 2, 1)
                     logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.")
                     res = lm_simple_evaluate(
-                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit
+                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit,
+                        num_fewshot=num_fewshot,
+                        apply_chat_template=apply_chat_template,
+                        fewshot_as_multiturn=fewshot_as_multiturn,
                     )
                     hflm.batch_sizes = ori_batch_sizes
             except Exception as e:
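Patch 1 accepts --task_configs either as inline JSON or as a path to a JSON file, via the branch added at the top of eval_task_by_task. A minimal self-contained sketch of that parsing behavior, for reference — the helper name parse_task_configs and the task values are illustrative only:

    import json
    import os

    def parse_task_configs(task_configs):
        # Same branching as the patch: an existing file path is loaded as JSON,
        # any other string is parsed as inline JSON, and None becomes {}.
        if isinstance(task_configs, str):
            if os.path.isfile(task_configs):
                with open(task_configs, "r") as f:
                    return json.load(f)
            try:
                return json.loads(task_configs)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid --task_configs format: {e}")
        return task_configs or {}

    # Inline JSON, as it would arrive from the command line:
    print(parse_task_configs('{"hellaswag": {"num_fewshot": 10}}'))
    # -> {'hellaswag': {'num_fewshot': 10}}

Since the task loop looks settings up with task_configs.get(task, {}), tasks without an entry simply keep the global defaults, so per-task overrides are strictly opt-in.
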
From 27747bdd410608262192d5ef91a5a7193f4f13c8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 11 Nov 2025 07:23:15 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/__main__.py      |  8 +++-----
 auto_round/eval/eval_cli.py | 41 ++++++++++++++++++++++---------------
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index aea845b43..7a2da1f79 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -374,16 +374,15 @@ def __init__(self, *args, **kwargs):
             help=(
                 "Optional per-task configuration in JSON or simplified format. "
                 "Example JSON: "
-                "'{\"gsm8k_llama\": {\"apply_chat_template\": true, \"fewshot_as_multiturn\": true}, "
-                " \"hellaswag\": {\"num_fewshot\": 10}}' "
+                '\'{"gsm8k_llama": {"apply_chat_template": true, "fewshot_as_multiturn": true}, '
+                ' "hellaswag": {"num_fewshot": 10}}\' '
                 "You can also provide a JSON file path like 'task_configs.json'."
             ),
         )
         eval_args.add_argument(
             "--disable_thinking",
             action="store_true",
-            help=("wheather to disable thinking mode of chat_template."
-            ),
+            help=("wheather to disable thinking mode of chat_template."),
         )
         eval_args.add_argument("--max_length", default=None, type=int, help="Max generation length for eval; defaults to the model's setting.")
 
@@ -881,4 +880,3 @@ def run_fast():
 
 if __name__ == "__main__":
     run()
-
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index a25de3983..38f14a882 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -108,16 +108,15 @@ def __init__(self, *args, **kwargs):
             help=(
                 "Optional per-task configuration in JSON or simplified format. "
                 "Example JSON: "
-                "'{\"gsm8k_llama\": {\"apply_chat_template\": true, \"fewshot_as_multiturn\": true}, "
-                " \"hellaswag\": {\"num_fewshot\": 10}}' "
+                '\'{"gsm8k_llama": {"apply_chat_template": true, "fewshot_as_multiturn": true}, '
+                ' "hellaswag": {"num_fewshot": 10}}\' '
                 "You can also provide a JSON file path like 'task_configs.json'."
             ),
         )
         self.add_argument(
             "--disable_thinking",
             action="store_true",
-            help=("wheather to disable thinking mode of chat_template."
-            ),
+            help=("wheather to disable thinking mode of chat_template."),
         )
         self.add_argument("--max_length", default=None, type=int, help="max generation length for eval")
 
@@ -242,7 +241,7 @@ def eval_task_by_task(
     mllm=False,
     task_configs=None,  # e.g. {"gsm8k": {"apply_chat_template": True, "fewshot_as_multiturn": True}}
     disable_thinking=False,
-    max_length=None,  # default to algin with model's original setting
+    max_length=None,  # default to align with model's original setting
 ):
     """
     Evaluate each LM-eval task sequentially, with optional per-task overrides.
@@ -315,17 +314,15 @@ def eval_task_by_task(
         )
         model.eval()
     parallelism = False
-
+
     # -------------------------------
     # Build LM-eval model wrapper
     # -------------------------------
-    if disable_thinking: ## align with fp-quant
+    if disable_thinking:  ## align with fp-quant
         from functools import partial
-        tokenizer.apply_chat_template = partial(
-            tokenizer.apply_chat_template,
-            enable_thinking=False
-        )
-    # check the max_lentgh
+
+        tokenizer.apply_chat_template = partial(tokenizer.apply_chat_template, enable_thinking=False)
+    # check the max_length
     init_kwargs = {}
     if max_length is not None:
         init_kwargs["max_length"] = max_length
@@ -375,12 +372,19 @@ def eval_task_by_task(
         batch_size = task_cfg.get("batch_size", batch_size)
         fewshot_as_multiturn = task_cfg.get("fewshot_as_multiturn", False)
         logger.info(f"=== Running task: {task} ===")
-        logger.info(f"Task config: fewshot={num_fewshot}, apply_chat_template={apply_chat_template}," \
-            f"fewshot_as_multiturn={fewshot_as_multiturn}, batch_size={batch_size}")
+        logger.info(
+            f"Task config: fewshot={num_fewshot}, apply_chat_template={apply_chat_template},"
+            f"fewshot_as_multiturn={fewshot_as_multiturn}, batch_size={batch_size}"
+        )
         while retry_times:
             try:
                 res = lm_simple_evaluate(
-                    model=hflm, model_args=None, device=device_str, tasks=task, batch_size=batch_size, limit=limit,
+                    model=hflm,
+                    model_args=None,
+                    device=device_str,
+                    tasks=task,
+                    batch_size=batch_size,
+                    limit=limit,
                     num_fewshot=num_fewshot,
                     apply_chat_template=apply_chat_template,
                     fewshot_as_multiturn=fewshot_as_multiturn,
@@ -395,7 +399,12 @@ def eval_task_by_task(
                         hflm.batch_sizes[k] = max(v // 2, 1)
                     logger.warning(f"Out of memory, reset batch_size to {hflm.batch_sizes} and re-try.")
                     res = lm_simple_evaluate(
-                        model=hflm, model_args=None, device=device_str, tasks=task, batch_size=1, limit=limit,
+                        model=hflm,
+                        model_args=None,
+                        device=device_str,
+                        tasks=task,
+                        batch_size=1,
+                        limit=limit,
                         num_fewshot=num_fewshot,
                         apply_chat_template=apply_chat_template,
                         fewshot_as_multiturn=fewshot_as_multiturn,

From 4a7050768d9f039e38af48ffa454edcbe074bd8a Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Wed, 12 Nov 2025 15:24:11 +0800
Subject: [PATCH 3/5] fix missing json import

Signed-off-by: Zhang, Weiwei1
---
 auto_round/eval/eval_cli.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 38f14a882..7d6f8a1cb 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -14,6 +14,7 @@
 import argparse
 import os
 import time
+import json
 
 from auto_round.utils import (
     clear_memory,

From 5a8bf1e74d2617dcf657ec36fc631e9b40c769f9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 12 Nov 2025 07:24:56 +0000
Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/eval/eval_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 7d6f8a1cb..c84f14e16 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
+import json
 import os
 import time
-import json
 
 from auto_round.utils import (
     clear_memory,

From 795e76689365017207cdff4077f471e7d161ee29 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Wed, 12 Nov 2025 15:28:09 +0800
Subject: [PATCH 5/5] fix pylint typo

Signed-off-by: Zhang, Weiwei1
---
 auto_round/__main__.py      | 2 +-
 auto_round/eval/eval_cli.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index 7a2da1f79..97b34864c 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -382,7 +382,7 @@ def __init__(self, *args, **kwargs):
         eval_args.add_argument(
             "--disable_thinking",
             action="store_true",
-            help=("wheather to disable thinking mode of chat_template."),
+            help=("whether to disable thinking mode of chat_template."),
         )
         eval_args.add_argument("--max_length", default=None, type=int, help="Max generation length for eval; defaults to the model's setting.")
 
diff --git a/auto_round/eval/eval_cli.py b/auto_round/eval/eval_cli.py
index 7d6f8a1cb..4bfdd304a 100644
--- a/auto_round/eval/eval_cli.py
+++ b/auto_round/eval/eval_cli.py
@@ -117,7 +117,7 @@ def __init__(self, *args, **kwargs):
         self.add_argument(
             "--disable_thinking",
             action="store_true",
-            help=("wheather to disable thinking mode of chat_template."),
+            help=("whether to disable thinking mode of chat_template."),
         )
         self.add_argument("--max_length", default=None, type=int, help="max generation length for eval")
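
Taken together, the series exposes three new evaluation knobs — --task_configs, --disable_thinking, and --max_length — all forwarded into eval_task_by_task. A minimal sketch of exercising them from Python rather than the CLI; the checkpoint name, device id, and override values are illustrative, and this assumes the hf backend loads the tokenizer from the model path as the existing str-model code path does:

    from auto_round.eval.eval_cli import eval_task_by_task

    # Per-task overrides, same shape as the JSON accepted by --task_configs.
    task_configs = {
        "gsm8k_llama": {"apply_chat_template": True, "fewshot_as_multiturn": True},
        "hellaswag": {"num_fewshot": 10},
    }

    eval_task_by_task(
        "Qwen/Qwen3-8B",                # illustrative checkpoint path
        device="0",
        tasks="gsm8k_llama,hellaswag",  # comma-separated; run one task at a time
        batch_size="auto:8",
        task_configs=task_configs,      # per-task fewshot/chat-template overrides
        disable_thinking=True,          # wraps tokenizer.apply_chat_template with enable_thinking=False
        max_length=4096,                # forwarded to HFLM via init_kwargs["max_length"]
    )

One behavior worth noting: a per-task "batch_size" override rebinds the shared batch_size variable inside the task loop (batch_size = task_cfg.get("batch_size", batch_size)), so an override set for one task also becomes the default for the tasks that follow it.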