Fix CLI regressions (#3449)

fxmarty-amd · baberabb · web-flow · commit 35a6e862c41b · 2025-12-06T16:11:52.000+05:00
* fix regressions in cli

* linting?

* still allow space delimited

* use `SplitArgs` for --tasks; use nargs="+"; tests

---------

Co-authored-by: Baber <baber@hey.com>
diff --git a/lm_eval/_cli/run.py b/lm_eval/_cli/run.py
@@ -8,6 +8,7 @@
 from lm_eval._cli.subcommand import SubCommand
 from lm_eval._cli.utils import (
     MergeDictAction,
+    SplitArgs,
     _int_or_none_list_arg_type,
     request_caching_arg_to_dict,
     try_parse_json,
@@ -65,11 +66,11 @@ def _add_args(self) -> None:
             "--tasks",
             "-t",
             default=None,
-            type=str,
-            nargs="*",
+            nargs="+",
             metavar="<task>",
+            action=SplitArgs,
             help=textwrap.dedent("""
-                Space or Comma-separated list of task names or groupings.
+                Space (or comma-separated) list of task names or groupings.
                 Use 'lm-eval list tasks' to see all available tasks.
             """).strip(),
         )
@@ -85,7 +86,7 @@ def _add_args(self) -> None:
             "--model_args",
             "-a",
             default=None,
-            nargs="*",
+            nargs="+",
             action=MergeDictAction,
             metavar="<arg>",
             help="Model arguments as 'key=val,key2=val2' or `key=val` `key2=val2`",
@@ -153,7 +154,7 @@ def _add_args(self) -> None:
         eval_group.add_argument(
             "--gen_kwargs",
             default=None,
-            nargs="*",
+            nargs="+",
             action=MergeDictAction,
             metavar="<arg>",
             help=textwrap.dedent(
@@ -265,23 +266,23 @@ def _add_args(self) -> None:
         logging_group.add_argument(
             "--wandb_args",
             default=None,
-            nargs="*",
+            nargs="+",
             action=MergeDictAction,
             metavar="<args>",
             help="Weights & Biases init arguments key=val key2=val2",
         )
         logging_group.add_argument(
             "--wandb_config_args",
             default=None,
-            nargs="*",
+            nargs="+",
             action=MergeDictAction,
             metavar="<args>",
             help="Weights & Biases config arguments key=val key2=val2",
         )
         logging_group.add_argument(
             "--hf_hub_log_args",
             default=None,
-            nargs="*",
+            nargs="+",
             action=MergeDictAction,
             metavar="<args>",
             help="Hugging Face Hub logging arguments key=val key2=val2",
diff --git a/lm_eval/_cli/utils.py b/lm_eval/_cli/utils.py
@@ -21,15 +21,17 @@ def try_parse_json(value: str | dict[str, Any] | None) -> str | dict[str, Any] |
         if "{" in value:
             raise ValueError(
                 f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings."
-            )
+            ) from None
         return value
 
 
 def _int_or_none_list_arg_type(
     min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
 ) -> list[int | None]:
     """Parses a string of integers or 'None' values separated by a specified character into a list.
-    Validates the number of items against specified minimum and maximum lengths and fills missing values with defaults."""
+
+    Validates the number of items against specified minimum and maximum lengths and fills missing values with defaults.
+    """
 
     def parse_value(item):
         """Parses an individual item, converting it to an integer or `None`."""
@@ -39,7 +41,7 @@ def parse_value(item):
         try:
             return int(item)
         except ValueError:
-            raise ValueError(f"{item} is not an integer or None")
+            raise ValueError(f"{item} is not an integer or None") from None
 
     items = [parse_value(v) for v in value.split(split_char)]
     num_items = len(items)
@@ -109,6 +111,7 @@ def key_val_to_dict(args: str) -> dict[str, Any]:
     res = {}
     if not args:
         return res
+
     for k, v in (item.split("=") for item in args.split(",")):
         v = handle_cli_value_string(v)
         if k in res:
@@ -128,13 +131,34 @@ def __call__(
         option_string: str | None = None,
     ) -> None:
         current = vars(namespace).setdefault(self.dest, {}) or {}
-        if values:
-            for v in values:
-                v = key_val_to_dict(v)
-                if overlap := current.keys() & v.keys():
-                    eval_logger.warning(
-                        f"{option_string or self.dest}: Overwriting key {', '.join(f'{k}: {current[k]!r} -> {v[k]!r}' for k in overlap)}"
-                    )
-
-                current.update(v)
+
+        if not values:
+            return
+
+        # e.g. parses `{"pretrained":"/models/openai_gpt-oss-20b","dtype":"auto","chat_template_args":{"reasoning_effort":"low"},"enable_thinking": true,"think_end_token":"<|message|>"}`.
+        result = try_parse_json(values[0])
+
+        if isinstance(result, dict):
+            current = {**current, **result}
+        else:
+            # e.g. parses `max_gen_toks=8000`
+            if values:
+                for v in values:
+                    v = key_val_to_dict(v)
+                    if overlap := current.keys() & v.keys():
+                        eval_logger.warning(
+                            rf"{option_string or self.dest}: Overwriting {', '.join(f'{k}: {current[k]!r} -> {v[k]!r}' for k in overlap)}"
+                        )
+                    current.update(v)
+
         setattr(namespace, self.dest, current)
+
+
+class SplitArgs(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        items = getattr(namespace, self.dest) or []
+        values = values or []
+        assert values, f"--{self.dest} passed without any values"
+        for v in values:
+            items.extend(v.split(","))
+        setattr(namespace, self.dest, items)
diff --git a/lm_eval/config/evaluate_config.py b/lm_eval/config/evaluate_config.py
@@ -203,7 +203,7 @@ def from_cli(cls, namespace: Namespace) -> "EvaluatorConfig":
 
         # Load and merge YAML config if provided
         if used_config := getattr(namespace, "config", None):
-            config.update(cls.load_yaml_config(cast(str, used_config)))
+            config.update(cls.load_yaml_config(cast("str", used_config)))
 
         # Override with CLI args (only truthy values or 0, exclude non-config args)
         excluded_args = {"command", "func"}  # argparse internal args
@@ -320,7 +320,7 @@ def _process_arguments(self):
                 try:
                     self.samples = json.loads(self.samples)
                 except json.JSONDecodeError:
-                    if (samples_path := Path(cast(str, self.samples))).is_file():
+                    if (samples_path := Path(cast("str", self.samples))).is_file():
                         self.samples = json.loads(samples_path.read_text())
 
         # Set up metadata by merging model_args and metadata.
@@ -358,8 +358,11 @@ def process_tasks(self, metadata: dict | None = None) -> "TaskManager":
         )
 
         # Normalize tasks to a list
+        # We still allow tasks in the form task1,task2
         task_list = (
-            self.tasks.split(",") if isinstance(self.tasks, str) else list(self.tasks)
+            self.tasks.split(",")
+            if isinstance(self.tasks, str)
+            else [t for task in self.tasks for t in task.split(",")]
         )
 
         # Handle directory input
diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py
@@ -209,7 +209,7 @@ def simple_evaluate(
             "No tasks specified, or no tasks found. Please verify the task names."
         )
 
-    if gen_kwargs is not None:
+    if gen_kwargs:
         if isinstance(gen_kwargs, str):
             gen_kwargs = simple_parse_args_string(gen_kwargs)
         eval_logger.warning(
diff --git a/lm_eval/evaluator_utils.py b/lm_eval/evaluator_utils.py
@@ -3,7 +3,6 @@
 import math
 import pathlib
 import sys
-from typing import List, Optional, Tuple, Union
 
 from lm_eval.api.group import ConfigurableGroup
 from lm_eval.api.metrics import (
@@ -139,7 +138,7 @@ def __repr__(self):
         )
 
 
-def get_task_list(task_dict: dict) -> List[TaskOutput]:
+def get_task_list(task_dict: dict) -> list[TaskOutput]:
     outputs = []
     for task_name, task_obj in task_dict.items():
         if isinstance(task_obj, dict):
@@ -210,7 +209,7 @@ def print_writeout(task) -> None:
             eval_logger.info(f"Request: {str(inst)}")
 
 
-def get_sample_size(task, limit: Optional[int]) -> Union[int, None]:
+def get_sample_size(task, limit: int | None) -> int | None:
     if limit is not None:
         limit = (
             int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit)
@@ -223,7 +222,7 @@ def prepare_print_tasks(
     results: dict,
     task_depth=0,
     group_depth=0,
-) -> Tuple[dict, dict]:
+) -> tuple[dict, dict]:
     """
     @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
     value is a list of task names.
@@ -311,8 +310,8 @@ def _sort_task_dict(task_dict):
 
 
 def consolidate_results(
-    eval_tasks: List[TaskOutput],
-) -> Tuple[dict, dict, dict, dict, dict, dict]:
+    eval_tasks: list[TaskOutput],
+) -> tuple[dict, dict, dict, dict, dict, dict]:
     """
     @param eval_tasks: list(TaskOutput).
     @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
@@ -379,7 +378,7 @@ def consolidate_group_results(
     task_root=None,
     show_group_table=False,
     task_aggregation_list=None,
-) -> Tuple[dict, dict, bool, Union[None,]]:
+) -> tuple[dict, dict, bool, None]:
     """
     (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.
 
@@ -548,7 +547,7 @@ def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
 
 
 @positional_deprecated
-def run_task_tests(task_list: List[str]):
+def run_task_tests(task_list: list[str]):
     """
     Find the package root and run the tests for the given tasks
     """
diff --git a/tests/test_cli_subcommands.py b/tests/test_cli_subcommands.py

Original file line number	Diff line number	Diff line change
`@@ -209,7 +209,7 @@ def simple_evaluate(`
`209`	`209`	`"No tasks specified, or no tasks found. Please verify the task names."`
`210`	`210`	`)`
`211`	`211`
`212`		`- if gen_kwargs is not None:`
	`212`	`+ if gen_kwargs:`
`213`	`213`	`if isinstance(gen_kwargs, str):`
`214`	`214`	`gen_kwargs = simple_parse_args_string(gen_kwargs)`
`215`	`215`	`eval_logger.warning(`