22 changes: 10 additions & 12 deletions skills/huggingface-llm-trainer/SKILL.md
@@ -424,20 +424,18 @@ Before submitting:

**Identify models to train based on task type or benchmark results.**

Use `scripts/hf_benchmarks.py` to identify top-performing models for specific tasks. This helps the user select a model as the base for training, whilst keeping size and hardware constraints in mind.
Use `scripts/hf_benchmarks.py` to help choose a base model while keeping task fit, model size, and hardware constraints in mind.

```bash
# Get help on the benchmarks command:
uv run scripts/hf_benchmarks.py --help
```
Capabilities:
- search official benchmark datasets by free text, alias, task, and modality
- fetch normalized leaderboard rows for benchmark datasets
- fetch normalized `evalResults` rows for one or more candidate models
- work well in pipelines via stdin plus table / JSON / NDJSON output

### Example -- choosing an OCR base model
```bash
# Search for benchmarks whose name contains the text `ocr`
uv run scripts/hf_benchmarks.py search --query ocr
For command details, examples, and flags, use:

# Get the ranked leaderboard for the allenai/olmOCR-bench benchmark
uv run scripts/hf_benchmarks.py leaderboard allenai/olmOCR-bench
```bash
uv run scripts/hf_benchmarks.py --help
```
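
For orientation, a minimal sketch of how the subcommands chain together (flag names follow the script's built-in examples; the model id is a placeholder to replace with a real candidate):

```bash
# Find benchmark datasets matching a keyword, then fetch leaderboards for the top hits.
uv run scripts/hf_benchmarks.py search --query ocr --format ndjson \
  | uv run scripts/hf_benchmarks.py leaderboard --stdin --top 5 --format table

# Inspect the reported evalResults for a candidate base model.
uv run scripts/hf_benchmarks.py model-results <namespace>/<model> --format table
```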

## Cost Estimation
@@ -710,7 +708,7 @@ Add to PEP 723 header:
- `scripts/unsloth_sft_example.py` - Unsloth text LLM training template (faster, less VRAM)
- `scripts/estimate_cost.py` - Estimate time and cost (offer when appropriate)
- `scripts/convert_to_gguf.py` - Complete GGUF conversion script
- `scripts/hf_benchmarks.py` - Search for benchmark results and leaderboards by task, alias or free text.
- `scripts/hf_benchmarks.py` - Search benchmark datasets, fetch dataset leaderboards, and inspect model `evalResults`.

### External Scripts
- [Dataset Inspector](https://huggingface.co/datasets/mcp-tools/skills/raw/main/dataset_inspector.py) - Validate dataset format before training (use via `uv run` or `hf_jobs`)
218 changes: 215 additions & 3 deletions skills/huggingface-llm-trainer/scripts/hf_benchmarks.py
@@ -96,6 +96,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._search_parser: argparse.ArgumentParser | None = None
self._leaderboard_parser: argparse.ArgumentParser | None = None
self._model_results_parser: argparse.ArgumentParser | None = None

def format_help(self) -> str:
text = super().format_help()
@@ -113,6 +114,12 @@ def format_help(self) -> str:
+ textwrap.indent(self._leaderboard_parser.format_help().strip(), " ")
)

if self._model_results_parser is not None:
extra_sections.append(
"\nmodel-results command options:\n"
+ textwrap.indent(self._model_results_parser.format_help().strip(), " ")
)

if extra_sections:
text += "\n" + "\n".join(extra_sections) + "\n"
return text
@@ -401,10 +408,47 @@ def get_leaderboard(repo_id: str, task_id: str | None = None) -> list[dict[str,
return normalized


def read_repo_ids_from_stdin() -> list[str]:
def get_model_results(model_id: str) -> list[dict[str, Any]]:
namespace, repo = parse_repo_id(model_id)
data = http_get_json(
f"/api/models/{namespace}/{repo}",
params={"expand[]": "evalResults"},
)
if not isinstance(data, dict):
raise HfApiError(f"Unexpected model response for {model_id}")

eval_results = data.get("evalResults") or []
if not isinstance(eval_results, list):
raise HfApiError(f"Unexpected evalResults payload for {model_id}")

normalized: list[dict[str, Any]] = []
for row in eval_results:
payload = row.get("data") or {}
dataset = payload.get("dataset") or {}
source = payload.get("source") or {}
normalized.append(
{
"model_id": model_id,
"dataset_id": dataset.get("id"),
"task_id": dataset.get("task_id"),
"value": payload.get("value"),
"date": payload.get("date"),
"verified": row.get("verified"),
"filename": row.get("filename"),
"notes": payload.get("notes"),
"pull_request": row.get("pullRequest"),
"source_name": source.get("name"),
"source_url": source.get("url"),
}
)
return normalized
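
# A sketch of the evalResults payload shape assumed by the normalization above
# (field names mirror the .get() calls; the values shown are placeholders, and
# the live /api/models response may carry additional keys):
#
#   "evalResults": [
#     {
#       "verified": true,
#       "filename": "README.md",
#       "pullRequest": null,
#       "data": {
#         "dataset": {"id": "openai/gsm8k", "task_id": "text-generation"},
#         "value": 0.82,
#         "date": "...",
#         "notes": "...",
#         "source": {"name": "...", "url": "..."}
#       }
#     }
#   ]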


def read_repo_ids_from_stdin(*, json_keys: Iterable[str]) -> list[str]:
if sys.stdin.isatty():
return []

key_list = list(json_keys)
repo_ids: list[str] = []
for raw_line in sys.stdin:
line = raw_line.strip()
@@ -415,7 +459,12 @@ def read_repo_ids_from_stdin() -> list[str]:
obj = json.loads(line)
except json.JSONDecodeError:
continue
candidate = obj.get("dataset_id") or obj.get("id")
candidate = None
for key in key_list:
value = obj.get(key)
if isinstance(value, str) and "/" in value:
candidate = value
break
if isinstance(candidate, str) and "/" in candidate:
repo_ids.append(candidate)
continue
@@ -476,6 +525,27 @@ def print_leaderboard_table(rows: list[dict[str, Any]]) -> None:
print(" ".join(v.ljust(w) for v, w in zip(values, widths)))


def print_model_results_table(rows: list[dict[str, Any]]) -> None:
if not rows:
print("No model eval rows returned.")
return

headers = ["model_id", "dataset_id", "task_id", "value", "date", "verified"]
widths = [34, 30, 22, 10, 12, 8]
print(" ".join(h.ljust(w) for h, w in zip(headers, widths)))
print(" ".join("-" * w for w in widths))
for row in rows:
values = [
shorten(str(row.get("model_id") or ""), widths[0]),
shorten(str(row.get("dataset_id") or ""), widths[1]),
shorten(str(row.get("task_id") or ""), widths[2]),
shorten(str(row.get("value") or ""), widths[3]),
shorten(str(row.get("date") or ""), widths[4]),
str(row.get("verified")),
]
print(" ".join(v.ljust(w) for v, w in zip(values, widths)))


def build_parser() -> argparse.ArgumentParser:
parser = FullHelpArgumentParser(
prog="hf_benchmarks.py",
@@ -496,6 +566,20 @@ def build_parser() -> argparse.ArgumentParser:
3) Chain search -> leaderboard:
hf_benchmarks.py search --alias coding --format ndjson \\
| hf_benchmarks.py leaderboard --stdin --top 5 --format table

4) Fetch eval results for a list of models:
printf '%s\\n' Qwen/Qwen3.5-9B microsoft/Phi-3-medium-4k-instruct \\
| hf_benchmarks.py model-results --stdin --format ndjson

5) Use hf CLI for model discovery, then enrich with this tool:
hf models list --search 'Phi-3' --filter eval-results --limit 5 --format json \\
| jq -r '.[].id' \\
| hf_benchmarks.py model-results --stdin --format table

6) Use hf CLI for dataset discovery, then fetch leaderboards:
hf datasets list --search 'swe' --filter benchmark:official --limit 5 --format json \\
| jq -r '.[].id' \\
| hf_benchmarks.py leaderboard --stdin --top 5 --format table
"""
),
)
@@ -550,6 +634,29 @@ def build_parser() -> argparse.ArgumentParser:
leaderboard_parser = subparsers.add_parser(
"leaderboard",
help="Fetch normalized leaderboard rows for one or more benchmark datasets",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent(
"""
Fetch normalized leaderboard rows for one or more benchmark datasets.

This command is designed to pair well with `hf datasets list`, where
`hf` handles benchmark dataset discovery and this tool handles
leaderboard retrieval / flattening.
"""
),
epilog=textwrap.dedent(
"""
Examples:
hf_benchmarks.py leaderboard allenai/olmOCR-bench --top 10

printf '%s\\n' openai/gsm8k SWE-bench/SWE-bench_Verified \\
| hf_benchmarks.py leaderboard --stdin --top 5 --format ndjson

hf datasets list --search 'swe' --filter benchmark:official --limit 5 --format json \\
| jq -r '.[].id' \\
| hf_benchmarks.py leaderboard --stdin --top 5 --format table
"""
),
)
leaderboard_parser.add_argument(
"datasets",
@@ -579,8 +686,71 @@ def build_parser() -> argparse.ArgumentParser:
help="Output format (default: table).",
)

model_results_parser = subparsers.add_parser(
"model-results",
help="Fetch normalized evalResults rows for one or more models",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent(
"""
Fetch normalized evalResults rows for one or more model repos.

This command is designed to pair well with `hf models list`, where
`hf` handles discovery and this tool handles flattening / filtering
per-model benchmark results.
"""
),
epilog=textwrap.dedent(
"""
Examples:
hf_benchmarks.py model-results Qwen/Qwen3.5-9B

printf '%s\\n' Qwen/Qwen3.5-9B microsoft/Phi-3-medium-4k-instruct \\
| hf_benchmarks.py model-results --stdin --format ndjson

hf models list --search 'Phi-3' --filter eval-results --limit 5 --format json \\
| jq -r '.[].id' \\
| hf_benchmarks.py model-results --stdin --dataset openai/gsm8k --format table
"""
),
)
model_results_parser.add_argument(
"models",
nargs="*",
help="Model repo ids (<namespace>/<repo>). Can also be supplied via stdin with --stdin.",
)
model_results_parser.add_argument(
"--stdin",
action="store_true",
help="Read model ids from stdin. Accepts plain repo ids or NDJSON with model_id/id fields.",
)
model_results_parser.add_argument(
"--dataset",
action="append",
default=[],
help="Only keep eval rows whose dataset_id matches one of these values. Repeatable.",
)
model_results_parser.add_argument(
"--task-id",
action="append",
default=[],
help="Only keep eval rows whose task_id matches one of these values. Repeatable.",
)
model_results_parser.add_argument(
"--top",
type=int,
default=None,
help="Only keep the top N eval rows per model after filtering.",
)
model_results_parser.add_argument(
"--format",
choices=["table", "json", "ndjson"],
default="table",
help="Output format (default: table).",
)

parser._search_parser = search_parser
parser._leaderboard_parser = leaderboard_parser
parser._model_results_parser = model_results_parser

return parser

@@ -606,7 +776,7 @@ def run_search(args: argparse.Namespace) -> int:
def run_leaderboard(args: argparse.Namespace) -> int:
repo_ids = list(args.datasets)
if args.stdin:
repo_ids.extend(read_repo_ids_from_stdin())
repo_ids.extend(read_repo_ids_from_stdin(json_keys=["dataset_id", "id"]))

deduped: list[str] = []
seen: set[str] = set()
@@ -636,6 +806,46 @@ def run_leaderboard(args: argparse.Namespace) -> int:
return 0


def run_model_results(args: argparse.Namespace) -> int:
model_ids = list(args.models)
if args.stdin:
model_ids.extend(read_repo_ids_from_stdin(json_keys=["model_id", "id"]))

deduped: list[str] = []
seen: set[str] = set()
for model_id in model_ids:
if model_id not in seen:
deduped.append(model_id)
seen.add(model_id)
model_ids = deduped

if not model_ids:
print("Error: provide model ids or use --stdin.", file=sys.stderr)
return 2

dataset_filters = set(args.dataset or [])
task_filters = set(args.task_id or [])

rows: list[dict[str, Any]] = []
for model_id in model_ids:
model_rows = get_model_results(model_id)
if dataset_filters:
model_rows = [row for row in model_rows if row.get("dataset_id") in dataset_filters]
if task_filters:
model_rows = [row for row in model_rows if row.get("task_id") in task_filters]
if args.top is not None:
model_rows = model_rows[: args.top]
rows.extend(model_rows)

if args.format == "json":
print_json(rows)
elif args.format == "ndjson":
print_ndjson(rows)
else:
print_model_results_table(rows)
return 0


def main() -> int:
parser = build_parser()
args = parser.parse_args()
@@ -645,6 +855,8 @@ def main() -> int:
return run_search(args)
if args.command == "leaderboard":
return run_leaderboard(args)
if args.command == "model-results":
return run_model_results(args)
parser.error(f"Unknown command: {args.command}")
return 2
except HfApiError as exc: