22 changes: 10 additions & 12 deletions skills/huggingface-llm-trainer/SKILL.md
@@ -424,20 +424,18 @@ Before submitting:

**Identify models to train based on task type or benchmark results.**

Use `scripts/hf_benchmarks.py` to identify top-performing models for specific tasks. This helps the user select a model as the base for training, whilst keeping size and hardware constraints in mind.
Use `scripts/hf_benchmarks.py` to help choose a base model while keeping task fit, model size, and hardware constraints in mind.

```bash
# Get help on the benchmarks command:
uv run scripts/hf_benchmarks.py --help
```
Capabilities:
- search official benchmark datasets by free text, alias, task, and modality
- fetch normalized leaderboard rows for benchmark datasets
- fetch normalized `evalResults` rows for one or more candidate models
- work well in pipelines via stdin plus table / JSON / NDJSON output

### Example -- choosing an OCR base model
```bash
# Search for benchmarks whose name contains the text `ocr`
uv run scripts/hf_benchmarks.py search --query ocr
For command details, examples, and flags, use:

# Get the ranked leaderboard for the allenai/olmOCR-bench benchmark
uv run scripts/hf_benchmarks.py leaderboard allenai/olmOCR-bench
```bash
uv run scripts/hf_benchmarks.py --help
```
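
For orientation, a minimal sketch of how the subcommands chain together (flag names follow the script's built-in examples; the model id is a placeholder to replace with a real candidate):

```bash
# Find benchmark datasets matching a keyword, then fetch leaderboards for the top hits.
uv run scripts/hf_benchmarks.py search --query ocr --format ndjson \
  | uv run scripts/hf_benchmarks.py leaderboard --stdin --top 5 --format table

# Inspect the reported evalResults for a candidate base model.
uv run scripts/hf_benchmarks.py model-results <namespace>/<model> --format table
```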

## Cost Estimation
@@ -710,7 +708,7 @@ Add to PEP 723 header:
- `scripts/unsloth_sft_example.py` - Unsloth text LLM training template (faster, less VRAM)
- `scripts/estimate_cost.py` - Estimate time and cost (offer when appropriate)
- `scripts/convert_to_gguf.py` - Complete GGUF conversion script
- `scripts/hf_benchmarks.py` - Search for benchmark results and leaderboards by task, alias or free text.
- `scripts/hf_benchmarks.py` - Search benchmark datasets, fetch dataset leaderboards, and inspect model `evalResults`.

### External Scripts
- [Dataset Inspector](https://huggingface.co/datasets/mcp-tools/skills/raw/main/dataset_inspector.py) - Validate dataset format before training (use via `uv run` or `hf_jobs`)
218 changes: 215 additions & 3 deletions skills/huggingface-llm-trainer/scripts/hf_benchmarks.py
@@ -96,6 +96,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._search_parser: argparse.ArgumentParser | None = None
self._leaderboard_parser: argparse.ArgumentParser | None = None
self._model_results_parser: argparse.ArgumentParser | None = None

def format_help(self) -> str:
text = super().format_help()
@@ -113,6 +114,12 @@ def format_help(self) -> str:
+ textwrap.indent(self._leaderboard_parser.format_help().strip(), " ")
)

if self._model_results_parser is not None:
extra_sections.append(
"\nmodel-results command options:\n"
+ textwrap.indent(self._model_results_parser.format_help().strip(), " ")
)

if extra_sections:
text += "\n" + "\n".join(extra_sections) + "\n"
return text
@@ -401,10 +408,47 @@ def get_leaderboard(repo_id: str, task_id: str | None = None) -> list[dict[str,
return normalized


def read_repo_ids_from_stdin() -> list[str]:
def get_model_results(model_id: str) -> list[dict[str, Any]]:
namespace, repo = parse_repo_id(model_id)
data = http_get_json(
f"/api/models/{namespace}/{repo}",
params={"expand[]": "evalResults"},
)
if not isinstance(data, dict):
raise HfApiError(f"Unexpected model response for {model_id}")

eval_results = data.get("evalResults") or []
if not isinstance(eval_results, list):
raise HfApiError(f"Unexpected evalResults payload for {model_id}")

normalized: list[dict[str, Any]] = []
for row in eval_results:
payload = row.get("data") or {}
dataset = payload.get("dataset") or {}
source = payload.get("source") or {}
normalized.append(
{
"model_id": model_id,
"dataset_id": dataset.get("id"),
"task_id": dataset.get("task_id"),
"value": payload.get("value"),
"date": payload.get("date"),
"verified": row.get("verified"),
"filename": row.get("filename"),
"notes": payload.get("notes"),
"pull_request": row.get("pullRequest"),
"source_name": source.get("name"),
"source_url": source.get("url"),
}
)
return normalized
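
# A sketch of the evalResults payload shape assumed by the normalization above
# (field names mirror the .get() calls; the values shown are placeholders, and
# the live /api/models response may carry additional keys):
#
#   "evalResults": [
#     {
#       "verified": true,
#       "filename": "README.md",
#       "pullRequest": null,
#       "data": {
#         "dataset": {"id": "openai/gsm8k", "task_id": "text-generation"},
#         "value": 0.82,
#         "date": "...",
#         "notes": "...",
#         "source": {"name": "...", "url": "..."}
#       }
#     }
#   ]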


def read_repo_ids_from_stdin(*, json_keys: Iterable[str]) -> list[str]:
if sys.stdin.isatty():
return []

key_list = list(json_keys)
repo_ids: list[str] = []
for raw_line in sys.stdin:
line = raw_line.strip()
@@ -415,7 +459,12 @@ def read_repo_ids_from_stdin() -> list[str]:
obj = json.loads(line)
except json.JSONDecodeError:
continue
candidate = obj.get("dataset_id") or obj.get("id")
candidate = None
for key in key_list:
value = obj.get(key)
if isinstance(value, str) and "/" in value:
candidate = value
break
if isinstance(candidate, str) and "/" in candidate:
repo_ids.append(candidate)
continue
@@ -476,6 +525,27 @@ def print_leaderboard_table(rows: list[dict[str, Any]]) -> None:
print(" ".join(v.ljust(w) for v, w in zip(values, widths)))


def print_model_results_table(rows: list[dict[str, Any]]) -> None:
if not rows:
print("No model eval rows returned.")
return

headers = ["model_id", "dataset_id", "task_id", "value", "date", "verified"]
widths = [34, 30, 22, 10, 12, 8]
print(" ".join(h.ljust(w) for h, w in zip(headers, widths)))
print(" ".join("-" * w for w in widths))
for row in rows:
values = [
shorten(str(row.get("model_id") or ""), widths[0]),
shorten(str(row.get("dataset_id") or ""), widths[1]),
shorten(str(row.get("task_id") or ""), widths[2]),
shorten(str(row.get("value") or ""), widths[3]),
shorten(str(row.get("date") or ""), widths[4]),
str(row.get("verified")),
]
print(" ".join(v.ljust(w) for v, w in zip(values, widths)))


def build_parser() -> argparse.ArgumentParser:
parser = FullHelpArgumentParser(
prog="hf_benchmarks.py",
@@ -496,6 +566,20 @@ def build_parser() -> argparse.ArgumentParser:
3) Chain search -> leaderboard:
hf_benchmarks.py search --alias coding --format ndjson \\
| hf_benchmarks.py leaderboard --stdin --top 5 --format table

4) Fetch eval results for a list of models:
printf '%s\\n' Qwen/Qwen3.5-9B microsoft/Phi-3-medium-4k-instruct \\
| hf_benchmarks.py model-results --stdin --format ndjson

5) Use hf CLI for model discovery, then enrich with this tool:
hf models list --search 'Phi-3' --filter eval-results --limit 5 --format json \\
| jq -r '.[].id' \\
| hf_benchmarks.py model-results --stdin --format table

6) Use hf CLI for dataset discovery, then fetch leaderboards:
hf datasets list --search 'swe' --filter benchmark:official --limit 5 --format json \\
| jq -r '.[].id' \\
| hf_benchmarks.py leaderboard --stdin --top 5 --format table
"""
),
)
@@ -550,6 +634,29 @@ def build_parser() -> argparse.ArgumentParser:
leaderboard_parser = subparsers.add_parser(
"leaderboard",
help="Fetch normalized leaderboard rows for one or more benchmark datasets",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent(
"""
Fetch normalized leaderboard rows for one or more benchmark datasets.

This command is designed to pair well with `hf datasets list`, where
`hf` handles benchmark dataset discovery and this tool handles
leaderboard retrieval / flattening.
"""
),
epilog=textwrap.dedent(
"""
Examples:
hf_benchmarks.py leaderboard allenai/olmOCR-bench --top 10

printf '%s\\n' openai/gsm8k SWE-bench/SWE-bench_Verified \\
| hf_benchmarks.py leaderboard --stdin --top 5 --format ndjson

hf datasets list --search 'swe' --filter benchmark:official --limit 5 --format json \\
| jq -r '.[].id' \\
| hf_benchmarks.py leaderboard --stdin --top 5 --format table
"""
),
)
leaderboard_parser.add_argument(
"datasets",
@@ -579,8 +686,71 @@ def build_parser() -> argparse.ArgumentParser:
help="Output format (default: table).",
)

model_results_parser = subparsers.add_parser(
"model-results",
help="Fetch normalized evalResults rows for one or more models",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent(
"""
Fetch normalized evalResults rows for one or more model repos.

This command is designed to pair well with `hf models list`, where
`hf` handles discovery and this tool handles flattening / filtering
per-model benchmark results.
"""
),
epilog=textwrap.dedent(
"""
Examples:
hf_benchmarks.py model-results Qwen/Qwen3.5-9B

printf '%s\\n' Qwen/Qwen3.5-9B microsoft/Phi-3-medium-4k-instruct \\
| hf_benchmarks.py model-results --stdin --format ndjson

hf models list --search 'Phi-3' --filter eval-results --limit 5 --format json \\
| jq -r '.[].id' \\
| hf_benchmarks.py model-results --stdin --dataset openai/gsm8k --format table
"""
),
)
model_results_parser.add_argument(
"models",
nargs="*",
help="Model repo ids (<namespace>/<repo>). Can also be supplied via stdin with --stdin.",
)
model_results_parser.add_argument(
"--stdin",
action="store_true",
help="Read model ids from stdin. Accepts plain repo ids or NDJSON with model_id/id fields.",
)
model_results_parser.add_argument(
"--dataset",
action="append",
default=[],
help="Only keep eval rows whose dataset_id matches one of these values. Repeatable.",
)
model_results_parser.add_argument(
"--task-id",
action="append",
default=[],
help="Only keep eval rows whose task_id matches one of these values. Repeatable.",
)
model_results_parser.add_argument(
"--top",
type=int,
default=None,
help="Only keep the top N eval rows per model after filtering.",
)
model_results_parser.add_argument(
"--format",
choices=["table", "json", "ndjson"],
default="table",
help="Output format (default: table).",
)

parser._search_parser = search_parser
parser._leaderboard_parser = leaderboard_parser
parser._model_results_parser = model_results_parser

return parser

@@ -606,7 +776,7 @@ def run_search(args: argparse.Namespace) -> int:
def run_leaderboard(args: argparse.Namespace) -> int:
repo_ids = list(args.datasets)
if args.stdin:
repo_ids.extend(read_repo_ids_from_stdin())
repo_ids.extend(read_repo_ids_from_stdin(json_keys=["dataset_id", "id"]))

deduped: list[str] = []
seen: set[str] = set()
@@ -636,6 +806,46 @@ def run_leaderboard(args: argparse.Namespace) -> int:
return 0


def run_model_results(args: argparse.Namespace) -> int:
model_ids = list(args.models)
if args.stdin:
model_ids.extend(read_repo_ids_from_stdin(json_keys=["model_id", "id"]))

deduped: list[str] = []
seen: set[str] = set()
for model_id in model_ids:
if model_id not in seen:
deduped.append(model_id)
seen.add(model_id)
model_ids = deduped

if not model_ids:
print("Error: provide model ids or use --stdin.", file=sys.stderr)
return 2

dataset_filters = set(args.dataset or [])
task_filters = set(args.task_id or [])

rows: list[dict[str, Any]] = []
for model_id in model_ids:
model_rows = get_model_results(model_id)
if dataset_filters:
model_rows = [row for row in model_rows if row.get("dataset_id") in dataset_filters]
if task_filters:
model_rows = [row for row in model_rows if row.get("task_id") in task_filters]
if args.top is not None:
model_rows = model_rows[: args.top]
rows.extend(model_rows)

if args.format == "json":
print_json(rows)
elif args.format == "ndjson":
print_ndjson(rows)
else:
print_model_results_table(rows)
return 0


def main() -> int:
parser = build_parser()
args = parser.parse_args()
@@ -645,6 +855,8 @@ def main() -> int:
return run_search(args)
if args.command == "leaderboard":
return run_leaderboard(args)
if args.command == "model-results":
return run_model_results(args)
parser.error(f"Unknown command: {args.command}")
return 2
except HfApiError as exc: