Skip to content

Commit 749c499

Browse files
authored
[feat] add skills graders (#152)
* add skills graders

* Skills graders: streamline tests and fix pre-commit hygiene
  - Refine skill grader modules (comprehensive, pairwise, relevance, safety) and exports
  - Simplify skill grader test suites with less boilerplate
  - Ignore tests/graders/skills/*.json artifacts in .gitignore
  - Run pytest in pre-commit via uv so the project environment provides pytest
  - Fix flake8 E226 spacing and long-line issues in cookbooks, experiments, UI batch panels
  Made-with: Cursor

* feat(graders): add LLMGrader system prompts to skill PromptTemplates
  Align skill graders with tool_call_accuracy: each default template now uses a system message (LLMGrader.SYSTEM_PROMPT_EN/ZH) plus the existing user rubric.
  Made-with: Cursor

* chore: align .pre-commit-config.yaml with main
  Use python -m pytest for the local pytest hook (no uv in CI). Match main hooks: drop gitleaks and detect-aws-credentials extras from the feature branch.
  Made-with: Cursor
1 parent af01a30 commit 749c499

25 files changed

+8046
-27
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ local/
3838

3939
# Test Artifacts
4040
tests/plt_*
41+
tests/graders/skills/*.json
4142

4243
# Security Files
4344
*.pem

.pre-commit-config.yaml

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,4 @@
11
repos:
2-
- repo: https://github.com/gitleaks/gitleaks
3-
rev: v8.18.4
4-
hooks:
5-
- id: gitleaks
62
- repo: https://github.com/pre-commit/pre-commit-hooks
73
rev: v6.0.0
84
hooks:
@@ -12,8 +8,6 @@ repos:
128
- id: check-toml
139
- id: check-json
1410
- id: detect-private-key
15-
- id: detect-aws-credentials
16-
args: ['--allow-missing-credentials']
1711
- id: trailing-whitespace
1812
- repo: https://github.com/PyCQA/autoflake
1913
rev: v2.2.1
@@ -111,4 +105,4 @@ repos:
111105
"--ignore=tests/generator",
112106
"--ignore=data",
113107
"--ignore=examples"
114-
]
108+
]

cookbooks/auto_arena/report_generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def _prepare_context(
6969
original_details = [d for d in details if d.order == "original"]
7070

7171
# Format rankings
72-
rankings_text = "\n".join(f"{i+1}. {name}: {rate:.1%}" for i, (name, rate) in enumerate(result.rankings))
72+
rankings_text = "\n".join(f"{i + 1}. {name}: {rate:.1%}" for i, (name, rate) in enumerate(result.rankings))
7373

7474
# Format rubrics
7575
rubrics_text = "\n".join(f"- {r}" for r in rubrics)

cookbooks/ref_hallucination_arena/collectors/bib_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ def _parse_plain_text(self, text: str) -> List[Reference]:
293293

294294
refs.append(
295295
Reference(
296-
key=f"ref_{len(refs)+1}",
296+
key=f"ref_{len(refs) + 1}",
297297
title=title.strip(),
298298
authors=authors.strip() if authors else None,
299299
year=year.strip(),

cookbooks/ref_hallucination_arena/pipeline.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,7 @@ def _verify_one(model_name: str, idx: int) -> None:
545545
total_v = sum(m.verified for m in model_results)
546546
total_r = sum(m.total_refs for m in model_results)
547547
if total_r > 0:
548-
logger.info(f" {model_name}: {total_v}/{total_r} verified " f"({total_v/total_r:.1%})")
548+
logger.info(f" {model_name}: {total_v}/{total_r} verified " f"({total_v / total_r:.1%})")
549549
else:
550550
logger.info(f" {model_name}: 0 refs")
551551

@@ -735,7 +735,7 @@ def _on_query_complete(local_idx: int, result_dict: dict) -> None:
735735
tv = sum(m.verified for m in mr)
736736
tr = sum(m.total_refs for m in mr)
737737
if tr > 0:
738-
logger.info(f" {mn}: {tv}/{tr} verified ({tv/tr:.1%})")
738+
logger.info(f" {mn}: {tv}/{tr} verified ({tv / tr:.1%})")
739739
else:
740740
logger.info(f" {mn}: 0 refs")
741741

experiments/run_grader_evaluations.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -342,16 +342,16 @@ async def run_all_evaluations(
342342
"agent": agent_model,
343343
}
344344

345-
print(f"\n{'='*70}")
345+
print(f"\n{'=' * 70}")
346346
print("OpenJudge Grader Evaluation Suite")
347-
print(f"{'='*70}")
347+
print(f"{'=' * 70}")
348348
print(f"Categories: {', '.join(categories)}")
349349
print(f"Text Model: {text_model}")
350350
print(f"Multimodal Model: {multimodal_model}")
351351
print(f"Agent Model: {agent_model}")
352352
print(f"Max Workers: {max_workers}")
353353
print(f"Total Evaluations: {len(configs_to_run)}")
354-
print(f"{'='*70}\n")
354+
print(f"{'=' * 70}\n")
355355

356356
# Create semaphore for concurrency control
357357
semaphore = asyncio.Semaphore(max_workers)

openjudge/graders/common/search_correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ async def aexecute(self, query: str, search_depth: str = "advanced", **kwargs) -
7373
for i, r in enumerate(results[:5]):
7474
content = r.get("content", "")[:1500]
7575
summary_parts.append(
76-
f"[{i+1}] {r.get('title', '')}\n" f"URL: {r.get('url', '')}\n" f"Content: {content}"
76+
f"[{i + 1}] {r.get('title', '')}\n" f"URL: {r.get('url', '')}\n" f"Content: {content}"
7777
)
7878

7979
return ToolResult(
openjudge/graders/skills/__init__.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Skill Graders
4+
5+
This module contains graders for evaluating AI Agent Skill packages:
6+
- Safety evaluation: detects dangerous operations, overly broad permissions, and missing safeguards
7+
- Relevance evaluation: measures how well a skill's capabilities address a task description
8+
- Completeness evaluation: measures whether a skill provides sufficient detail to accomplish a task
9+
- Structure evaluation: assesses structural design quality across anti-pattern quality,
10+
specification compliance, progressive disclosure, and freedom calibration
11+
- Comprehensive evaluation: holistic multi-dimensional assessment combining all four dimensions
12+
- Comprehensive pairwise evaluation: head-to-head comparison of two skill candidates
13+
"""
14+
15+
from openjudge.graders.skills.completeness import SkillCompletenessGrader
16+
from openjudge.graders.skills.comprehensive import SkillComprehensiveGrader
17+
from openjudge.graders.skills.comprehensive_pairwise import (
18+
SkillComprehensivePairwiseGrader,
19+
)
20+
from openjudge.graders.skills.relevance import SkillRelevanceGrader
21+
from openjudge.graders.skills.safety import SkillSafetyGrader
22+
from openjudge.graders.skills.structure import SkillStructureGrader
23+
24+
__all__ = [
25+
"SkillSafetyGrader",
26+
"SkillRelevanceGrader",
27+
"SkillCompletenessGrader",
28+
"SkillStructureGrader",
29+
"SkillComprehensiveGrader",
30+
"SkillComprehensivePairwiseGrader",
31+
]

0 commit comments

Comments
 (0)