Merged
Changes from all commits
19 commits
286a1e7
feat: add report generator and update zero-shot evaluation pipeline
XiaoBoAI Jan 6, 2026
a1d84d5
chore: remove unused logger import
XiaoBoAI Jan 6, 2026
7c36ce4
fix: fix code formatting and trailing whitespace
XiaoBoAI Jan 6, 2026
a0b8df9
refactor(zero-shot): extract magic numbers to constants and improve t…
XiaoBoAI Jan 7, 2026
167c611
docs: move evaluation report example to docs and exclude evaluation_r…
XiaoBoAI Jan 7, 2026
3aaeb2a
docs: update example report to English and fix documentation
XiaoBoAI Jan 7, 2026
94e48ce
docs: translate sample report structure to English
XiaoBoAI Jan 7, 2026
bd0eed4
docs: rename examples folder to sample_reports
XiaoBoAI Jan 7, 2026
d9b7ae4
docs: move evaluation report example to docs and exclude evaluation_r…
XiaoBoAI Jan 7, 2026
67698a6
docs: update example report to English and fix documentation
XiaoBoAI Jan 7, 2026
da552c9
docs: translate sample report structure to English
XiaoBoAI Jan 7, 2026
3aa080b
docs: rename examples folder to sample_reports
XiaoBoAI Jan 7, 2026
90a6e2e
style(docs): remove trailing whitespace
XiaoBoAI Jan 7, 2026
8947d56
docs(applications): improve zero-shot evaluation doc with tabs and wo…
XiaoBoAI Jan 7, 2026
0faaff8
feat(models): add base configuration classes for local model providers
XiaoBoAI Jan 7, 2026
b961fe0
docs(zero-shot): Streamline evaluation documentation structure
XiaoBoAI Jan 7, 2026
372bf85
Merge branch 'docs/sample-reports' into feature/zero-shot-evaluation
XiaoBoAI Jan 8, 2026
2989a3f
Merge branch 'main' into feature/zero-shot-evaluation
XiaoBoAI Jan 8, 2026
f53d579
fix
XiaoBoAI Jan 8, 2026
25 changes: 24 additions & 1 deletion cookbooks/zero_shot_evaluation/schema.py
@@ -9,7 +9,7 @@
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Literal, Optional, Union

import yaml
from loguru import logger
@@ -92,6 +92,14 @@ class OutputConfig(BaseModel):
output_dir: str = Field(default="./evaluation_results", description="Output directory")


class ReportConfig(BaseModel):
"""Report generation configuration."""

enabled: bool = Field(default=False, description="Whether to generate report")
language: Literal["zh", "en"] = Field(default="zh", description="Report language: zh | en")
include_examples: int = Field(default=3, ge=1, le=10, description="Examples per section")
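As a quick sanity check on the new config section, here is a minimal sketch of how ReportConfig behaves, assuming Pydantic v2 (consistent with the model_dump() calls in the pipeline below); the out-of-range values are illustrative only:

```python
# Illustrative sketch of ReportConfig validation (assumes Pydantic v2).
from pydantic import ValidationError

from cookbooks.zero_shot_evaluation.schema import ReportConfig

cfg = ReportConfig(enabled=True, language="en", include_examples=5)
print(cfg.model_dump())  # {'enabled': True, 'language': 'en', 'include_examples': 5}

# language is constrained to Literal["zh", "en"], include_examples to 1..10
for bad in ({"language": "fr"}, {"include_examples": 0}):
    try:
        ReportConfig(**bad)
    except ValidationError as exc:
        print(f"rejected {bad}: {len(exc.errors())} validation error(s)")
```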


class ZeroShotConfig(BaseModel):
"""Complete zero-shot evaluation configuration."""

@@ -101,6 +109,7 @@ class ZeroShotConfig(BaseModel):
query_generation: QueryGenerationConfig = Field(default_factory=QueryGenerationConfig)
evaluation: EvaluationConfig = Field(default_factory=EvaluationConfig)
output: OutputConfig = Field(default_factory=OutputConfig)
report: ReportConfig = Field(default_factory=ReportConfig)


class GeneratedQuery(BaseModel):
@@ -118,6 +127,20 @@ class QueryGenerationOutput(BaseModel):
reason: str = Field(default="", description="Generation reasoning")


class ComparisonDetail(BaseModel):
"""Single pairwise comparison detail."""

query: str = Field(..., description="Original query")
model_a: str = Field(..., description="Model A name")
model_b: str = Field(..., description="Model B name")
response_a: str = Field(..., description="Model A response")
response_b: str = Field(..., description="Model B response")
winner: str = Field(..., description="Winner: model_a | model_b")
score: float = Field(..., description="Score (1.0=A wins, 0.0=B wins)")
reason: str = Field(default="", description="Evaluation reason")
order: str = Field(default="original", description="Comparison order: original | swapped")
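For orientation, a hypothetical ComparisonDetail record (model names and texts are invented) showing the score-to-winner convention used by the pipeline below, where a score of 0.5 or higher means model A wins:

```python
# Hypothetical example record; field values are invented for illustration.
from cookbooks.zero_shot_evaluation.schema import ComparisonDetail

detail = ComparisonDetail(
    query="Summarize the meeting notes in three bullet points.",
    model_a="model-alpha",
    model_b="model-beta",
    response_a="- Budget approved\n- Launch moved to Q3\n- Hiring freeze lifted",
    response_b="The meeting covered budget, launch timing, and hiring plans.",
    winner="model_a",   # the pipeline maps score >= 0.5 to model_a
    score=1.0,          # 1.0 = A wins, 0.0 = B wins
    reason="Response A follows the requested bullet format.",
    order="original",   # or "swapped" when response positions were exchanged
)
print(detail.model_dump_json(indent=2))
```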


# =============================================================================
# Configuration Loading
# =============================================================================
87 changes: 82 additions & 5 deletions cookbooks/zero_shot_evaluation/zero_shot_pipeline.py
@@ -26,6 +26,7 @@
from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator
from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector
from cookbooks.zero_shot_evaluation.schema import (
ComparisonDetail,
GeneratedQuery,
OpenAIEndpoint,
ZeroShotConfig,
@@ -36,7 +37,7 @@
from openjudge.analyzer import PairwiseAnalysisResult, PairwiseAnalyzer
from openjudge.generator.simple_rubric import TaskBasedRubricGenerator
from openjudge.graders.llm_grader import GraderMode, LLMGrader
from openjudge.graders.schema import GraderResult
from openjudge.graders.schema import GraderError, GraderResult
from openjudge.models.openai_chat_model import OpenAIChatModel
from openjudge.models.schema.oai.message import ChatMessage
from openjudge.models.schema.prompt_template import PromptTemplate
@@ -83,6 +84,7 @@ class _CheckpointManager:
QUERIES_FILE = "queries.json"
RESPONSES_FILE = "responses.json"
RUBRICS_FILE = "rubrics.json"
DETAILS_FILE = "comparison_details.json"

def __init__(self, output_dir: str):
"""Initialize checkpoint manager.
@@ -194,6 +196,23 @@ def load_rubrics(self) -> List[str]:
logger.info(f"Loaded {len(rubrics)} rubrics from {file_path}")
return rubrics

def save_comparison_details(self, details: List[ComparisonDetail]) -> str:
"""Save comparison details."""
file_path = self.output_dir / self.DETAILS_FILE
with open(file_path, "w", encoding="utf-8") as f:
json.dump([d.model_dump() for d in details], f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(details)} comparison details to {file_path}")
return str(file_path)

def load_comparison_details(self) -> List[ComparisonDetail]:
"""Load saved comparison details."""
file_path = self.output_dir / self.DETAILS_FILE
if not file_path.exists():
return []
with open(file_path, "r", encoding="utf-8") as f:
data = json.load(f)
return [ComparisonDetail(**item) for item in data]

def update_stage(
self,
stage: EvaluationStage,
@@ -217,6 +236,7 @@ def clear(self) -> None:
self.QUERIES_FILE,
self.RESPONSES_FILE,
self.RUBRICS_FILE,
self.DETAILS_FILE,
]:
file_path = self.output_dir / file_name
if file_path.exists():
@@ -346,6 +366,7 @@ def __init__(
self._queries: List[GeneratedQuery] = []
self._responses: List[Dict[str, Any]] = []
self._rubrics: List[str] = []
self._comparison_details: List[ComparisonDetail] = []

# Initialize checkpoint manager
self._checkpoint_mgr = _CheckpointManager(self.config.output.output_dir)
@@ -527,8 +548,8 @@ async def _run_pairwise_evaluation(
self,
dataset: List[dict],
rubrics: List[str],
) -> List[GraderResult]:
"""Run pairwise evaluation using GradingRunner."""
) -> Tuple[List[GraderResult], List[ComparisonDetail]]:
"""Run pairwise evaluation and collect comparison details."""
grader = self._build_pairwise_grader(rubrics)

mapper = {
@@ -546,7 +567,31 @@

logger.info(f"Running {len(dataset)} pairwise comparisons...")
results = await runner.arun(dataset)
return results["pairwise"]
grader_results = results["pairwise"]

# Collect comparison details (skip GraderError results)
details = []
for sample, result in zip(dataset, grader_results):
if isinstance(result, GraderError):
continue
score = getattr(result, "score", None)
if score is None:
continue
details.append(
ComparisonDetail(
query=sample["evaluation_data"]["instruction"],
model_a=sample["metadata"]["model_a"],
model_b=sample["metadata"]["model_b"],
response_a=sample["evaluation_data"]["response_a"],
response_b=sample["evaluation_data"]["response_b"],
winner="model_a" if score >= 0.5 else "model_b",
score=score,
reason=getattr(result, "reason", ""),
order=sample["metadata"].get("order", "original"),
)
)

return grader_results, details

def _analyze_results(
self,
@@ -635,7 +680,10 @@ async def evaluate(
if not dataset:
raise ValueError("No valid comparison pairs. Check if responses were collected successfully.")

grader_results = await self._run_pairwise_evaluation(dataset, self._rubrics)
grader_results, self._comparison_details = await self._run_pairwise_evaluation(dataset, self._rubrics)

# Save comparison details
self._checkpoint_mgr.save_comparison_details(self._comparison_details)

# Step 5: Analyze results using OpenJudge's PairwiseAnalyzer
logger.info("Step 5: Analyzing results...")
@@ -649,8 +697,37 @@
)

self._display_results(result)

# Step 6: Generate report if enabled
if self.config.report.enabled:
await self._generate_and_save_report(result)

return result

async def _generate_and_save_report(self, result: EvaluationResult) -> None:
"""Generate and save evaluation report."""
from cookbooks.zero_shot_evaluation.report_generator import ReportGenerator

logger.info("Step 6: Generating evaluation report...")
generator = ReportGenerator(
judge_endpoint=self.config.judge_endpoint,
language=self.config.report.language,
include_examples=self.config.report.include_examples,
)
report = await generator.generate(
task_config=self.config.task,
rubrics=self._rubrics,
result=result,
details=self._comparison_details,
)

# Save report
output_dir = Path(self.config.output.output_dir)
report_path = output_dir / "evaluation_report.md"
with open(report_path, "w", encoding="utf-8") as f:
f.write(report)
logger.info(f"Report saved to {report_path}")

def _display_results(self, result: EvaluationResult) -> None:
"""Display evaluation results with formatted output."""
endpoint_names = list(self.config.target_endpoints.keys())
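After a run with report.enabled set to true, the output directory holds both the persisted comparison details and the Markdown report. Below is a minimal post-run inspection sketch, assuming the default output_dir of ./evaluation_results; note that _CheckpointManager is an internal helper, used here only to show what the run leaves on disk:

```python
# Post-run inspection sketch; assumes a completed evaluation with report.enabled=True
# and the default output_dir ("./evaluation_results").
from pathlib import Path

from cookbooks.zero_shot_evaluation.zero_shot_pipeline import _CheckpointManager

output_dir = "./evaluation_results"

# comparison_details.json is written by save_comparison_details() during pairwise evaluation
mgr = _CheckpointManager(output_dir)
details = mgr.load_comparison_details()
print(f"{len(details)} pairwise comparison records on disk")

# evaluation_report.md is written by _generate_and_save_report() in Step 6
report_path = Path(output_dir) / "evaluation_report.md"
if report_path.exists():
    print(report_path.read_text(encoding="utf-8")[:400])
```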