
Commit 0d896a2

feat: add report generator and update zero-shot evaluation pipeline (#32)
* feat: add report generator and update zero-shot evaluation pipeline
  - Add report_generator.py for generating evaluation reports
  - Update schema.py with new data structures
  - Update zero_shot_pipeline.py with report generation support
  - Add example evaluation report for oncology translation
  - Update zero_shot_evaluation documentation
* chore: remove unused logger import
* fix: fix code formatting and trailing whitespace issues
* refactor(zero-shot): extract magic numbers to constants and improve type safety
* docs: move evaluation report example to docs and exclude evaluation_results from git
  - Move oncology_translation_report.md to docs/applications/examples/
  - Add reference to example report in zero_shot_evaluation.md
  - Add /evaluation_results to .gitignore
  - Fix examples/ gitignore rule to only ignore root-level examples folder
* docs: update example report to English and fix documentation
  - Regenerate evaluation report in English using config.yaml
  - Update zero_shot_evaluation.md reference text to English
  - Replace Chinese example report with English version
* docs: translate sample report structure to English
* docs: rename examples folder to sample_reports
* style(docs): remove trailing whitespace
* docs(applications): improve zero-shot evaluation doc with tabs and workflow styling
* feat(models): add base configuration classes for local model providers
* docs(zero-shot): Streamline evaluation documentation structure
  - Consolidate 'When to Use' and 'How It Works' into concise 'Overview'
  - Reorganize advanced topics into tabbed sections for better readability
  - Simplify configuration examples and reduce redundant explanations
  - Replace verbose 'Next Steps' with compact 'Related Topics' links
* fix
1 parent: 6128ca0 · commit: 0d896a2

File tree

3 files changed: +315, -302 lines

cookbooks/zero_shot_evaluation/schema.py

Lines changed: 24 additions & 1 deletion

@@ -9,7 +9,7 @@
 import os
 import re
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
 import yaml
 from loguru import logger
@@ -92,6 +92,14 @@ class OutputConfig(BaseModel):
     output_dir: str = Field(default="./evaluation_results", description="Output directory")
 
 
+class ReportConfig(BaseModel):
+    """Report generation configuration."""
+
+    enabled: bool = Field(default=False, description="Whether to generate report")
+    language: Literal["zh", "en"] = Field(default="zh", description="Report language: zh | en")
+    include_examples: int = Field(default=3, ge=1, le=10, description="Examples per section")
+
+
 class ZeroShotConfig(BaseModel):
     """Complete zero-shot evaluation configuration."""
 
@@ -101,6 +109,7 @@ class ZeroShotConfig(BaseModel):
     query_generation: QueryGenerationConfig = Field(default_factory=QueryGenerationConfig)
     evaluation: EvaluationConfig = Field(default_factory=EvaluationConfig)
     output: OutputConfig = Field(default_factory=OutputConfig)
+    report: ReportConfig = Field(default_factory=ReportConfig)
 
 
 class GeneratedQuery(BaseModel):
@@ -118,6 +127,20 @@ class QueryGenerationOutput(BaseModel):
     reason: str = Field(default="", description="Generation reasoning")
 
 
+class ComparisonDetail(BaseModel):
+    """Single pairwise comparison detail."""
+
+    query: str = Field(..., description="Original query")
+    model_a: str = Field(..., description="Model A name")
+    model_b: str = Field(..., description="Model B name")
+    response_a: str = Field(..., description="Model A response")
+    response_b: str = Field(..., description="Model B response")
+    winner: str = Field(..., description="Winner: model_a | model_b")
+    score: float = Field(..., description="Score (1.0=A wins, 0.0=B wins)")
+    reason: str = Field(default="", description="Evaluation reason")
+    order: str = Field(default="original", description="Comparison order: original | swapped")
+
+
 # =============================================================================
 # Configuration Loading
 # =============================================================================
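
Note on the schema additions: ReportConfig and ComparisonDetail are plain Pydantic models, so report settings and per-comparison records can be constructed and serialized directly. Below is a minimal sketch; the field names come from the diff above, while the concrete values are illustrative only and not taken from the commit.

from cookbooks.zero_shot_evaluation.schema import ComparisonDetail, ReportConfig

# Enable English report generation with up to 5 examples per report section.
report_cfg = ReportConfig(enabled=True, language="en", include_examples=5)

# One pairwise comparison record, in the shape persisted to comparison_details.json.
detail = ComparisonDetail(
    query="Translate the oncology abstract into English.",
    model_a="model-a",
    model_b="model-b",
    response_a="...",
    response_b="...",
    winner="model_a",  # in the pipeline, score >= 0.5 maps to a model_a win
    score=1.0,
    reason="Response A preserved the clinical terminology.",
)

print(report_cfg.model_dump())
print(detail.model_dump())  # "order" falls back to its default, "original"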

cookbooks/zero_shot_evaluation/zero_shot_pipeline.py

Lines changed: 82 additions & 5 deletions

@@ -26,6 +26,7 @@
 from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator
 from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector
 from cookbooks.zero_shot_evaluation.schema import (
+    ComparisonDetail,
     GeneratedQuery,
     OpenAIEndpoint,
     ZeroShotConfig,
@@ -36,7 +37,7 @@
 from openjudge.analyzer import PairwiseAnalysisResult, PairwiseAnalyzer
 from openjudge.generator.simple_rubric import TaskBasedRubricGenerator
 from openjudge.graders.llm_grader import GraderMode, LLMGrader
-from openjudge.graders.schema import GraderResult
+from openjudge.graders.schema import GraderError, GraderResult
 from openjudge.models.openai_chat_model import OpenAIChatModel
 from openjudge.models.schema.oai.message import ChatMessage
 from openjudge.models.schema.prompt_template import PromptTemplate
@@ -83,6 +84,7 @@ class _CheckpointManager:
     QUERIES_FILE = "queries.json"
     RESPONSES_FILE = "responses.json"
     RUBRICS_FILE = "rubrics.json"
+    DETAILS_FILE = "comparison_details.json"
 
     def __init__(self, output_dir: str):
        """Initialize checkpoint manager.
@@ -194,6 +196,23 @@ def load_rubrics(self) -> List[str]:
         logger.info(f"Loaded {len(rubrics)} rubrics from {file_path}")
         return rubrics
 
+    def save_comparison_details(self, details: List[ComparisonDetail]) -> str:
+        """Save comparison details."""
+        file_path = self.output_dir / self.DETAILS_FILE
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump([d.model_dump() for d in details], f, indent=2, ensure_ascii=False)
+        logger.info(f"Saved {len(details)} comparison details to {file_path}")
+        return str(file_path)
+
+    def load_comparison_details(self) -> List[ComparisonDetail]:
+        """Load saved comparison details."""
+        file_path = self.output_dir / self.DETAILS_FILE
+        if not file_path.exists():
+            return []
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        return [ComparisonDetail(**item) for item in data]
+
     def update_stage(
         self,
         stage: EvaluationStage,
@@ -217,6 +236,7 @@ def clear(self) -> None:
             self.QUERIES_FILE,
             self.RESPONSES_FILE,
             self.RUBRICS_FILE,
+            self.DETAILS_FILE,
         ]:
             file_path = self.output_dir / file_name
             if file_path.exists():
@@ -346,6 +366,7 @@ def __init__(
         self._queries: List[GeneratedQuery] = []
         self._responses: List[Dict[str, Any]] = []
         self._rubrics: List[str] = []
+        self._comparison_details: List[ComparisonDetail] = []
 
         # Initialize checkpoint manager
         self._checkpoint_mgr = _CheckpointManager(self.config.output.output_dir)
@@ -527,8 +548,8 @@ async def _run_pairwise_evaluation(
         self,
         dataset: List[dict],
         rubrics: List[str],
-    ) -> List[GraderResult]:
-        """Run pairwise evaluation using GradingRunner."""
+    ) -> Tuple[List[GraderResult], List[ComparisonDetail]]:
+        """Run pairwise evaluation and collect comparison details."""
         grader = self._build_pairwise_grader(rubrics)
 
         mapper = {
@@ -546,7 +567,31 @@
 
         logger.info(f"Running {len(dataset)} pairwise comparisons...")
         results = await runner.arun(dataset)
-        return results["pairwise"]
+        grader_results = results["pairwise"]
+
+        # Collect comparison details (skip GraderError results)
+        details = []
+        for sample, result in zip(dataset, grader_results):
+            if isinstance(result, GraderError):
+                continue
+            score = getattr(result, "score", None)
+            if score is None:
+                continue
+            details.append(
+                ComparisonDetail(
+                    query=sample["evaluation_data"]["instruction"],
+                    model_a=sample["metadata"]["model_a"],
+                    model_b=sample["metadata"]["model_b"],
+                    response_a=sample["evaluation_data"]["response_a"],
+                    response_b=sample["evaluation_data"]["response_b"],
+                    winner="model_a" if score >= 0.5 else "model_b",
+                    score=score,
+                    reason=getattr(result, "reason", ""),
+                    order=sample["metadata"].get("order", "original"),
+                )
+            )
+
+        return grader_results, details
 
     def _analyze_results(
         self,
@@ -635,7 +680,10 @@ async def evaluate(
         if not dataset:
             raise ValueError("No valid comparison pairs. Check if responses were collected successfully.")
 
-        grader_results = await self._run_pairwise_evaluation(dataset, self._rubrics)
+        grader_results, self._comparison_details = await self._run_pairwise_evaluation(dataset, self._rubrics)
+
+        # Save comparison details
+        self._checkpoint_mgr.save_comparison_details(self._comparison_details)
 
         # Step 5: Analyze results using OpenJudge's PairwiseAnalyzer
         logger.info("Step 5: Analyzing results...")
@@ -649,8 +697,37 @@
         )
 
         self._display_results(result)
+
+        # Step 6: Generate report if enabled
+        if self.config.report.enabled:
+            await self._generate_and_save_report(result)
+
         return result
 
+    async def _generate_and_save_report(self, result: EvaluationResult) -> None:
+        """Generate and save evaluation report."""
+        from cookbooks.zero_shot_evaluation.report_generator import ReportGenerator
+
+        logger.info("Step 6: Generating evaluation report...")
+        generator = ReportGenerator(
+            judge_endpoint=self.config.judge_endpoint,
+            language=self.config.report.language,
+            include_examples=self.config.report.include_examples,
+        )
+        report = await generator.generate(
+            task_config=self.config.task,
+            rubrics=self._rubrics,
+            result=result,
+            details=self._comparison_details,
+        )
+
+        # Save report
+        output_dir = Path(self.config.output.output_dir)
+        report_path = output_dir / "evaluation_report.md"
+        with open(report_path, "w", encoding="utf-8") as f:
+            f.write(report)
+        logger.info(f"Report saved to {report_path}")
+
     def _display_results(self, result: EvaluationResult) -> None:
         """Display evaluation results with formatted output."""
         endpoint_names = list(self.config.target_endpoints.keys())
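
To illustrate how the new checkpoint file behaves, here is a minimal round-trip sketch. It assumes _CheckpointManager is imported from the module shown above and that the output directory is created up front (the pipeline normally prepares it); the sample values are illustrative only.

from pathlib import Path

from cookbooks.zero_shot_evaluation.schema import ComparisonDetail
from cookbooks.zero_shot_evaluation.zero_shot_pipeline import _CheckpointManager

output_dir = "./evaluation_results"
Path(output_dir).mkdir(parents=True, exist_ok=True)
mgr = _CheckpointManager(output_dir)

details = [
    ComparisonDetail(
        query="Summarize the trial results.",
        model_a="model-a",
        model_b="model-b",
        response_a="...",
        response_b="...",
        winner="model_b",  # score < 0.5 means model B won this pair
        score=0.0,
        reason="Response B was more faithful to the source.",
    )
]

# Writes comparison_details.json under the output directory and returns its path...
path = mgr.save_comparison_details(details)

# ...and reads it back as ComparisonDetail instances when a run resumes.
restored = mgr.load_comparison_details()
assert restored[0].winner == "model_b"
print(f"Restored {len(restored)} comparison detail(s) from {path}")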
