Skip to content

Commit 286a1e7

Browse files
committed
feat: add report generator and update zero-shot evaluation pipeline
- Add report_generator.py for generating evaluation reports - Update schema.py with new data structures - Update zero_shot_pipeline.py with report generation support - Add example evaluation report for oncology translation - Update zero_shot_evaluation documentation
1 parent cdd37c3 commit 286a1e7

File tree

5 files changed

+685
-7
lines changed

5 files changed

+685
-7
lines changed
Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
# -*- coding: utf-8 -*-
2+
"""Report generator for zero-shot evaluation results."""
3+
4+
import asyncio
5+
from typing import List
6+
7+
from loguru import logger
8+
9+
from cookbooks.zero_shot_evaluation.schema import (
10+
ComparisonDetail,
11+
OpenAIEndpoint,
12+
TaskConfig,
13+
)
14+
from cookbooks.zero_shot_evaluation.zero_shot_pipeline import EvaluationResult
15+
from openjudge.models.openai_chat_model import OpenAIChatModel
16+
17+
18+
class ReportGenerator:
    """Generate a markdown evaluation report via parallel LLM calls.

    Four sections (executive summary, ranking explanation, per-model
    analysis, showcase examples) are produced concurrently with
    ``asyncio.gather`` and assembled into one markdown document.
    """

    def __init__(
        self,
        judge_endpoint: OpenAIEndpoint,
        language: str = "zh",
        include_examples: int = 3,
    ):
        """Initialize the generator.

        Args:
            judge_endpoint: Endpoint configuration for the report-writing model.
            language: Report language, ``"zh"`` (default) or ``"en"``.
            include_examples: Maximum examples kept per model pair.
        """
        self.language = language
        self.include_examples = include_examples
        extra_params = judge_endpoint.extra_params or {}
        # Low default temperature keeps report wording stable across runs.
        self.model = OpenAIChatModel(
            model=judge_endpoint.model,
            api_key=judge_endpoint.api_key,
            base_url=judge_endpoint.base_url,
            temperature=extra_params.get("temperature", 0.3),
        )

    @staticmethod
    def _winner_name(detail: ComparisonDetail) -> str:
        """Map the positional verdict ("model_a"/"model_b") to the winning model's name."""
        return detail.model_a if detail.winner == "model_a" else detail.model_b

    @staticmethod
    def _loser_name(detail: ComparisonDetail) -> str:
        """Map the positional verdict to the losing model's name."""
        return detail.model_b if detail.winner == "model_a" else detail.model_a

    async def generate(
        self,
        task_config: TaskConfig,
        rubrics: List[str],
        result: EvaluationResult,
        details: List[ComparisonDetail],
    ) -> str:
        """Generate complete report with parallel section generation.

        Args:
            task_config: Task description/scenario that was evaluated.
            rubrics: Evaluation criteria shown to the judge.
            result: Aggregated outcome (rankings, win matrix, counters).
            details: Raw pairwise comparison records.

        Returns:
            The assembled markdown report as a single string.
        """
        # Prepare context once; every section generator reads from it.
        ctx = self._prepare_context(task_config, rubrics, result, details)

        # Sections are independent (read-only on ctx), so run them concurrently.
        sections = await asyncio.gather(
            self._gen_summary(ctx),
            self._gen_ranking_explanation(ctx),
            self._gen_model_analysis(ctx),
            self._gen_examples(ctx),
        )

        # Assemble report; empty sections (e.g. no examples) are dropped.
        lang_title = "评估报告" if self.language == "zh" else "Evaluation Report"
        header = f"# {lang_title}\n\n"
        return header + "\n\n---\n\n".join(s for s in sections if s)

    def _prepare_context(
        self,
        task_config: TaskConfig,
        rubrics: List[str],
        result: EvaluationResult,
        details: List[ComparisonDetail],
    ) -> dict:
        """Prepare the shared context dict consumed by all section generators."""
        # Keep only the original comparison order: "swapped" entries are the
        # position-bias re-runs of the same pairs and would double-count.
        original_details = [d for d in details if d.order == "original"]

        # Format rankings, e.g. "1. model-x: 80.0%".
        rankings_text = "\n".join(
            f"{i+1}. {name}: {rate:.1%}" for i, (name, rate) in enumerate(result.rankings)
        )

        # Format rubrics as a bullet list.
        rubrics_text = "\n".join(f"- {r}" for r in rubrics)

        # Group details by unordered model pair so every pair contributes examples.
        model_examples: dict = {}
        for d in original_details:
            key = tuple(sorted([d.model_a, d.model_b]))
            model_examples.setdefault(key, []).append(d)

        # Select representative examples, preferring the most detailed judge reasoning.
        selected_examples = []
        for pair_details in model_examples.values():
            sorted_details = sorted(pair_details, key=lambda x: len(x.reason), reverse=True)
            selected_examples.extend(sorted_details[: self.include_examples])

        return {
            "task_description": task_config.description,
            "scenario": task_config.scenario or "",
            "rubrics": rubrics_text,
            "rankings": rankings_text,
            "win_matrix": result.win_matrix,
            "total_queries": result.total_queries,
            "total_comparisons": result.total_comparisons,
            "best_model": result.best_pipeline,
            "model_names": [name for name, _ in result.rankings],
            "examples": selected_examples[: self.include_examples * 3],
            "all_details": original_details,  # Use deduplicated details
        }

    async def _call_llm(self, prompt: str) -> str:
        """Call the LLM with the given prompt and return the text content ("" on empty)."""
        lang_instruction = (
            "Output in Chinese (中文)." if self.language == "zh" else "Output in English."
        )
        messages = [
            {"role": "system", "content": f"You are an expert AI evaluation analyst. {lang_instruction}"},
            {"role": "user", "content": prompt},
        ]
        response = await self.model.achat(messages=messages)
        return response.content or ""

    async def _gen_summary(self, ctx: dict) -> str:
        """Generate the executive summary section."""
        prompt = f"""Generate a concise executive summary for an AI model evaluation.

Task: {ctx['task_description']}
Scenario: {ctx['scenario']}

Evaluation Statistics:
- Total test queries: {ctx['total_queries']}
- Total pairwise comparisons: {ctx['total_comparisons']}

Final Rankings:
{ctx['rankings']}

Best performing model: {ctx['best_model']}

Requirements:
- Write 150-200 words
- Include: evaluation purpose, methodology summary, key findings, winner
- Use professional tone"""

        content = await self._call_llm(prompt)
        title = "## 执行摘要" if self.language == "zh" else "## Executive Summary"
        return f"{title}\n\n{content}"

    async def _gen_ranking_explanation(self, ctx: dict) -> str:
        """Generate the ranking-explanation section, citing concrete comparisons."""
        best = ctx["best_model"]

        # Comparisons the top-ranked model won (up to 2 as supporting evidence).
        winning_examples = [
            d for d in ctx["all_details"] if self._winner_name(d) == best
        ][:2]

        # Comparisons the top-ranked model participated in but lost (1, for balance).
        losing_examples = [
            d for d in ctx["all_details"]
            if best in (d.model_a, d.model_b) and self._winner_name(d) != best
        ][:1]

        examples_text = ""
        for i, ex in enumerate(winning_examples + losing_examples, 1):
            actual_winner = self._winner_name(ex)
            examples_text += f"""
Example {i}:
- Query: {ex.query[:200]}...
- Winner: {actual_winner}
- Reason: {ex.reason}
"""

        prompt = f"""Explain why the models are ranked this way based on the evaluation.

Rankings:
{ctx['rankings']}

Evaluation Criteria:
{ctx['rubrics']}

Win Matrix (row beats column with this rate):
{self._format_win_matrix(ctx['win_matrix'])}

Key Examples:
{examples_text}

Requirements:
- Explain why {ctx['best_model']} ranks first
- Highlight key differences between top models
- Reference specific evidence from examples
- Be objective and balanced"""

        content = await self._call_llm(prompt)
        title = "## 排名解释" if self.language == "zh" else "## Ranking Explanation"
        return f"{title}\n\n{content}"

    async def _gen_model_analysis(self, ctx: dict) -> str:
        """Generate the per-model analysis section from win/loss tallies."""
        # Collect win/loss counts and sample reasons for each ranked model.
        model_stats = {name: {"wins": 0, "losses": 0, "reasons": []} for name in ctx["model_names"]}

        for d in ctx["all_details"]:
            winner = self._winner_name(d)
            loser = self._loser_name(d)
            if winner not in model_stats or loser not in model_stats:
                # Defensive: skip comparisons mentioning models absent from the
                # rankings — the original code would raise KeyError here.
                continue
            model_stats[winner]["wins"] += 1
            model_stats[loser]["losses"] += 1
            if d.reason:
                model_stats[winner]["reasons"].append(f"[Win] {d.reason[:150]}")
                model_stats[loser]["reasons"].append(f"[Loss] {d.reason[:150]}")

        stats_text = ""
        for name in ctx["model_names"]:
            stats = model_stats[name]
            # Hoisted join keeps the f-string free of backslash workarounds
            # (the original used chr(10) because pre-3.12 f-strings forbid "\n").
            reasons_block = "\n".join(" * " + r for r in stats["reasons"][:3])
            stats_text += f"""
Model: {name}
- Wins: {stats['wins']}, Losses: {stats['losses']}
- Sample evaluation reasons:
{reasons_block}
"""

        prompt = f"""Analyze each model's performance in this evaluation.

Task: {ctx['task_description']}

Evaluation Criteria:
{ctx['rubrics']}

Model Statistics:
{stats_text}

Requirements:
For each model, provide:
1. Overall assessment (2-3 sentences)
2. Key strengths (with evidence)
3. Key weaknesses (with evidence)
4. Improvement suggestions"""

        content = await self._call_llm(prompt)
        title = "## 模型分析" if self.language == "zh" else "## Model Analysis"
        return f"{title}\n\n{content}"

    async def _gen_examples(self, ctx: dict) -> str:
        """Generate the showcase-examples section ("" when there are no examples)."""
        examples = ctx["examples"][:5]
        if not examples:
            return ""

        examples_text = ""
        for i, ex in enumerate(examples, 1):
            # Responses are truncated to 500 chars with an ellipsis marker.
            examples_text += f"""
### Case {i}

**Query:** {ex.query}

**{ex.model_a}:**
{ex.response_a[:500]}{'...' if len(ex.response_a) > 500 else ''}

**{ex.model_b}:**
{ex.response_b[:500]}{'...' if len(ex.response_b) > 500 else ''}

**Winner:** {self._winner_name(ex)}

**Evaluation Reason:** {ex.reason}
"""

        title = "## 典型案例" if self.language == "zh" else "## Representative Cases"
        return f"{title}\n{examples_text}"

    def _format_win_matrix(self, win_matrix: dict) -> str:
        """Format the nested win-rate mapping as indented "A vs B: rate" lines."""
        lines = []
        for model_a, opponents in win_matrix.items():
            for model_b, rate in opponents.items():
                lines.append(f"  {model_a} vs {model_b}: {rate:.1%}")
        return "\n".join(lines)
276+

cookbooks/zero_shot_evaluation/schema.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,14 @@ class OutputConfig(BaseModel):
9292
output_dir: str = Field(default="./evaluation_results", description="Output directory")
9393

9494

95+
class ReportConfig(BaseModel):
    """Report generation configuration."""

    # Master switch; report generation is opt-in (disabled by default).
    enabled: bool = Field(default=False, description="Whether to generate report")
    # Report language code: "zh" (Chinese, default) or "en" (English).
    language: str = Field(default="zh", description="Report language: zh | en")
    # Number of examples per section; pydantic bounds it to 1..10 via ge/le.
    include_examples: int = Field(default=3, ge=1, le=10, description="Examples per section")
101+
102+
95103
class ZeroShotConfig(BaseModel):
96104
"""Complete zero-shot evaluation configuration."""
97105

@@ -101,6 +109,7 @@ class ZeroShotConfig(BaseModel):
101109
query_generation: QueryGenerationConfig = Field(default_factory=QueryGenerationConfig)
102110
evaluation: EvaluationConfig = Field(default_factory=EvaluationConfig)
103111
output: OutputConfig = Field(default_factory=OutputConfig)
112+
report: ReportConfig = Field(default_factory=ReportConfig)
104113

105114

106115
class GeneratedQuery(BaseModel):
@@ -118,6 +127,20 @@ class QueryGenerationOutput(BaseModel):
118127
reason: str = Field(default="", description="Generation reasoning")
119128

120129

130+
class ComparisonDetail(BaseModel):
    """Single pairwise comparison detail.

    One record per judged A-vs-B comparison of two model responses to the
    same query. All required fields have no defaults (``...``) and must be
    supplied at construction time.
    """

    query: str = Field(..., description="Original query")
    model_a: str = Field(..., description="Model A name")
    model_b: str = Field(..., description="Model B name")
    response_a: str = Field(..., description="Model A response")
    response_b: str = Field(..., description="Model B response")
    # Positional verdict ("model_a"/"model_b"); map back to a model name
    # via the model_a/model_b fields.
    winner: str = Field(..., description="Winner: model_a | model_b")
    score: float = Field(..., description="Score (1.0=A wins, 0.0=B wins)")
    reason: str = Field(default="", description="Evaluation reason")
    # "swapped" marks the position-bias re-run with A/B positions exchanged.
    order: str = Field(default="original", description="Comparison order: original | swapped")
142+
143+
121144
# =============================================================================
122145
# Configuration Loading
123146
# =============================================================================

0 commit comments

Comments
 (0)