diff --git a/cookbooks/zero_shot_evaluation/schema.py b/cookbooks/zero_shot_evaluation/schema.py
index 62bd9370e..6fa1be3d3 100644
--- a/cookbooks/zero_shot_evaluation/schema.py
+++ b/cookbooks/zero_shot_evaluation/schema.py
@@ -9,7 +9,7 @@
 import os
 import re
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
 import yaml
 from loguru import logger
@@ -92,6 +92,14 @@ class OutputConfig(BaseModel):
     output_dir: str = Field(default="./evaluation_results", description="Output directory")
 
 
+class ReportConfig(BaseModel):
+    """Report generation configuration."""
+
+    enabled: bool = Field(default=False, description="Whether to generate report")
+    language: Literal["zh", "en"] = Field(default="zh", description="Report language: zh | en")
+    include_examples: int = Field(default=3, ge=1, le=10, description="Examples per section")
+
+
 class ZeroShotConfig(BaseModel):
     """Complete zero-shot evaluation configuration."""
 
@@ -101,6 +109,7 @@
     query_generation: QueryGenerationConfig = Field(default_factory=QueryGenerationConfig)
     evaluation: EvaluationConfig = Field(default_factory=EvaluationConfig)
     output: OutputConfig = Field(default_factory=OutputConfig)
+    report: ReportConfig = Field(default_factory=ReportConfig)
 
 
 class GeneratedQuery(BaseModel):
@@ -118,6 +127,20 @@ class QueryGenerationOutput(BaseModel):
     reason: str = Field(default="", description="Generation reasoning")
 
 
+class ComparisonDetail(BaseModel):
+    """Single pairwise comparison detail."""
+
+    query: str = Field(..., description="Original query")
+    model_a: str = Field(..., description="Model A name")
+    model_b: str = Field(..., description="Model B name")
+    response_a: str = Field(..., description="Model A response")
+    response_b: str = Field(..., description="Model B response")
+    winner: str = Field(..., description="Winner: model_a | model_b")
+    score: float = Field(..., description="Score (1.0=A wins, 0.0=B wins)")
+    reason: str = Field(default="", description="Evaluation reason")
+    order: str = Field(default="original", description="Comparison order: original | swapped")
+
+
 # =============================================================================
 # Configuration Loading
 # =============================================================================
diff --git a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py
index 03f5e3d60..66ade84c2 100644
--- a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py
+++ b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py
@@ -26,6 +26,7 @@
 from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator
 from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector
 from cookbooks.zero_shot_evaluation.schema import (
+    ComparisonDetail,
     GeneratedQuery,
     OpenAIEndpoint,
     ZeroShotConfig,
@@ -36,7 +37,7 @@
 from openjudge.analyzer import PairwiseAnalysisResult, PairwiseAnalyzer
 from openjudge.generator.simple_rubric import TaskBasedRubricGenerator
 from openjudge.graders.llm_grader import GraderMode, LLMGrader
-from openjudge.graders.schema import GraderResult
+from openjudge.graders.schema import GraderError, GraderResult
 from openjudge.models.openai_chat_model import OpenAIChatModel
 from openjudge.models.schema.oai.message import ChatMessage
 from openjudge.models.schema.prompt_template import PromptTemplate
@@ -83,6 +84,7 @@ class _CheckpointManager:
     QUERIES_FILE = "queries.json"
     RESPONSES_FILE = "responses.json"
     RUBRICS_FILE = "rubrics.json"
+    DETAILS_FILE = "comparison_details.json"
 
     def __init__(self, output_dir: str):
         """Initialize checkpoint manager.
@@ -194,6 +196,23 @@ def load_rubrics(self) -> List[str]:
         logger.info(f"Loaded {len(rubrics)} rubrics from {file_path}")
         return rubrics
 
+    def save_comparison_details(self, details: List[ComparisonDetail]) -> str:
+        """Save comparison details."""
+        file_path = self.output_dir / self.DETAILS_FILE
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump([d.model_dump() for d in details], f, indent=2, ensure_ascii=False)
+        logger.info(f"Saved {len(details)} comparison details to {file_path}")
+        return str(file_path)
+
+    def load_comparison_details(self) -> List[ComparisonDetail]:
+        """Load saved comparison details."""
+        file_path = self.output_dir / self.DETAILS_FILE
+        if not file_path.exists():
+            return []
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        return [ComparisonDetail(**item) for item in data]
+
     def update_stage(
         self,
         stage: EvaluationStage,
@@ -217,6 +236,7 @@ def clear(self) -> None:
             self.QUERIES_FILE,
             self.RESPONSES_FILE,
             self.RUBRICS_FILE,
+            self.DETAILS_FILE,
         ]:
             file_path = self.output_dir / file_name
             if file_path.exists():
@@ -346,6 +366,7 @@ def __init__(
         self._queries: List[GeneratedQuery] = []
         self._responses: List[Dict[str, Any]] = []
         self._rubrics: List[str] = []
+        self._comparison_details: List[ComparisonDetail] = []
 
         # Initialize checkpoint manager
         self._checkpoint_mgr = _CheckpointManager(self.config.output.output_dir)
@@ -527,8 +548,8 @@ async def _run_pairwise_evaluation(
         self,
         dataset: List[dict],
         rubrics: List[str],
-    ) -> List[GraderResult]:
-        """Run pairwise evaluation using GradingRunner."""
+    ) -> Tuple[List[GraderResult], List[ComparisonDetail]]:
+        """Run pairwise evaluation and collect comparison details."""
         grader = self._build_pairwise_grader(rubrics)
 
         mapper = {
@@ -546,7 +567,31 @@ async def _run_pairwise_evaluation(
 
         logger.info(f"Running {len(dataset)} pairwise comparisons...")
         results = await runner.arun(dataset)
-        return results["pairwise"]
+        grader_results = results["pairwise"]
+
+        # Collect comparison details (skip GraderError results)
+        details = []
+        for sample, result in zip(dataset, grader_results):
+            if isinstance(result, GraderError):
+                continue
+            score = getattr(result, "score", None)
+            if score is None:
+                continue
+            details.append(
+                ComparisonDetail(
+                    query=sample["evaluation_data"]["instruction"],
+                    model_a=sample["metadata"]["model_a"],
+                    model_b=sample["metadata"]["model_b"],
+                    response_a=sample["evaluation_data"]["response_a"],
+                    response_b=sample["evaluation_data"]["response_b"],
+                    winner="model_a" if score >= 0.5 else "model_b",
+                    score=score,
+                    reason=getattr(result, "reason", ""),
+                    order=sample["metadata"].get("order", "original"),
+                )
+            )
+
+        return grader_results, details
 
     def _analyze_results(
         self,
@@ -635,7 +680,10 @@ async def evaluate(
         if not dataset:
             raise ValueError("No valid comparison pairs. Check if responses were collected successfully.")
 
-        grader_results = await self._run_pairwise_evaluation(dataset, self._rubrics)
+        grader_results, self._comparison_details = await self._run_pairwise_evaluation(dataset, self._rubrics)
+
+        # Save comparison details
+        self._checkpoint_mgr.save_comparison_details(self._comparison_details)
 
         # Step 5: Analyze results using OpenJudge's PairwiseAnalyzer
         logger.info("Step 5: Analyzing results...")
@@ -649,8 +697,37 @@ async def evaluate(
         )
 
         self._display_results(result)
+
+        # Step 6: Generate report if enabled
+        if self.config.report.enabled:
+            await self._generate_and_save_report(result)
+
         return result
 
+    async def _generate_and_save_report(self, result: EvaluationResult) -> None:
+        """Generate and save evaluation report."""
+        from cookbooks.zero_shot_evaluation.report_generator import ReportGenerator
+
+        logger.info("Step 6: Generating evaluation report...")
+        generator = ReportGenerator(
+            judge_endpoint=self.config.judge_endpoint,
+            language=self.config.report.language,
+            include_examples=self.config.report.include_examples,
+        )
+        report = await generator.generate(
+            task_config=self.config.task,
+            rubrics=self._rubrics,
+            result=result,
+            details=self._comparison_details,
+        )
+
+        # Save report
+        output_dir = Path(self.config.output.output_dir)
+        report_path = output_dir / "evaluation_report.md"
+        with open(report_path, "w", encoding="utf-8") as f:
+            f.write(report)
+        logger.info(f"Report saved to {report_path}")
+
     def _display_results(self, result: EvaluationResult) -> None:
         """Display evaluation results with formatted output."""
         endpoint_names = list(self.config.target_endpoints.keys())
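Taken together, the changes above add a persisted per-comparison record (`ComparisonDetail`, written to `comparison_details.json`) and an opt-in report stage driven by `ReportConfig`. The sketch below is illustrative only and is not part of the patch: it assumes Pydantic v2 (the checkpoint code above already relies on `model_dump()`), the default `output.output_dir` of `./evaluation_results`, and a finished run that has written the details file; all concrete values are made up.

```python
import json
from collections import Counter
from pathlib import Path

from cookbooks.zero_shot_evaluation.schema import ComparisonDetail, ReportConfig

# Report generation stays opt-in: enabled defaults to False, language to "zh",
# include_examples to 3 (bounded by ge=1, le=10).
report_cfg = ReportConfig()
assert (report_cfg.enabled, report_cfg.language, report_cfg.include_examples) == (False, "zh", 3)

# One pairwise record, shaped the way _run_pairwise_evaluation() builds it; the
# dict round trip mirrors save_comparison_details() / load_comparison_details().
detail = ComparisonDetail(
    query="Translate: AI is transforming industries.",
    model_a="model_a",
    model_b="model_b",
    response_a="...",
    response_b="...",
    winner="model_a",
    score=1.0,
    reason="Response A is more fluent.",
    order="original",
)
assert ComparisonDetail(**detail.model_dump()) == detail

# Offline tally of a saved run: "winner" is "model_a" or "model_b", so index back
# into each record to recover the endpoint name behind it.
details_path = Path("./evaluation_results") / "comparison_details.json"
records = json.loads(details_path.read_text(encoding="utf-8"))
wins = Counter(record[record["winner"]] for record in records)
for model, count in wins.most_common():
    print(f"{model}: {count}/{len(records)} comparisons won")
```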
diff --git a/docs/applications/zero_shot_evaluation.md b/docs/applications/zero_shot_evaluation.md
index c217fc7bd..4fa9f2d7a 100644
--- a/docs/applications/zero_shot_evaluation.md
+++ b/docs/applications/zero_shot_evaluation.md
@@ -3,31 +3,14 @@
 Automatically evaluate and compare multiple models or AI agents without pre-existing test data. This end-to-end pipeline generates test queries, collects responses, and ranks models through pairwise comparison.
 
-## When to Use
+## Overview
 
-Use zero-shot evaluation for:
-
-- **Model Comparison** — Compare different models on a specific task without preparing test data
-- **Agent Pipeline Testing** — Evaluate different agent configurations or workflows
-- **New Domain Evaluation** — Quickly assess model performance in new domains
-- **Rapid Prototyping** — Get quick feedback on model quality during development
-
-
-## How It Works
-
-Zero-shot evaluation automates the entire evaluation pipeline:
-
-1. **Generate Test Queries** — Create diverse, representative queries based on task description
-2. **Collect Responses** — Query all target models/agents to collect responses
-3. **Generate Rubrics** — Create evaluation criteria tailored to the task
-4. **Pairwise Comparison** — Compare all response pairs using a judge model
-5. **Rank Models** — Calculate win rates and produce final rankings
+Zero-shot evaluation is ideal for **model comparison**, **agent pipeline testing**, **new domain evaluation**, and **rapid prototyping**—all without preparing test data upfront.
 
 !!! tip "No Test Data Required"
 
     Unlike traditional evaluation, zero-shot evaluation generates its own test queries from the task description, eliminating the need for pre-existing test datasets.
 
-
-## Five-Step Pipeline
+The pipeline automates five steps: generate test queries → collect responses → create evaluation rubrics → run pairwise comparisons → produce rankings.
 
 | Step | Component | Description |
 |------|-----------|-------------|
@@ -40,73 +23,69 @@ Zero-shot evaluation automates the entire evaluation pipeline:
 
 ## Quick Start
 
-### Using Configuration File (Recommended)
+Get started with Zero-Shot Evaluation in just a few lines of code. Choose the approach that best fits your workflow:
 
-```python
-import asyncio
-from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline
-
-async def main():
-    pipeline = ZeroShotPipeline.from_config("config.yaml")
-    result = await pipeline.evaluate()
+=== "Python API"
 
-    print(f"Best Model: {result.best_pipeline}")
-    for rank, (model, win_rate) in enumerate(result.rankings, 1):
-        print(f"{rank}. {model}: {win_rate:.1%}")
+    The recommended way to run evaluations programmatically:
 
-asyncio.run(main())
-```
+    ```python
+    import asyncio
+    from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline
 
-### Using CLI
+    async def main():
+        pipeline = ZeroShotPipeline.from_config("config.yaml")
+        result = await pipeline.evaluate()
 
-```bash
-# Run evaluation with config file
-python -m cookbooks.zero_shot_evaluation --config config.yaml --save
+        print(f"Best Model: {result.best_pipeline}")
+        for rank, (model, win_rate) in enumerate(result.rankings, 1):
+            print(f"{rank}. {model}: {win_rate:.1%}")
 
-# Resume from checkpoint (default behavior)
-python -m cookbooks.zero_shot_evaluation --config config.yaml --save
+    asyncio.run(main())
+    ```
 
-# Start fresh, ignore checkpoint
-python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save
+=== "CLI"
 
-# Use pre-generated queries
-python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save
-```
+    Run evaluations directly from the command line:
 
-### Using Pre-defined Queries
+    ```bash
+    # Run evaluation with config file
+    python -m cookbooks.zero_shot_evaluation --config config.yaml --save
 
-Skip query generation by providing your own queries file. This is useful when you want to evaluate models on a specific set of questions.
+    # Resume from checkpoint (default behavior)
+    python -m cookbooks.zero_shot_evaluation --config config.yaml --save
 
-**Create a queries file** (`queries.json`):
+    # Start fresh, ignore checkpoint
+    python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save
 
-```json
-[
-    {"query": "Translate: AI is transforming industries."},
-    {"query": "Translate: The weather is nice today."},
-    {"query": "Translate: How to learn programming effectively?"}
-]
-```
+    # Use pre-generated queries
+    python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save
+    ```
 
-The `category` and `difficulty` fields are optional:
+=== "Custom Queries"
 
-```json
-[
-    {"query": "Your question here", "category": "general", "difficulty": "easy"}
-]
-```
+    Skip query generation by providing your own queries file—useful when you want to evaluate models on a specific set of questions.
 
-**Run evaluation**:
+    Create a `queries.json` file with your test cases:
 
-```bash
-python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save
-```
+    ```json
+    [
+        {"query": "Translate: AI is transforming industries."},
+        {"query": "Translate: The weather is nice today."},
+        {"query": "Translate: How to learn programming effectively?"}
+    ]
+    ```
 
-The pipeline will skip query generation and directly use your queries for model comparison.
+    !!! tip "Optional Fields"
+        The `category` and `difficulty` fields are optional: `{"query": "...", "category": "general", "difficulty": "easy"}`
 
-## Configuration
+    Then run the evaluation with your queries:
 
-Create a YAML configuration file to define your evaluation:
+    ```bash
+    python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save
+    ```
 
+All methods require a YAML configuration file. Here's a complete example:
 
 ```yaml
 # Task description
@@ -160,265 +139,205 @@ output:
 
 Use `${ENV_VAR}` syntax to reference environment variables for sensitive data like API keys.
 
-## Step-by-Step Guide
-
-For fine-grained control, use individual components directly:
-
-### Step 1: Generate Test Queries
-
-```python
-from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator
-from cookbooks.zero_shot_evaluation.schema import TaskConfig, QueryGenerationConfig, OpenAIEndpoint
-
-# Configure task and endpoint
-task = TaskConfig(
-    description="Code review assistant for Python",
-    scenario="Review code for bugs, style issues, and improvements"
-)
-
-judge_endpoint = OpenAIEndpoint(
-    base_url="https://api.openai.com/v1",
-    api_key="your-api-key",
-    model="gpt-4"
-)
-
-query_config = QueryGenerationConfig(
-    num_queries=20,
-    seed_queries=["Review this Python function for bugs..."],
-    enable_evolution=True, # Enable Evol-Instruct
-    evolution_rounds=1
-)
-
-generator = QueryGenerator(judge_endpoint, task, query_config)
-queries = await generator.generate()
-```
-
-!!! info "Query Generation Features"
-    - **Parallel Batches**: Generates queries in parallel for diversity
-    - **Deduplication**: Automatically removes duplicate/similar queries
-    - **Evol-Instruct**: Optional complexity evolution for harder queries
-    - **Category Balancing**: Balance queries across specified categories
-
-### Step 2: Collect Responses
-
-```python
-from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector
-from cookbooks.zero_shot_evaluation.schema import EvaluationConfig
+## Component Guide
 
-collector = ResponseCollector(
-    target_endpoints={
-        "model_a": endpoint_a,
-        "model_b": endpoint_b,
-    },
-    evaluation_config=EvaluationConfig(max_concurrency=10)
-)
+For fine-grained control, use individual pipeline components directly. The workflow below shows how each component connects:
 
-responses = await collector.collect(queries)
-```
-
-### Step 3: Generate Evaluation Rubrics
+