diff --git a/cookbooks/zero_shot_evaluation/schema.py b/cookbooks/zero_shot_evaluation/schema.py index 62bd9370e..6fa1be3d3 100644 --- a/cookbooks/zero_shot_evaluation/schema.py +++ b/cookbooks/zero_shot_evaluation/schema.py @@ -9,7 +9,7 @@ import os import re from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union import yaml from loguru import logger @@ -92,6 +92,14 @@ class OutputConfig(BaseModel): output_dir: str = Field(default="./evaluation_results", description="Output directory") +class ReportConfig(BaseModel): + """Report generation configuration.""" + + enabled: bool = Field(default=False, description="Whether to generate report") + language: Literal["zh", "en"] = Field(default="zh", description="Report language: zh | en") + include_examples: int = Field(default=3, ge=1, le=10, description="Examples per section") + + class ZeroShotConfig(BaseModel): """Complete zero-shot evaluation configuration.""" @@ -101,6 +109,7 @@ class ZeroShotConfig(BaseModel): query_generation: QueryGenerationConfig = Field(default_factory=QueryGenerationConfig) evaluation: EvaluationConfig = Field(default_factory=EvaluationConfig) output: OutputConfig = Field(default_factory=OutputConfig) + report: ReportConfig = Field(default_factory=ReportConfig) class GeneratedQuery(BaseModel): @@ -118,6 +127,20 @@ class QueryGenerationOutput(BaseModel): reason: str = Field(default="", description="Generation reasoning") +class ComparisonDetail(BaseModel): + """Single pairwise comparison detail.""" + + query: str = Field(..., description="Original query") + model_a: str = Field(..., description="Model A name") + model_b: str = Field(..., description="Model B name") + response_a: str = Field(..., description="Model A response") + response_b: str = Field(..., description="Model B response") + winner: str = Field(..., description="Winner: model_a | model_b") + score: float = Field(..., description="Score (1.0=A wins, 0.0=B wins)") + reason: str = Field(default="", description="Evaluation reason") + order: str = Field(default="original", description="Comparison order: original | swapped") + + # ============================================================================= # Configuration Loading # ============================================================================= diff --git a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py index 03f5e3d60..66ade84c2 100644 --- a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py +++ b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py @@ -26,6 +26,7 @@ from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector from cookbooks.zero_shot_evaluation.schema import ( + ComparisonDetail, GeneratedQuery, OpenAIEndpoint, ZeroShotConfig, @@ -36,7 +37,7 @@ from openjudge.analyzer import PairwiseAnalysisResult, PairwiseAnalyzer from openjudge.generator.simple_rubric import TaskBasedRubricGenerator from openjudge.graders.llm_grader import GraderMode, LLMGrader -from openjudge.graders.schema import GraderResult +from openjudge.graders.schema import GraderError, GraderResult from openjudge.models.openai_chat_model import OpenAIChatModel from openjudge.models.schema.oai.message import ChatMessage from openjudge.models.schema.prompt_template import PromptTemplate @@ -83,6 +84,7 @@ class _CheckpointManager: QUERIES_FILE = "queries.json" RESPONSES_FILE = 
"responses.json" RUBRICS_FILE = "rubrics.json" + DETAILS_FILE = "comparison_details.json" def __init__(self, output_dir: str): """Initialize checkpoint manager. @@ -194,6 +196,23 @@ def load_rubrics(self) -> List[str]: logger.info(f"Loaded {len(rubrics)} rubrics from {file_path}") return rubrics + def save_comparison_details(self, details: List[ComparisonDetail]) -> str: + """Save comparison details.""" + file_path = self.output_dir / self.DETAILS_FILE + with open(file_path, "w", encoding="utf-8") as f: + json.dump([d.model_dump() for d in details], f, indent=2, ensure_ascii=False) + logger.info(f"Saved {len(details)} comparison details to {file_path}") + return str(file_path) + + def load_comparison_details(self) -> List[ComparisonDetail]: + """Load saved comparison details.""" + file_path = self.output_dir / self.DETAILS_FILE + if not file_path.exists(): + return [] + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + return [ComparisonDetail(**item) for item in data] + def update_stage( self, stage: EvaluationStage, @@ -217,6 +236,7 @@ def clear(self) -> None: self.QUERIES_FILE, self.RESPONSES_FILE, self.RUBRICS_FILE, + self.DETAILS_FILE, ]: file_path = self.output_dir / file_name if file_path.exists(): @@ -346,6 +366,7 @@ def __init__( self._queries: List[GeneratedQuery] = [] self._responses: List[Dict[str, Any]] = [] self._rubrics: List[str] = [] + self._comparison_details: List[ComparisonDetail] = [] # Initialize checkpoint manager self._checkpoint_mgr = _CheckpointManager(self.config.output.output_dir) @@ -527,8 +548,8 @@ async def _run_pairwise_evaluation( self, dataset: List[dict], rubrics: List[str], - ) -> List[GraderResult]: - """Run pairwise evaluation using GradingRunner.""" + ) -> Tuple[List[GraderResult], List[ComparisonDetail]]: + """Run pairwise evaluation and collect comparison details.""" grader = self._build_pairwise_grader(rubrics) mapper = { @@ -546,7 +567,31 @@ async def _run_pairwise_evaluation( logger.info(f"Running {len(dataset)} pairwise comparisons...") results = await runner.arun(dataset) - return results["pairwise"] + grader_results = results["pairwise"] + + # Collect comparison details (skip GraderError results) + details = [] + for sample, result in zip(dataset, grader_results): + if isinstance(result, GraderError): + continue + score = getattr(result, "score", None) + if score is None: + continue + details.append( + ComparisonDetail( + query=sample["evaluation_data"]["instruction"], + model_a=sample["metadata"]["model_a"], + model_b=sample["metadata"]["model_b"], + response_a=sample["evaluation_data"]["response_a"], + response_b=sample["evaluation_data"]["response_b"], + winner="model_a" if score >= 0.5 else "model_b", + score=score, + reason=getattr(result, "reason", ""), + order=sample["metadata"].get("order", "original"), + ) + ) + + return grader_results, details def _analyze_results( self, @@ -635,7 +680,10 @@ async def evaluate( if not dataset: raise ValueError("No valid comparison pairs. 
Check if responses were collected successfully.") - grader_results = await self._run_pairwise_evaluation(dataset, self._rubrics) + grader_results, self._comparison_details = await self._run_pairwise_evaluation(dataset, self._rubrics) + + # Save comparison details + self._checkpoint_mgr.save_comparison_details(self._comparison_details) # Step 5: Analyze results using OpenJudge's PairwiseAnalyzer logger.info("Step 5: Analyzing results...") @@ -649,8 +697,37 @@ async def evaluate( ) self._display_results(result) + + # Step 6: Generate report if enabled + if self.config.report.enabled: + await self._generate_and_save_report(result) + return result + async def _generate_and_save_report(self, result: EvaluationResult) -> None: + """Generate and save evaluation report.""" + from cookbooks.zero_shot_evaluation.report_generator import ReportGenerator + + logger.info("Step 6: Generating evaluation report...") + generator = ReportGenerator( + judge_endpoint=self.config.judge_endpoint, + language=self.config.report.language, + include_examples=self.config.report.include_examples, + ) + report = await generator.generate( + task_config=self.config.task, + rubrics=self._rubrics, + result=result, + details=self._comparison_details, + ) + + # Save report + output_dir = Path(self.config.output.output_dir) + report_path = output_dir / "evaluation_report.md" + with open(report_path, "w", encoding="utf-8") as f: + f.write(report) + logger.info(f"Report saved to {report_path}") + def _display_results(self, result: EvaluationResult) -> None: """Display evaluation results with formatted output.""" endpoint_names = list(self.config.target_endpoints.keys()) diff --git a/docs/applications/zero_shot_evaluation.md b/docs/applications/zero_shot_evaluation.md index c217fc7bd..4fa9f2d7a 100644 --- a/docs/applications/zero_shot_evaluation.md +++ b/docs/applications/zero_shot_evaluation.md @@ -3,31 +3,14 @@ Automatically evaluate and compare multiple models or AI agents without pre-existing test data. This end-to-end pipeline generates test queries, collects responses, and ranks models through pairwise comparison. -## When to Use +## Overview -Use zero-shot evaluation for: - -- **Model Comparison** — Compare different models on a specific task without preparing test data -- **Agent Pipeline Testing** — Evaluate different agent configurations or workflows -- **New Domain Evaluation** — Quickly assess model performance in new domains -- **Rapid Prototyping** — Get quick feedback on model quality during development - - -## How It Works - -Zero-shot evaluation automates the entire evaluation pipeline: - -1. **Generate Test Queries** — Create diverse, representative queries based on task description -2. **Collect Responses** — Query all target models/agents to collect responses -3. **Generate Rubrics** — Create evaluation criteria tailored to the task -4. **Pairwise Comparison** — Compare all response pairs using a judge model -5. **Rank Models** — Calculate win rates and produce final rankings +Zero-shot evaluation is ideal for **model comparison**, **agent pipeline testing**, **new domain evaluation**, and **rapid prototyping**—all without preparing test data upfront. !!! tip "No Test Data Required" Unlike traditional evaluation, zero-shot evaluation generates its own test queries from the task description, eliminating the need for pre-existing test datasets. 
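A minimal sketch of how the new `ComparisonDetail` records added in `schema.py` serialize, assuming hypothetical query, model names, responses, and score; the JSON shape mirrors what `_CheckpointManager.save_comparison_details` writes to `comparison_details.json`:

```python
import json

from cookbooks.zero_shot_evaluation.schema import ComparisonDetail

# Hypothetical values; in the pipeline these come from the grader results.
detail = ComparisonDetail(
    query="Translate: AI is transforming industries.",
    model_a="qwen_candidate",
    model_b="gpt4_baseline",
    response_a="AI 正在改变各行各业。",
    response_b="人工智能正在改变行业。",
    winner="model_a",  # score >= 0.5 maps to model_a, matching the pipeline's rule
    score=0.8,
    reason="Response A reads more naturally.",
    order="original",
)

# Same shape as the comparison_details.json list written by save_comparison_details().
print(json.dumps([detail.model_dump()], indent=2, ensure_ascii=False))
```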
- -## Five-Step Pipeline +The pipeline automates five steps: generate test queries → collect responses → create evaluation rubrics → run pairwise comparisons → produce rankings. | Step | Component | Description | |------|-----------|-------------| @@ -40,73 +23,69 @@ Zero-shot evaluation automates the entire evaluation pipeline: ## Quick Start -### Using Configuration File (Recommended) +Get started with Zero-Shot Evaluation in just a few lines of code. Choose the approach that best fits your workflow: -```python -import asyncio -from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline - -async def main(): - pipeline = ZeroShotPipeline.from_config("config.yaml") - result = await pipeline.evaluate() +=== "Python API" - print(f"Best Model: {result.best_pipeline}") - for rank, (model, win_rate) in enumerate(result.rankings, 1): - print(f"{rank}. {model}: {win_rate:.1%}") + The recommended way to run evaluations programmatically: -asyncio.run(main()) -``` + ```python + import asyncio + from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline -### Using CLI + async def main(): + pipeline = ZeroShotPipeline.from_config("config.yaml") + result = await pipeline.evaluate() -```bash -# Run evaluation with config file -python -m cookbooks.zero_shot_evaluation --config config.yaml --save + print(f"Best Model: {result.best_pipeline}") + for rank, (model, win_rate) in enumerate(result.rankings, 1): + print(f"{rank}. {model}: {win_rate:.1%}") -# Resume from checkpoint (default behavior) -python -m cookbooks.zero_shot_evaluation --config config.yaml --save + asyncio.run(main()) + ``` -# Start fresh, ignore checkpoint -python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save +=== "CLI" -# Use pre-generated queries -python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save -``` + Run evaluations directly from the command line: -### Using Pre-defined Queries + ```bash + # Run evaluation with config file + python -m cookbooks.zero_shot_evaluation --config config.yaml --save -Skip query generation by providing your own queries file. This is useful when you want to evaluate models on a specific set of questions. + # Resume from checkpoint (default behavior) + python -m cookbooks.zero_shot_evaluation --config config.yaml --save -**Create a queries file** (`queries.json`): + # Start fresh, ignore checkpoint + python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save -```json -[ - {"query": "Translate: AI is transforming industries."}, - {"query": "Translate: The weather is nice today."}, - {"query": "Translate: How to learn programming effectively?"} -] -``` + # Use pre-generated queries + python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save + ``` -The `category` and `difficulty` fields are optional: +=== "Custom Queries" -```json -[ - {"query": "Your question here", "category": "general", "difficulty": "easy"} -] -``` + Skip query generation by providing your own queries file—useful when you want to evaluate models on a specific set of questions. 
-**Run evaluation**: + Create a `queries.json` file with your test cases: -```bash -python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save -``` + ```json + [ + {"query": "Translate: AI is transforming industries."}, + {"query": "Translate: The weather is nice today."}, + {"query": "Translate: How to learn programming effectively?"} + ] + ``` -The pipeline will skip query generation and directly use your queries for model comparison. + !!! tip "Optional Fields" + The `category` and `difficulty` fields are optional: `{"query": "...", "category": "general", "difficulty": "easy"}` + Then run the evaluation with your queries: -## Configuration + ```bash + python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save + ``` -Create a YAML configuration file to define your evaluation: +All methods require a YAML configuration file. Here's a complete example: ```yaml # Task description @@ -160,265 +139,205 @@ output: Use `${ENV_VAR}` syntax to reference environment variables for sensitive data like API keys. -## Step-by-Step Guide - -For fine-grained control, use individual components directly: - -### Step 1: Generate Test Queries - -```python -from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator -from cookbooks.zero_shot_evaluation.schema import TaskConfig, QueryGenerationConfig, OpenAIEndpoint - -# Configure task and endpoint -task = TaskConfig( - description="Code review assistant for Python", - scenario="Review code for bugs, style issues, and improvements" -) - -judge_endpoint = OpenAIEndpoint( - base_url="https://api.openai.com/v1", - api_key="your-api-key", - model="gpt-4" -) - -query_config = QueryGenerationConfig( - num_queries=20, - seed_queries=["Review this Python function for bugs..."], - enable_evolution=True, # Enable Evol-Instruct - evolution_rounds=1 -) - -generator = QueryGenerator(judge_endpoint, task, query_config) -queries = await generator.generate() -``` - -!!! info "Query Generation Features" - - **Parallel Batches**: Generates queries in parallel for diversity - - **Deduplication**: Automatically removes duplicate/similar queries - - **Evol-Instruct**: Optional complexity evolution for harder queries - - **Category Balancing**: Balance queries across specified categories - -### Step 2: Collect Responses - -```python -from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector -from cookbooks.zero_shot_evaluation.schema import EvaluationConfig +## Component Guide -collector = ResponseCollector( - target_endpoints={ - "model_a": endpoint_a, - "model_b": endpoint_b, - }, - evaluation_config=EvaluationConfig(max_concurrency=10) -) +For fine-grained control, use individual pipeline components directly. The workflow below shows how each component connects: -responses = await collector.collect(queries) -``` - -### Step 3: Generate Evaluation Rubrics +
-
-```python
-from openjudge.generator.simple_rubric import TaskBasedRubricGenerator
-
-rubric_gen = TaskBasedRubricGenerator(
-    model=judge_model,
-    task_description=task.description,
-    scenario=task.scenario,
-)
-rubrics = await rubric_gen.generate(
-    sample_queries=[q.query for q in queries[:5]]
-)
-
-# Example output:
-# - Accuracy: Whether the response is factually correct
-# - Completeness: Whether the response fully addresses the query
-# - Clarity: Whether the response is well-organized
-```
-
-### Step 4: Run Full Evaluation
-
-```python
-from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline
+**Pipeline Components**
+
+1. **Generate Test Queries**: Use `QueryGenerator` to create diverse test queries from your task description. Supports parallel generation, automatic deduplication, and optional Evol-Instruct complexity evolution.
+2. **Collect Responses**: Use `ResponseCollector` to query all target models concurrently and gather their responses for comparison.
+3. **Generate Evaluation Rubrics**: Use `TaskBasedRubricGenerator` to automatically create evaluation criteria (accuracy, completeness, clarity, etc.) tailored to your specific task.
+4. **Run Pairwise Evaluation**: Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all response pairs and producing final rankings.
+
+
-pipeline = ZeroShotPipeline( - task_description="Code review assistant", - target_endpoints=target_endpoints, - judge_endpoint=judge_endpoint, - num_queries=20 -) +??? example "Code Examples for Each Step" -result = await pipeline.evaluate() -``` + **Step 1: Generate Test Queries** + ```python + from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator + from cookbooks.zero_shot_evaluation.schema import TaskConfig, QueryGenerationConfig, OpenAIEndpoint -## Understanding Results + task = TaskConfig( + description="Code review assistant for Python", + scenario="Review code for bugs, style issues, and improvements" + ) -The `EvaluationResult` provides comprehensive ranking statistics: + judge_endpoint = OpenAIEndpoint( + base_url="https://api.openai.com/v1", + api_key="your-api-key", + model="gpt-4" + ) -| Field | Type | Description | -|-------|------|-------------| -| `rankings` | `List[Tuple[str, float]]` | Models sorted by win rate (best first) | -| `win_rates` | `Dict[str, float]` | Win rate for each model (0.0-1.0) | -| `win_matrix` | `Dict[str, Dict[str, float]]` | Head-to-head win rates between models | -| `best_pipeline` | `str` | Model with highest win rate | -| `total_queries` | `int` | Total number of test queries | -| `total_comparisons` | `int` | Total number of pairwise comparisons | + query_config = QueryGenerationConfig( + num_queries=20, + seed_queries=["Review this Python function for bugs..."], + enable_evolution=True, + evolution_rounds=1 + ) -!!! example "Sample Output" + generator = QueryGenerator(judge_endpoint, task, query_config) + queries = await generator.generate() ``` - ============================================================ - ZERO-SHOT EVALUATION RESULTS - ============================================================ - Task: English to Chinese translation assistant... - Queries: 20 - Comparisons: 80 - - Rankings: - 1. qwen_candidate [################----] 80.0% - 2. gpt4_baseline [########------------] 40.0% - - Win Matrix (row vs column): - qwen_cand gpt4_base - qwen_candidate | -- 80.0% - gpt4_baseline | 20.0% -- - - Best Pipeline: qwen_candidate - ============================================================ - ``` - - -## Advanced Configuration - -### Query Generation Options - -| Option | Default | Description | -|--------|---------|-------------| -| `num_queries` | 20 | Total number of queries to generate | -| `queries_per_call` | 10 | Queries per API call (1-50) | -| `num_parallel_batches` | 3 | Number of parallel generation batches | -| `temperature` | 0.9 | Sampling temperature for diversity | -| `max_similarity` | 0.85 | Deduplication similarity threshold | -| `enable_evolution` | false | Enable Evol-Instruct complexity evolution | -| `evolution_rounds` | 1 | Number of evolution rounds (0-3) | - -### Evol-Instruct Evolution - -Enable complexity evolution to generate harder test queries: - -```yaml -query_generation: - enable_evolution: true - evolution_rounds: 2 - complexity_levels: - - "constraints" # Add specific constraints - - "reasoning" # Require multi-step reasoning - - "edge_cases" # Add edge cases and exceptions -``` - -!!! tip "Evolution Strategies" - - **constraints**: Add time, scope, or condition constraints - - **reasoning**: Require multi-step reasoning or comparison - - **edge_cases**: Include edge cases and unusual conditions - - -## Evaluation Report - -When enabled, the pipeline generates a comprehensive Markdown report explaining the evaluation results with concrete examples. 
The report is generated in parallel using the judge model. - -### Enabling Report Generation -```yaml -report: - enabled: true # Enable report generation - language: "zh" # Report language: "zh" (Chinese) or "en" (English) - include_examples: 3 # Number of examples per section (1-10) -``` - -### Report Sections - -The generated report includes four sections, each generated in parallel: + **Step 2: Collect Responses** -| Section | Description | -|---------|-------------| -| **Executive Summary** | Overview of evaluation purpose, methodology, and key findings | -| **Ranking Explanation** | Detailed analysis of why models are ranked in this order | -| **Model Analysis** | Per-model strengths, weaknesses, and improvement suggestions | -| **Representative Cases** | Concrete comparison examples with evaluation reasons | + ```python + from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector + from cookbooks.zero_shot_evaluation.schema import EvaluationConfig -### Report Options + collector = ResponseCollector( + target_endpoints={"model_a": endpoint_a, "model_b": endpoint_b}, + evaluation_config=EvaluationConfig(max_concurrency=10) + ) + responses = await collector.collect(queries) + ``` -| Option | Default | Description | -|--------|---------|-------------| -| `enabled` | `false` | Enable/disable report generation | -| `language` | `"zh"` | Report language: `"zh"` (Chinese) or `"en"` (English) | -| `include_examples` | `3` | Number of examples per section (1-10) | + **Step 3: Generate Evaluation Rubrics** -!!! example "Sample Report Structure" - ```markdown - # Evaluation Report + ```python + from openjudge.generator.simple_rubric import TaskBasedRubricGenerator - ## Executive Summary - This evaluation assessed the performance of mainstream LLMs on translation tasks... + rubric_gen = TaskBasedRubricGenerator( + model=judge_model, + task_description=task.description, + scenario=task.scenario, + ) + rubrics = await rubric_gen.generate(sample_queries=[q.query for q in queries[:5]]) + # Output: Accuracy, Completeness, Clarity criteria + ``` - ## Ranking Explanation - qwen-plus ranks first with a 67.9% win rate, primarily because... + **Step 4: Run Full Evaluation** - ## Model Analysis - ### qwen-plus - **Overall Assessment**: Best performer with highest win rate... - **Key Strengths**: High terminological accuracy, appropriate writing style... - **Improvement Suggestions**: Further optimize sentence variety... + ```python + from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline - ## Representative Cases - ### Case 1 - **Query:** Translate the following into English... - **Winner:** qwen-plus - **Evaluation Reason:** Response A uses more natural phrasing... + pipeline = ZeroShotPipeline( + task_description="Code review assistant", + target_endpoints=target_endpoints, + judge_endpoint=judge_endpoint, + num_queries=20 + ) + result = await pipeline.evaluate() ``` -!!! tip "Complete Example Report" - View a real evaluation report example: [Oncology Medical Translation Evaluation Report](sample_reports/oncology_translation_report.md) - This example demonstrates a complete report generated by Zero-Shot Evaluation, comparing three models (qwen-plus, qwen3-32b, qwen-turbo) on Chinese-to-English translation in the medical oncology domain. 
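A hedged sketch of driving the new `ReportGenerator` directly, based only on the call site in `zero_shot_pipeline.py` above (constructor arguments and the `generate(...)` keywords). The English-language override, the `evaluation_report_en.md` filename, and the reuse of the pipeline's internal `_rubrics` and `_comparison_details` attributes are illustration assumptions, not documented API:

```python
import asyncio
from pathlib import Path

from cookbooks.zero_shot_evaluation.report_generator import ReportGenerator
from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline


async def regenerate_report(config_path: str = "config.yaml") -> None:
    # Run the evaluation once; this populates the pipeline's internal rubrics
    # and comparison details (also persisted to comparison_details.json).
    pipeline = ZeroShotPipeline.from_config(config_path)
    result = await pipeline.evaluate()

    # Mirror _generate_and_save_report, but force an English report
    # regardless of the report.language setting in the config.
    generator = ReportGenerator(
        judge_endpoint=pipeline.config.judge_endpoint,
        language="en",
        include_examples=pipeline.config.report.include_examples,
    )
    report = await generator.generate(
        task_config=pipeline.config.task,
        rubrics=pipeline._rubrics,                    # internal state set by evaluate()
        result=result,
        details=pipeline._comparison_details,         # internal state set by evaluate()
    )

    out_path = Path(pipeline.config.output.output_dir) / "evaluation_report_en.md"
    out_path.write_text(report, encoding="utf-8")


asyncio.run(regenerate_report())
```

If `report.enabled` is true in the config, `evaluate()` already writes `evaluation_report.md`; this sketch only shows how a second report in another language could be produced without re-running the evaluation.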
+## Advanced Topics + +=== "Understanding Results" + + The `EvaluationResult` provides comprehensive ranking statistics: + + | Field | Type | Description | + |-------|------|-------------| + | `rankings` | `List[Tuple[str, float]]` | Models sorted by win rate (best first) | + | `win_rates` | `Dict[str, float]` | Win rate for each model (0.0-1.0) | + | `win_matrix` | `Dict[str, Dict[str, float]]` | Head-to-head win rates between models | + | `best_pipeline` | `str` | Model with highest win rate | + | `total_queries` | `int` | Total number of test queries | + | `total_comparisons` | `int` | Total number of pairwise comparisons | + + !!! example "Sample Output" + ``` + ============================================================ + ZERO-SHOT EVALUATION RESULTS + ============================================================ + Task: English to Chinese translation assistant... + Queries: 20 | Comparisons: 80 + + Rankings: + 1. qwen_candidate [################----] 80.0% + 2. gpt4_baseline [########------------] 40.0% + + Best Pipeline: qwen_candidate + ============================================================ + ``` + +=== "Query Generation Options" + + Fine-tune query generation behavior: + + | Option | Default | Description | + |--------|---------|-------------| + | `num_queries` | 20 | Total number of queries to generate | + | `queries_per_call` | 10 | Queries per API call (1-50) | + | `num_parallel_batches` | 3 | Number of parallel generation batches | + | `temperature` | 0.9 | Sampling temperature for diversity | + | `max_similarity` | 0.85 | Deduplication similarity threshold | + | `enable_evolution` | false | Enable Evol-Instruct complexity evolution | + | `evolution_rounds` | 1 | Number of evolution rounds (0-3) | + + ??? tip "Enable Evol-Instruct for Harder Queries" + Evol-Instruct progressively increases query complexity: + + ```yaml + query_generation: + enable_evolution: true + evolution_rounds: 2 + complexity_levels: + - "constraints" # Add time, scope, or condition constraints + - "reasoning" # Require multi-step reasoning + - "edge_cases" # Include edge cases + ``` + +=== "Evaluation Report" + + Generate a comprehensive Markdown report with concrete examples: + + ```yaml + report: + enabled: true # Enable report generation + language: "zh" # "zh" (Chinese) or "en" (English) + include_examples: 3 # Examples per section (1-10) + ``` -### Output Files + The report includes **Executive Summary**, **Ranking Explanation**, **Model Analysis**, and **Representative Cases**. -When report generation is enabled, the following files are saved: + All results are saved to the output directory: -``` -evaluation_results/ -├── evaluation_report.md # Generated Markdown report -├── comparison_details.json # All pairwise comparison details -├── evaluation_results.json # Final rankings and statistics -├── queries.json # Generated test queries -├── responses.json # Model responses -└── rubrics.json # Evaluation criteria -``` + ``` + evaluation_results/ + ├── evaluation_report.md # Generated Markdown report + ├── comparison_details.json # All pairwise comparison details + ├── evaluation_results.json # Final rankings and statistics + ├── queries.json # Generated test queries + ├── responses.json # Model responses + └── rubrics.json # Evaluation criteria + ``` + !!! 
tip "Example Report" + View a real report: [Oncology Medical Translation Evaluation](sample_reports/oncology_translation_report.md) -## Checkpoint & Resume +=== "Checkpoint & Resume" -Evaluations automatically save checkpoints, allowing resumption after interruptions: + Evaluations automatically save checkpoints for resumption after interruptions: -```bash -# First run (interrupted) -python -m cookbooks.zero_shot_evaluation --config config.yaml --save -# Progress saved at: ./evaluation_results/checkpoint.json + ```bash + # First run (interrupted) + python -m cookbooks.zero_shot_evaluation --config config.yaml --save -# Resume from checkpoint (automatic) -python -m cookbooks.zero_shot_evaluation --config config.yaml --save -# Resumes from last completed step + # Resume from checkpoint (automatic) + python -m cookbooks.zero_shot_evaluation --config config.yaml --save -# Start fresh (ignore checkpoint) -python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save -``` + # Start fresh (ignore checkpoint) + python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save + ``` -!!! info "Checkpoint Stages" - 1. `QUERIES_GENERATED` — Test queries saved - 2. `RESPONSES_COLLECTED` — All responses saved - 3. `RUBRICS_GENERATED` — Evaluation rubrics saved - 4. `EVALUATION_COMPLETE` — Final results saved + Checkpoint stages: `QUERIES_GENERATED` → `RESPONSES_COLLECTED` → `RUBRICS_GENERATED` → `EVALUATION_COMPLETE` ## Best Practices @@ -436,12 +355,6 @@ python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save - Skip checkpoint resumption for long-running evaluations - Compare models with fundamentally different capabilities (e.g., text vs vision) - -## Next Steps - -- [Pairwise Evaluation](select_rank.md) — Compare models with pre-existing test data -- [Refine Data Quality](data_refinement.md) — Use grader feedback to improve outputs -- [Create Custom Graders](../building_graders/create_custom_graders.md) — Build specialized evaluation criteria -- [Run Grading Tasks](../running_graders/run_tasks.md) — Scale evaluations with GradingRunner +**Related Topics:** [Pairwise Evaluation](select_rank.md) · [Refine Data Quality](data_refinement.md) · [Create Custom Graders](../building_graders/create_custom_graders.md) · [Run Grading Tasks](../running_graders/run_tasks.md)