diff --git a/cookbooks/zero_shot_evaluation/schema.py b/cookbooks/zero_shot_evaluation/schema.py index 62bd9370e..6fa1be3d3 100644 --- a/cookbooks/zero_shot_evaluation/schema.py +++ b/cookbooks/zero_shot_evaluation/schema.py @@ -9,7 +9,7 @@ import os import re from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Literal, Optional, Union import yaml from loguru import logger @@ -92,6 +92,14 @@ class OutputConfig(BaseModel): output_dir: str = Field(default="./evaluation_results", description="Output directory") +class ReportConfig(BaseModel): + """Report generation configuration.""" + + enabled: bool = Field(default=False, description="Whether to generate report") + language: Literal["zh", "en"] = Field(default="zh", description="Report language: zh | en") + include_examples: int = Field(default=3, ge=1, le=10, description="Examples per section") + + class ZeroShotConfig(BaseModel): """Complete zero-shot evaluation configuration.""" @@ -101,6 +109,7 @@ class ZeroShotConfig(BaseModel): query_generation: QueryGenerationConfig = Field(default_factory=QueryGenerationConfig) evaluation: EvaluationConfig = Field(default_factory=EvaluationConfig) output: OutputConfig = Field(default_factory=OutputConfig) + report: ReportConfig = Field(default_factory=ReportConfig) class GeneratedQuery(BaseModel): @@ -118,6 +127,20 @@ class QueryGenerationOutput(BaseModel): reason: str = Field(default="", description="Generation reasoning") +class ComparisonDetail(BaseModel): + """Single pairwise comparison detail.""" + + query: str = Field(..., description="Original query") + model_a: str = Field(..., description="Model A name") + model_b: str = Field(..., description="Model B name") + response_a: str = Field(..., description="Model A response") + response_b: str = Field(..., description="Model B response") + winner: str = Field(..., description="Winner: model_a | model_b") + score: float = Field(..., description="Score (1.0=A wins, 0.0=B wins)") + reason: str = Field(default="", description="Evaluation reason") + order: str = Field(default="original", description="Comparison order: original | swapped") + + # ============================================================================= # Configuration Loading # ============================================================================= diff --git a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py index 03f5e3d60..66ade84c2 100644 --- a/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py +++ b/cookbooks/zero_shot_evaluation/zero_shot_pipeline.py @@ -26,6 +26,7 @@ from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector from cookbooks.zero_shot_evaluation.schema import ( + ComparisonDetail, GeneratedQuery, OpenAIEndpoint, ZeroShotConfig, @@ -36,7 +37,7 @@ from openjudge.analyzer import PairwiseAnalysisResult, PairwiseAnalyzer from openjudge.generator.simple_rubric import TaskBasedRubricGenerator from openjudge.graders.llm_grader import GraderMode, LLMGrader -from openjudge.graders.schema import GraderResult +from openjudge.graders.schema import GraderError, GraderResult from openjudge.models.openai_chat_model import OpenAIChatModel from openjudge.models.schema.oai.message import ChatMessage from openjudge.models.schema.prompt_template import PromptTemplate @@ -83,6 +84,7 @@ class _CheckpointManager: QUERIES_FILE = "queries.json" RESPONSES_FILE = 
"responses.json" RUBRICS_FILE = "rubrics.json" + DETAILS_FILE = "comparison_details.json" def __init__(self, output_dir: str): """Initialize checkpoint manager. @@ -194,6 +196,23 @@ def load_rubrics(self) -> List[str]: logger.info(f"Loaded {len(rubrics)} rubrics from {file_path}") return rubrics + def save_comparison_details(self, details: List[ComparisonDetail]) -> str: + """Save comparison details.""" + file_path = self.output_dir / self.DETAILS_FILE + with open(file_path, "w", encoding="utf-8") as f: + json.dump([d.model_dump() for d in details], f, indent=2, ensure_ascii=False) + logger.info(f"Saved {len(details)} comparison details to {file_path}") + return str(file_path) + + def load_comparison_details(self) -> List[ComparisonDetail]: + """Load saved comparison details.""" + file_path = self.output_dir / self.DETAILS_FILE + if not file_path.exists(): + return [] + with open(file_path, "r", encoding="utf-8") as f: + data = json.load(f) + return [ComparisonDetail(**item) for item in data] + def update_stage( self, stage: EvaluationStage, @@ -217,6 +236,7 @@ def clear(self) -> None: self.QUERIES_FILE, self.RESPONSES_FILE, self.RUBRICS_FILE, + self.DETAILS_FILE, ]: file_path = self.output_dir / file_name if file_path.exists(): @@ -346,6 +366,7 @@ def __init__( self._queries: List[GeneratedQuery] = [] self._responses: List[Dict[str, Any]] = [] self._rubrics: List[str] = [] + self._comparison_details: List[ComparisonDetail] = [] # Initialize checkpoint manager self._checkpoint_mgr = _CheckpointManager(self.config.output.output_dir) @@ -527,8 +548,8 @@ async def _run_pairwise_evaluation( self, dataset: List[dict], rubrics: List[str], - ) -> List[GraderResult]: - """Run pairwise evaluation using GradingRunner.""" + ) -> Tuple[List[GraderResult], List[ComparisonDetail]]: + """Run pairwise evaluation and collect comparison details.""" grader = self._build_pairwise_grader(rubrics) mapper = { @@ -546,7 +567,31 @@ async def _run_pairwise_evaluation( logger.info(f"Running {len(dataset)} pairwise comparisons...") results = await runner.arun(dataset) - return results["pairwise"] + grader_results = results["pairwise"] + + # Collect comparison details (skip GraderError results) + details = [] + for sample, result in zip(dataset, grader_results): + if isinstance(result, GraderError): + continue + score = getattr(result, "score", None) + if score is None: + continue + details.append( + ComparisonDetail( + query=sample["evaluation_data"]["instruction"], + model_a=sample["metadata"]["model_a"], + model_b=sample["metadata"]["model_b"], + response_a=sample["evaluation_data"]["response_a"], + response_b=sample["evaluation_data"]["response_b"], + winner="model_a" if score >= 0.5 else "model_b", + score=score, + reason=getattr(result, "reason", ""), + order=sample["metadata"].get("order", "original"), + ) + ) + + return grader_results, details def _analyze_results( self, @@ -635,7 +680,10 @@ async def evaluate( if not dataset: raise ValueError("No valid comparison pairs. 
Check if responses were collected successfully.") - grader_results = await self._run_pairwise_evaluation(dataset, self._rubrics) + grader_results, self._comparison_details = await self._run_pairwise_evaluation(dataset, self._rubrics) + + # Save comparison details + self._checkpoint_mgr.save_comparison_details(self._comparison_details) # Step 5: Analyze results using OpenJudge's PairwiseAnalyzer logger.info("Step 5: Analyzing results...") @@ -649,8 +697,37 @@ async def evaluate( ) self._display_results(result) + + # Step 6: Generate report if enabled + if self.config.report.enabled: + await self._generate_and_save_report(result) + return result + async def _generate_and_save_report(self, result: EvaluationResult) -> None: + """Generate and save evaluation report.""" + from cookbooks.zero_shot_evaluation.report_generator import ReportGenerator + + logger.info("Step 6: Generating evaluation report...") + generator = ReportGenerator( + judge_endpoint=self.config.judge_endpoint, + language=self.config.report.language, + include_examples=self.config.report.include_examples, + ) + report = await generator.generate( + task_config=self.config.task, + rubrics=self._rubrics, + result=result, + details=self._comparison_details, + ) + + # Save report + output_dir = Path(self.config.output.output_dir) + report_path = output_dir / "evaluation_report.md" + with open(report_path, "w", encoding="utf-8") as f: + f.write(report) + logger.info(f"Report saved to {report_path}") + def _display_results(self, result: EvaluationResult) -> None: """Display evaluation results with formatted output.""" endpoint_names = list(self.config.target_endpoints.keys()) diff --git a/docs/applications/zero_shot_evaluation.md b/docs/applications/zero_shot_evaluation.md index c217fc7bd..4fa9f2d7a 100644 --- a/docs/applications/zero_shot_evaluation.md +++ b/docs/applications/zero_shot_evaluation.md @@ -3,31 +3,14 @@ Automatically evaluate and compare multiple models or AI agents without pre-existing test data. This end-to-end pipeline generates test queries, collects responses, and ranks models through pairwise comparison. -## When to Use +## Overview -Use zero-shot evaluation for: - -- **Model Comparison** — Compare different models on a specific task without preparing test data -- **Agent Pipeline Testing** — Evaluate different agent configurations or workflows -- **New Domain Evaluation** — Quickly assess model performance in new domains -- **Rapid Prototyping** — Get quick feedback on model quality during development - - -## How It Works - -Zero-shot evaluation automates the entire evaluation pipeline: - -1. **Generate Test Queries** — Create diverse, representative queries based on task description -2. **Collect Responses** — Query all target models/agents to collect responses -3. **Generate Rubrics** — Create evaluation criteria tailored to the task -4. **Pairwise Comparison** — Compare all response pairs using a judge model -5. **Rank Models** — Calculate win rates and produce final rankings +Zero-shot evaluation is ideal for **model comparison**, **agent pipeline testing**, **new domain evaluation**, and **rapid prototyping**—all without preparing test data upfront. !!! tip "No Test Data Required" Unlike traditional evaluation, zero-shot evaluation generates its own test queries from the task description, eliminating the need for pre-existing test datasets. 
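A minimal sketch of how the new `ComparisonDetail` records added in `schema.py` serialize, assuming hypothetical query, model names, responses, and score; the JSON shape mirrors what `_CheckpointManager.save_comparison_details` writes to `comparison_details.json`:

```python
import json

from cookbooks.zero_shot_evaluation.schema import ComparisonDetail

# Hypothetical values; in the pipeline these come from the grader results.
detail = ComparisonDetail(
    query="Translate: AI is transforming industries.",
    model_a="qwen_candidate",
    model_b="gpt4_baseline",
    response_a="AI 正在改变各行各业。",
    response_b="人工智能正在改变行业。",
    winner="model_a",  # score >= 0.5 maps to model_a, matching the pipeline's rule
    score=0.8,
    reason="Response A reads more naturally.",
    order="original",
)

# Same shape as the comparison_details.json list written by save_comparison_details().
print(json.dumps([detail.model_dump()], indent=2, ensure_ascii=False))
```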
- -## Five-Step Pipeline +The pipeline automates five steps: generate test queries → collect responses → create evaluation rubrics → run pairwise comparisons → produce rankings. | Step | Component | Description | |------|-----------|-------------| @@ -40,73 +23,69 @@ Zero-shot evaluation automates the entire evaluation pipeline: ## Quick Start -### Using Configuration File (Recommended) +Get started with Zero-Shot Evaluation in just a few lines of code. Choose the approach that best fits your workflow: -```python -import asyncio -from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline - -async def main(): - pipeline = ZeroShotPipeline.from_config("config.yaml") - result = await pipeline.evaluate() +=== "Python API" - print(f"Best Model: {result.best_pipeline}") - for rank, (model, win_rate) in enumerate(result.rankings, 1): - print(f"{rank}. {model}: {win_rate:.1%}") + The recommended way to run evaluations programmatically: -asyncio.run(main()) -``` + ```python + import asyncio + from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline -### Using CLI + async def main(): + pipeline = ZeroShotPipeline.from_config("config.yaml") + result = await pipeline.evaluate() -```bash -# Run evaluation with config file -python -m cookbooks.zero_shot_evaluation --config config.yaml --save + print(f"Best Model: {result.best_pipeline}") + for rank, (model, win_rate) in enumerate(result.rankings, 1): + print(f"{rank}. {model}: {win_rate:.1%}") -# Resume from checkpoint (default behavior) -python -m cookbooks.zero_shot_evaluation --config config.yaml --save + asyncio.run(main()) + ``` -# Start fresh, ignore checkpoint -python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save +=== "CLI" -# Use pre-generated queries -python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save -``` + Run evaluations directly from the command line: -### Using Pre-defined Queries + ```bash + # Run evaluation with config file + python -m cookbooks.zero_shot_evaluation --config config.yaml --save -Skip query generation by providing your own queries file. This is useful when you want to evaluate models on a specific set of questions. + # Resume from checkpoint (default behavior) + python -m cookbooks.zero_shot_evaluation --config config.yaml --save -**Create a queries file** (`queries.json`): + # Start fresh, ignore checkpoint + python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save -```json -[ - {"query": "Translate: AI is transforming industries."}, - {"query": "Translate: The weather is nice today."}, - {"query": "Translate: How to learn programming effectively?"} -] -``` + # Use pre-generated queries + python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save + ``` -The `category` and `difficulty` fields are optional: +=== "Custom Queries" -```json -[ - {"query": "Your question here", "category": "general", "difficulty": "easy"} -] -``` + Skip query generation by providing your own queries file—useful when you want to evaluate models on a specific set of questions. 
-**Run evaluation**: + Create a `queries.json` file with your test cases: -```bash -python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save -``` + ```json + [ + {"query": "Translate: AI is transforming industries."}, + {"query": "Translate: The weather is nice today."}, + {"query": "Translate: How to learn programming effectively?"} + ] + ``` -The pipeline will skip query generation and directly use your queries for model comparison. + !!! tip "Optional Fields" + The `category` and `difficulty` fields are optional: `{"query": "...", "category": "general", "difficulty": "easy"}` + Then run the evaluation with your queries: -## Configuration + ```bash + python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save + ``` -Create a YAML configuration file to define your evaluation: +All methods require a YAML configuration file. Here's a complete example: ```yaml # Task description @@ -160,265 +139,205 @@ output: Use `${ENV_VAR}` syntax to reference environment variables for sensitive data like API keys. -## Step-by-Step Guide - -For fine-grained control, use individual components directly: - -### Step 1: Generate Test Queries - -```python -from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator -from cookbooks.zero_shot_evaluation.schema import TaskConfig, QueryGenerationConfig, OpenAIEndpoint - -# Configure task and endpoint -task = TaskConfig( - description="Code review assistant for Python", - scenario="Review code for bugs, style issues, and improvements" -) - -judge_endpoint = OpenAIEndpoint( - base_url="https://api.openai.com/v1", - api_key="your-api-key", - model="gpt-4" -) - -query_config = QueryGenerationConfig( - num_queries=20, - seed_queries=["Review this Python function for bugs..."], - enable_evolution=True, # Enable Evol-Instruct - evolution_rounds=1 -) - -generator = QueryGenerator(judge_endpoint, task, query_config) -queries = await generator.generate() -``` - -!!! info "Query Generation Features" - - **Parallel Batches**: Generates queries in parallel for diversity - - **Deduplication**: Automatically removes duplicate/similar queries - - **Evol-Instruct**: Optional complexity evolution for harder queries - - **Category Balancing**: Balance queries across specified categories - -### Step 2: Collect Responses - -```python -from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector -from cookbooks.zero_shot_evaluation.schema import EvaluationConfig +## Component Guide -collector = ResponseCollector( - target_endpoints={ - "model_a": endpoint_a, - "model_b": endpoint_b, - }, - evaluation_config=EvaluationConfig(max_concurrency=10) -) +For fine-grained control, use individual pipeline components directly. The workflow below shows how each component connects: -responses = await collector.collect(queries) -``` - -### Step 3: Generate Evaluation Rubrics +
-
-```python
-from openjudge.generator.simple_rubric import TaskBasedRubricGenerator
-
-rubric_gen = TaskBasedRubricGenerator(
-    model=judge_model,
-    task_description=task.description,
-    scenario=task.scenario,
-)
-rubrics = await rubric_gen.generate(
-    sample_queries=[q.query for q in queries[:5]]
-)
-
-# Example output:
-# - Accuracy: Whether the response is factually correct
-# - Completeness: Whether the response fully addresses the query
-# - Clarity: Whether the response is well-organized
-```
-
-### Step 4: Run Full Evaluation
-
-```python
-from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline
+**Pipeline Components**
+
+1. **Generate Test Queries**: Use `QueryGenerator` to create diverse test queries from your task description. Supports parallel generation, automatic deduplication, and optional Evol-Instruct complexity evolution.
+2. **Collect Responses**: Use `ResponseCollector` to query all target models concurrently and gather their responses for comparison.
+3. **Generate Evaluation Rubrics**: Use `TaskBasedRubricGenerator` to automatically create evaluation criteria (accuracy, completeness, clarity, etc.) tailored to your specific task.
+4. **Run Pairwise Evaluation**: Use `ZeroShotPipeline` to orchestrate the full evaluation, comparing all response pairs and producing final rankings.
+
+
-pipeline = ZeroShotPipeline( - task_description="Code review assistant", - target_endpoints=target_endpoints, - judge_endpoint=judge_endpoint, - num_queries=20 -) +??? example "Code Examples for Each Step" -result = await pipeline.evaluate() -``` + **Step 1: Generate Test Queries** + ```python + from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator + from cookbooks.zero_shot_evaluation.schema import TaskConfig, QueryGenerationConfig, OpenAIEndpoint -## Understanding Results + task = TaskConfig( + description="Code review assistant for Python", + scenario="Review code for bugs, style issues, and improvements" + ) -The `EvaluationResult` provides comprehensive ranking statistics: + judge_endpoint = OpenAIEndpoint( + base_url="https://api.openai.com/v1", + api_key="your-api-key", + model="gpt-4" + ) -| Field | Type | Description | -|-------|------|-------------| -| `rankings` | `List[Tuple[str, float]]` | Models sorted by win rate (best first) | -| `win_rates` | `Dict[str, float]` | Win rate for each model (0.0-1.0) | -| `win_matrix` | `Dict[str, Dict[str, float]]` | Head-to-head win rates between models | -| `best_pipeline` | `str` | Model with highest win rate | -| `total_queries` | `int` | Total number of test queries | -| `total_comparisons` | `int` | Total number of pairwise comparisons | + query_config = QueryGenerationConfig( + num_queries=20, + seed_queries=["Review this Python function for bugs..."], + enable_evolution=True, + evolution_rounds=1 + ) -!!! example "Sample Output" + generator = QueryGenerator(judge_endpoint, task, query_config) + queries = await generator.generate() ``` - ============================================================ - ZERO-SHOT EVALUATION RESULTS - ============================================================ - Task: English to Chinese translation assistant... - Queries: 20 - Comparisons: 80 - - Rankings: - 1. qwen_candidate [################----] 80.0% - 2. gpt4_baseline [########------------] 40.0% - - Win Matrix (row vs column): - qwen_cand gpt4_base - qwen_candidate | -- 80.0% - gpt4_baseline | 20.0% -- - - Best Pipeline: qwen_candidate - ============================================================ - ``` - - -## Advanced Configuration - -### Query Generation Options - -| Option | Default | Description | -|--------|---------|-------------| -| `num_queries` | 20 | Total number of queries to generate | -| `queries_per_call` | 10 | Queries per API call (1-50) | -| `num_parallel_batches` | 3 | Number of parallel generation batches | -| `temperature` | 0.9 | Sampling temperature for diversity | -| `max_similarity` | 0.85 | Deduplication similarity threshold | -| `enable_evolution` | false | Enable Evol-Instruct complexity evolution | -| `evolution_rounds` | 1 | Number of evolution rounds (0-3) | - -### Evol-Instruct Evolution - -Enable complexity evolution to generate harder test queries: - -```yaml -query_generation: - enable_evolution: true - evolution_rounds: 2 - complexity_levels: - - "constraints" # Add specific constraints - - "reasoning" # Require multi-step reasoning - - "edge_cases" # Add edge cases and exceptions -``` - -!!! tip "Evolution Strategies" - - **constraints**: Add time, scope, or condition constraints - - **reasoning**: Require multi-step reasoning or comparison - - **edge_cases**: Include edge cases and unusual conditions - - -## Evaluation Report - -When enabled, the pipeline generates a comprehensive Markdown report explaining the evaluation results with concrete examples. 
The report is generated in parallel using the judge model. - -### Enabling Report Generation -```yaml -report: - enabled: true # Enable report generation - language: "zh" # Report language: "zh" (Chinese) or "en" (English) - include_examples: 3 # Number of examples per section (1-10) -``` - -### Report Sections - -The generated report includes four sections, each generated in parallel: + **Step 2: Collect Responses** -| Section | Description | -|---------|-------------| -| **Executive Summary** | Overview of evaluation purpose, methodology, and key findings | -| **Ranking Explanation** | Detailed analysis of why models are ranked in this order | -| **Model Analysis** | Per-model strengths, weaknesses, and improvement suggestions | -| **Representative Cases** | Concrete comparison examples with evaluation reasons | + ```python + from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector + from cookbooks.zero_shot_evaluation.schema import EvaluationConfig -### Report Options + collector = ResponseCollector( + target_endpoints={"model_a": endpoint_a, "model_b": endpoint_b}, + evaluation_config=EvaluationConfig(max_concurrency=10) + ) + responses = await collector.collect(queries) + ``` -| Option | Default | Description | -|--------|---------|-------------| -| `enabled` | `false` | Enable/disable report generation | -| `language` | `"zh"` | Report language: `"zh"` (Chinese) or `"en"` (English) | -| `include_examples` | `3` | Number of examples per section (1-10) | + **Step 3: Generate Evaluation Rubrics** -!!! example "Sample Report Structure" - ```markdown - # Evaluation Report + ```python + from openjudge.generator.simple_rubric import TaskBasedRubricGenerator - ## Executive Summary - This evaluation assessed the performance of mainstream LLMs on translation tasks... + rubric_gen = TaskBasedRubricGenerator( + model=judge_model, + task_description=task.description, + scenario=task.scenario, + ) + rubrics = await rubric_gen.generate(sample_queries=[q.query for q in queries[:5]]) + # Output: Accuracy, Completeness, Clarity criteria + ``` - ## Ranking Explanation - qwen-plus ranks first with a 67.9% win rate, primarily because... + **Step 4: Run Full Evaluation** - ## Model Analysis - ### qwen-plus - **Overall Assessment**: Best performer with highest win rate... - **Key Strengths**: High terminological accuracy, appropriate writing style... - **Improvement Suggestions**: Further optimize sentence variety... + ```python + from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline - ## Representative Cases - ### Case 1 - **Query:** Translate the following into English... - **Winner:** qwen-plus - **Evaluation Reason:** Response A uses more natural phrasing... + pipeline = ZeroShotPipeline( + task_description="Code review assistant", + target_endpoints=target_endpoints, + judge_endpoint=judge_endpoint, + num_queries=20 + ) + result = await pipeline.evaluate() ``` -!!! tip "Complete Example Report" - View a real evaluation report example: [Oncology Medical Translation Evaluation Report](sample_reports/oncology_translation_report.md) - This example demonstrates a complete report generated by Zero-Shot Evaluation, comparing three models (qwen-plus, qwen3-32b, qwen-turbo) on Chinese-to-English translation in the medical oncology domain. 
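A hedged sketch of driving the new `ReportGenerator` directly, based only on the call site in `zero_shot_pipeline.py` above (constructor arguments and the `generate(...)` keywords). The English-language override, the `evaluation_report_en.md` filename, and the reuse of the pipeline's internal `_rubrics` and `_comparison_details` attributes are illustration assumptions, not documented API:

```python
import asyncio
from pathlib import Path

from cookbooks.zero_shot_evaluation.report_generator import ReportGenerator
from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline


async def regenerate_report(config_path: str = "config.yaml") -> None:
    # Run the evaluation once; this populates the pipeline's internal rubrics
    # and comparison details (also persisted to comparison_details.json).
    pipeline = ZeroShotPipeline.from_config(config_path)
    result = await pipeline.evaluate()

    # Mirror _generate_and_save_report, but force an English report
    # regardless of the report.language setting in the config.
    generator = ReportGenerator(
        judge_endpoint=pipeline.config.judge_endpoint,
        language="en",
        include_examples=pipeline.config.report.include_examples,
    )
    report = await generator.generate(
        task_config=pipeline.config.task,
        rubrics=pipeline._rubrics,                    # internal state set by evaluate()
        result=result,
        details=pipeline._comparison_details,         # internal state set by evaluate()
    )

    out_path = Path(pipeline.config.output.output_dir) / "evaluation_report_en.md"
    out_path.write_text(report, encoding="utf-8")


asyncio.run(regenerate_report())
```

If `report.enabled` is true in the config, `evaluate()` already writes `evaluation_report.md`; this sketch only shows how a second report in another language could be produced without re-running the evaluation.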
+## Advanced Topics + +=== "Understanding Results" + + The `EvaluationResult` provides comprehensive ranking statistics: + + | Field | Type | Description | + |-------|------|-------------| + | `rankings` | `List[Tuple[str, float]]` | Models sorted by win rate (best first) | + | `win_rates` | `Dict[str, float]` | Win rate for each model (0.0-1.0) | + | `win_matrix` | `Dict[str, Dict[str, float]]` | Head-to-head win rates between models | + | `best_pipeline` | `str` | Model with highest win rate | + | `total_queries` | `int` | Total number of test queries | + | `total_comparisons` | `int` | Total number of pairwise comparisons | + + !!! example "Sample Output" + ``` + ============================================================ + ZERO-SHOT EVALUATION RESULTS + ============================================================ + Task: English to Chinese translation assistant... + Queries: 20 | Comparisons: 80 + + Rankings: + 1. qwen_candidate [################----] 80.0% + 2. gpt4_baseline [########------------] 40.0% + + Best Pipeline: qwen_candidate + ============================================================ + ``` + +=== "Query Generation Options" + + Fine-tune query generation behavior: + + | Option | Default | Description | + |--------|---------|-------------| + | `num_queries` | 20 | Total number of queries to generate | + | `queries_per_call` | 10 | Queries per API call (1-50) | + | `num_parallel_batches` | 3 | Number of parallel generation batches | + | `temperature` | 0.9 | Sampling temperature for diversity | + | `max_similarity` | 0.85 | Deduplication similarity threshold | + | `enable_evolution` | false | Enable Evol-Instruct complexity evolution | + | `evolution_rounds` | 1 | Number of evolution rounds (0-3) | + + ??? tip "Enable Evol-Instruct for Harder Queries" + Evol-Instruct progressively increases query complexity: + + ```yaml + query_generation: + enable_evolution: true + evolution_rounds: 2 + complexity_levels: + - "constraints" # Add time, scope, or condition constraints + - "reasoning" # Require multi-step reasoning + - "edge_cases" # Include edge cases + ``` + +=== "Evaluation Report" + + Generate a comprehensive Markdown report with concrete examples: + + ```yaml + report: + enabled: true # Enable report generation + language: "zh" # "zh" (Chinese) or "en" (English) + include_examples: 3 # Examples per section (1-10) + ``` -### Output Files + The report includes **Executive Summary**, **Ranking Explanation**, **Model Analysis**, and **Representative Cases**. -When report generation is enabled, the following files are saved: + All results are saved to the output directory: -``` -evaluation_results/ -├── evaluation_report.md # Generated Markdown report -├── comparison_details.json # All pairwise comparison details -├── evaluation_results.json # Final rankings and statistics -├── queries.json # Generated test queries -├── responses.json # Model responses -└── rubrics.json # Evaluation criteria -``` + ``` + evaluation_results/ + ├── evaluation_report.md # Generated Markdown report + ├── comparison_details.json # All pairwise comparison details + ├── evaluation_results.json # Final rankings and statistics + ├── queries.json # Generated test queries + ├── responses.json # Model responses + └── rubrics.json # Evaluation criteria + ``` + !!! 
tip "Example Report" + View a real report: [Oncology Medical Translation Evaluation](sample_reports/oncology_translation_report.md) -## Checkpoint & Resume +=== "Checkpoint & Resume" -Evaluations automatically save checkpoints, allowing resumption after interruptions: + Evaluations automatically save checkpoints for resumption after interruptions: -```bash -# First run (interrupted) -python -m cookbooks.zero_shot_evaluation --config config.yaml --save -# Progress saved at: ./evaluation_results/checkpoint.json + ```bash + # First run (interrupted) + python -m cookbooks.zero_shot_evaluation --config config.yaml --save -# Resume from checkpoint (automatic) -python -m cookbooks.zero_shot_evaluation --config config.yaml --save -# Resumes from last completed step + # Resume from checkpoint (automatic) + python -m cookbooks.zero_shot_evaluation --config config.yaml --save -# Start fresh (ignore checkpoint) -python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save -``` + # Start fresh (ignore checkpoint) + python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save + ``` -!!! info "Checkpoint Stages" - 1. `QUERIES_GENERATED` — Test queries saved - 2. `RESPONSES_COLLECTED` — All responses saved - 3. `RUBRICS_GENERATED` — Evaluation rubrics saved - 4. `EVALUATION_COMPLETE` — Final results saved + Checkpoint stages: `QUERIES_GENERATED` → `RESPONSES_COLLECTED` → `RUBRICS_GENERATED` → `EVALUATION_COMPLETE` ## Best Practices @@ -436,12 +355,6 @@ python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save - Skip checkpoint resumption for long-running evaluations - Compare models with fundamentally different capabilities (e.g., text vs vision) - -## Next Steps - -- [Pairwise Evaluation](select_rank.md) — Compare models with pre-existing test data -- [Refine Data Quality](data_refinement.md) — Use grader feedback to improve outputs -- [Create Custom Graders](../building_graders/create_custom_graders.md) — Build specialized evaluation criteria -- [Run Grading Tasks](../running_graders/run_tasks.md) — Scale evaluations with GradingRunner +**Related Topics:** [Pairwise Evaluation](select_rank.md) · [Refine Data Quality](data_refinement.md) · [Create Custom Graders](../building_graders/create_custom_graders.md) · [Run Grading Tasks](../running_graders/run_tasks.md)