5 changes: 4 additions & 1 deletion .gitignore
@@ -51,4 +51,7 @@ credentials.json

 # Temporary and Backup Files
 _del
-examples/
+/examples/
+
+# Evaluation Results
+/evaluation_results
241 changes: 241 additions & 0 deletions cookbooks/zero_shot_evaluation/examples/config.yaml
@@ -0,0 +1,241 @@
# =============================================================================
# Zero-Shot Evaluation Configuration
# =============================================================================
# This configuration file defines all settings for the zero-shot evaluation
# pipeline, including task definition, query generation, target endpoints,
# judge endpoint, evaluation parameters, and output settings.
#
# Environment variables can be referenced using ${VAR_NAME} syntax.
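# For example, api_key: "${OPENAI_API_KEY}" reads the OPENAI_API_KEY
# environment variable instead of hard-coding the key in this file.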
# =============================================================================

# =============================================================================
# Task Configuration
# =============================================================================
# Defines the task that the target models/agents will be evaluated on.

task:
# [Required] A clear description of what the task is about.
# This helps the query generator create relevant test queries.
description: "English to Chinese translation assistant, helping users translate various types of English content into fluent and accurate Chinese"

# [Optional] The usage scenario or context for this task.
# Provides additional context for query generation.
scenario: "Users need to translate English articles, documents, or text into Chinese"

# =============================================================================
# Query Generation Configuration
# =============================================================================
# Settings for automatic test query generation.

query_generation:
# ---------------------------------------------------------------------------
# Basic Settings
# ---------------------------------------------------------------------------

# [Optional, default=20] Total number of queries to generate.
num_queries: 20

# [Optional] Seed queries to guide the generation style and format.
# These examples help the generator understand what kind of queries to create.
seed_queries:
- "Please translate the following paragraph into Chinese: 'The rapid advancement of artificial intelligence has transformed numerous industries.'"
- "Translate this sentence to Chinese: 'Climate change poses significant challenges to global food security.'"

# [Optional] Query categories with weights for stratified generation.
# Each category can have a name, description, and weight.
# If not specified, queries are generated without category constraints.
# categories:
# - name: "technical"
# description: "Technical documents and papers"
# weight: 0.3
# - name: "literary"
# description: "Literary and creative content"
# weight: 0.3
# - name: "business"
# description: "Business and formal documents"
# weight: 0.4

# ---------------------------------------------------------------------------
# Custom Endpoint (Optional)
# ---------------------------------------------------------------------------
# If not specified, uses judge_endpoint for query generation.

# endpoint:
# base_url: "https://api.openai.com/v1"
# api_key: "${OPENAI_API_KEY}"
# model: "gpt-4o"
# system_prompt: null # Optional system prompt for query generation
# extra_params: # Optional extra parameters
# temperature: 0.9

# ---------------------------------------------------------------------------
# Generation Control
# ---------------------------------------------------------------------------

# [Optional, default=10, range=1-50] Number of queries generated per API call.
# Higher values are more efficient but may reduce diversity.
queries_per_call: 10

# [Optional, default=3, min=1] Number of parallel batches for generation.
# Increases throughput but uses more API quota concurrently.
num_parallel_batches: 3

# [Optional, default=0.9, range=0.0-2.0] Sampling temperature.
# Higher values increase diversity but may reduce quality.
temperature: 0.9

# [Optional, default=0.95, range=0.0-1.0] Top-p (nucleus) sampling.
# Controls the cumulative probability threshold for token selection.
top_p: 0.95

# ---------------------------------------------------------------------------
# Deduplication
# ---------------------------------------------------------------------------

# [Optional, default=0.85, range=0.0-1.0] Maximum similarity threshold.
# Queries with similarity above this threshold are considered duplicates.
# Lower values enforce stricter deduplication.
max_similarity: 0.85
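
# For example, with max_similarity: 0.85, two generated queries that differ
# only in the sentence to be translated would typically score above the
# threshold and be collapsed into one.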

# ---------------------------------------------------------------------------
# Evol-Instruct Complexity Evolution
# ---------------------------------------------------------------------------
# Evol-Instruct progressively increases query complexity through
# multiple evolution rounds.

# [Optional, default=false] Enable complexity evolution.
enable_evolution: false

# [Optional, default=1, range=0-3] Number of evolution rounds.
# Each round increases the complexity of queries.
evolution_rounds: 1

# [Optional] Complexity evolution strategies to apply.
# Available strategies:
# - "constraints": Add constraints and requirements
# - "reasoning": Require multi-step reasoning
# - "edge_cases": Include edge cases and corner scenarios
# - "specificity": Make queries more specific and detailed
# - "multi_step": Require multiple steps to complete
complexity_levels:
- "constraints"
- "reasoning"
- "edge_cases"

# =============================================================================
# Target Endpoints
# =============================================================================
# Define the models or agents to be evaluated. Each endpoint is identified
# by a unique name and configured with connection details.

target_endpoints:
# Example: GPT-4 as baseline
gpt4_baseline:
# [Required] API base URL (OpenAI-compatible format)
base_url: "https://api.openai.com/v1"

# [Required] API key (supports ${ENV_VAR} format for security)
api_key: "${OPENAI_API_KEY}"

# [Required] Model name/identifier
model: "gpt-4"

# [Optional] System prompt to set the model's behavior
system_prompt: "You are a professional English-Chinese translator. Provide accurate and fluent translations."

# [Optional] Extra parameters passed to the API request
extra_params:
temperature: 0.7
max_tokens: 2048

# Example: Qwen model as candidate
qwen_candidate:
base_url: "https://dashscope.aliyuncs.com/compatible-mode/v1"
api_key: "${DASHSCOPE_API_KEY}"
model: "qwen-max"
system_prompt: "You are a professional English-Chinese translator. Provide accurate and fluent translations."
extra_params:
temperature: 0.7
max_tokens: 2048

# =============================================================================
# Judge Endpoint
# =============================================================================
# The judge model evaluates and compares responses from target endpoints.
# It should be a capable model that can assess quality objectively.

judge_endpoint:
# [Required] API base URL
base_url: "https://dashscope.aliyuncs.com/compatible-mode/v1"

# [Required] API key
api_key: "${DASHSCOPE_API_KEY}"

# [Required] Model name (a strong model is recommended for judging)
model: "qwen-max"

# [Optional] System prompt for the judge
# If not specified, a default judging prompt will be used.
system_prompt: null

# [Optional] Extra parameters for the judge model
# Lower temperature is recommended for more consistent judgments.
extra_params:
temperature: 0.1
max_tokens: 4096

# =============================================================================
# Evaluation Configuration
# =============================================================================
# Settings that control the evaluation process.

evaluation:
# [Optional, default=10] Maximum number of concurrent API requests.
# Higher values increase throughput but may hit rate limits.
max_concurrency: 10

# [Optional, default=60] Request timeout in seconds.
# Increase for complex tasks or slow endpoints.
timeout: 60

# [Optional, default=3] Number of retry attempts for failed requests.
retry_times: 3

# =============================================================================
# Output Configuration
# =============================================================================
# Settings for saving evaluation results and intermediate data.

output:
# [Optional, default=true] Save generated queries to a JSON file.
save_queries: true

# [Optional, default=true] Save all model responses to a JSON file.
save_responses: true

# [Optional, default=true] Save detailed evaluation results including
# individual judgments and scores.
save_details: true

# [Optional, default="./evaluation_results"] Directory for output files.
# Supports relative and absolute paths.
output_dir: "./evaluation_results"

# =============================================================================
# Report Configuration
# =============================================================================
# Settings for generating evaluation reports. When enabled, a comprehensive
# Markdown report explaining the rankings with concrete examples is generated.

report:
# [Optional, default=false] Enable report generation.
# When true, generates a detailed Markdown report after evaluation.
enabled: true

# [Optional, default="zh"] Report language.
# Supported values: "zh" (Chinese), "en" (English)
language: "zh"

# [Optional, default=3, range=1-10] Number of examples per section.
# Controls how many concrete examples are included in the report.
include_examples: 3
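
As a reference for the `${VAR_NAME}` syntax noted in the config header above, here is a minimal sketch of how such references could be expanded when the file is loaded. The regex-based `expand_env` helper is illustrative only, not the cookbook's actual loader.

```python
import os
import re
import yaml  # requires PyYAML

_ENV_REF = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}")

def expand_env(node):
    """Recursively replace ${VAR_NAME} references with environment values,
    leaving unresolved references untouched."""
    if isinstance(node, str):
        return _ENV_REF.sub(lambda m: os.environ.get(m.group(1), m.group(0)), node)
    if isinstance(node, dict):
        return {k: expand_env(v) for k, v in node.items()}
    if isinstance(node, list):
        return [expand_env(v) for v in node]
    return node

with open("cookbooks/zero_shot_evaluation/examples/config.yaml") as f:
    config = expand_env(yaml.safe_load(f))
```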
35 changes: 35 additions & 0 deletions cookbooks/zero_shot_evaluation/examples/minimal_config.yaml
@@ -0,0 +1,35 @@
# =============================================================================
# Minimal Configuration Example
# =============================================================================
# This is the minimum required configuration for zero-shot evaluation.
# Only required fields are specified; all other settings use defaults.
# =============================================================================

# Task description (required)
task:
description: "Academic GPT assistant for research and writing tasks"

# Target endpoints to evaluate (required, at least one)
target_endpoints:
model_v1:
base_url: "https://api.openai.com/v1"
api_key: "${OPENAI_API_KEY}"
model: "gpt-4"

model_v2:
base_url: "https://api.openai.com/v1"
api_key: "${OPENAI_API_KEY}"
model: "gpt-3.5-turbo"

# Judge endpoint for evaluation (required)
judge_endpoint:
base_url: "https://api.openai.com/v1"
api_key: "${OPENAI_API_KEY}"
model: "gpt-4"

# All other settings use defaults:
# - query_generation.num_queries: 20
# - query_generation.temperature: 0.9
# - evaluation.max_concurrency: 10
# - evaluation.timeout: 60
# - output.output_dir: "./evaluation_results"
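
To make the "all other settings use defaults" comment concrete, here is a sketch of merging the documented defaults into a minimal config at load time. `DEFAULTS` lists only the values named in the comments above, and `merge_defaults` is an illustrative helper, not the cookbook's actual API.

```python
import yaml  # requires PyYAML

# Defaults documented in the comments above; the full pipeline may define more.
DEFAULTS = {
    "query_generation": {"num_queries": 20, "temperature": 0.9},
    "evaluation": {"max_concurrency": 10, "timeout": 60},
    "output": {"output_dir": "./evaluation_results"},
}

def merge_defaults(user_cfg, defaults):
    """Fill in missing keys from the defaults tree, recursively."""
    merged = dict(defaults)
    for key, value in user_cfg.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_defaults(value, merged[key])
        else:
            merged[key] = value
    return merged

with open("cookbooks/zero_shot_evaluation/examples/minimal_config.yaml") as f:
    config = merge_defaults(yaml.safe_load(f), DEFAULTS)
```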