Skip to content

Commit 2c7dc5c

Browse files
committed
add gepa examples
1 parent 30cd4d1 commit 2c7dc5c

13 files changed

+1151
-178
lines changed

examples/llm_prompt_optimization/README.md

Lines changed: 270 additions & 166 deletions
Large diffs are not rendered by default.
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
---
# Configuration for baseline benchmarking with Qwen3-8B
# Using OpenRouter API for model access

# General settings
max_iterations: 1  # Just one iteration for baseline
checkpoint_interval: 1
log_level: "INFO"
diff_based_evolution: false
max_code_length: 10000
language: "text"

# LLM Configuration for Qwen3-8B via OpenRouter
llm:
  api_base: "https://openrouter.ai/api/v1"
  models:
    - name: "qwen/qwen3-8b"  # Using exact Qwen3-8B model for GEPA comparison
      weight: 1.0

  temperature: 0.1  # Low temperature for consistent baseline results
  max_tokens: 4096  # Reasonable context for Qwen
  timeout: 300  # Longer timeout for full dataset evaluation
  retries: 3

# Prompt Configuration - Not used for baseline but required
prompt:
  template_dir: "templates"
  num_top_programs: 3
  num_diverse_programs: 2
  include_artifacts: true

  system_message: |
    You are a helpful assistant.

# Database Configuration - Minimal for baseline
database:
  population_size: 1
  archive_size: 1
  num_islands: 1

  feature_dimensions: ["prompt_length", "reasoning_strategy"]
  feature_bins: 10

  elite_selection_ratio: 1.0
  exploration_ratio: 0.0
  exploitation_ratio: 0.0

  migration_interval: 10
  migration_rate: 0.0

# Evaluator Configuration for baseline
evaluator:
  timeout: 3600  # 1 hour timeout for full dataset
  max_retries: 3
  parallel_evaluations: 1  # Sequential for baseline
  cascade_evaluation: false  # No cascading for baseline

  # Disable LLM feedback for baseline
  # NOTE(review): nesting under `evaluator` assumed from OpenEvolve's
  # EvaluatorConfig schema; indentation was lost in the scrape — confirm.
  use_llm_feedback: false
  llm_feedback_weight: 0.0
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
---
# Configuration for evolving prompts with Qwen3-8B
# Optimized for GEPA benchmark comparison

# General settings
max_iterations: 100  # Can be overridden by command line
checkpoint_interval: 10
log_level: "INFO"
diff_based_evolution: false  # Full rewrites for prompt evolution
max_code_length: 10000
language: "text"

# LLM Configuration for Qwen3-8B via OpenRouter
llm:
  api_base: "https://openrouter.ai/api/v1"
  models:
    - name: "qwen/qwen3-8b"
      weight: 1.0

  temperature: 0.8  # Higher temperature for creative evolution
  max_tokens: 4096
  timeout: 60
  retries: 3

# Prompt Configuration for evolution
prompt:
  template_dir: "templates"
  num_top_programs: 5  # Show top 5 prompts for inspiration
  num_diverse_programs: 3  # Include 3 diverse prompts
  include_artifacts: true

  system_message: |
    You are an expert at creating effective prompts for language models.
    Your goal is to evolve prompts that maximize accuracy on the given task.

    When creating new prompts:
    1. Build on successful patterns from the examples
    2. Be creative but maintain clarity
    3. Consider different reasoning strategies (direct, step-by-step, few-shot)
    4. Optimize for the specific task requirements

# Database Configuration for MAP-Elites
database:
  population_size: 50  # Moderate population for balance
  archive_size: 500
  num_islands: 4  # Multiple islands for diversity

  feature_dimensions: ["prompt_length", "reasoning_strategy"]
  feature_bins: 10

  elite_selection_ratio: 0.4  # 40% elites
  exploration_ratio: 0.3  # 30% exploration
  exploitation_ratio: 0.3  # 30% exploitation

  migration_interval: 20
  migration_rate: 0.1

# Evaluator Configuration
evaluator:
  timeout: 1800  # 30 minutes timeout for complex evaluations
  max_retries: 3
  parallel_evaluations: 4  # Parallel evaluation for speed
  cascade_evaluation: true  # Use cascading to save API calls
  cascade_thresholds: [0.9]  # Only 2 stages, must achieve 90% in stage 1 to proceed

  # Enable LLM feedback for better guidance
  # NOTE(review): nesting under `evaluator` assumed from OpenEvolve's
  # EvaluatorConfig schema; indentation was lost in the scrape — confirm.
  use_llm_feedback: true
  llm_feedback_weight: 0.2  # 20% weight on qualitative feedback

0 commit comments

Comments
 (0)