@@ -33,9 +33,9 @@ Zero-shot evaluation automates the entire evaluation pipeline:
 | ------| -----------| -------------|
 | 1 | `QueryGenerator` | Generate diverse test queries from task description |
 | 2 | `ResponseCollector` | Collect responses from all target endpoints |
-| 3 | `RubricGenerator` | Generate evaluation criteria for the task |
+| 3 | `TaskBasedRubricGenerator` | Generate evaluation criteria for the task |
 | 4 | `GradingRunner` | Run pairwise comparisons with judge model |
-| 5 | `ZeroShotEvaluator` | Analyze results and produce rankings |
+| 5 | `ZeroShotPipeline` | Analyze results and produce rankings |
 
 
 ## Quick Start
@@ -44,11 +44,11 @@ Zero-shot evaluation automates the entire evaluation pipeline:
 
 ```python
 import asyncio
-from cookbooks.zero_shot_evaluation import ZeroShotEvaluator
+from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline
 
 async def main():
-    evaluator = ZeroShotEvaluator.from_config("config.yaml")
-    result = await evaluator.evaluate()
+    pipeline = ZeroShotPipeline.from_config("config.yaml")
+    result = await pipeline.evaluate()
 
     print(f"Best Model: {result.best_pipeline}")
     for rank, (model, win_rate) in enumerate(result.rankings, 1):
@@ -137,7 +137,8 @@ For fine-grained control, use individual components directly:
 ### Step 1: Generate Test Queries
 
 ```python
-from cookbooks.zero_shot_evaluation import QueryGenerator, TaskConfig, QueryGenerationConfig, OpenAIEndpoint
+from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator
+from cookbooks.zero_shot_evaluation.schema import TaskConfig, QueryGenerationConfig, OpenAIEndpoint
 
 # Configure task and endpoint
 task = TaskConfig(
@@ -171,7 +172,8 @@ queries = await generator.generate()
 ### Step 2: Collect Responses
 
 ```python
-from cookbooks.zero_shot_evaluation import ResponseCollector, EvaluationConfig
+from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector
+from cookbooks.zero_shot_evaluation.schema import EvaluationConfig
 
 collector = ResponseCollector(
     target_endpoints={
@@ -187,9 +189,13 @@ responses = await collector.collect(queries)
 ### Step 3: Generate Evaluation Rubrics
 
 ```python
-from cookbooks.zero_shot_evaluation import RubricGenerator
+from openjudge.generator.simple_rubric import TaskBasedRubricGenerator, RubricGenerationConfig
 
-rubric_gen = RubricGenerator(judge_endpoint, task)
+rubric_config = RubricGenerationConfig(
+    task_description=task.description,
+    scenario=task.scenario,
+)
+rubric_gen = TaskBasedRubricGenerator(config=rubric_config, model=judge_model)
 rubrics = await rubric_gen.generate(
     sample_queries=[q.query for q in queries[:5]]
 )
@@ -203,16 +209,16 @@ rubrics = await rubric_gen.generate(
 ### Step 4: Run Full Evaluation
 
 ```python
-from cookbooks.zero_shot_evaluation import ZeroShotEvaluator
+from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline
 
-evaluator = ZeroShotEvaluator(
+pipeline = ZeroShotPipeline(
     task_description="Code review assistant",
     target_endpoints=target_endpoints,
     judge_endpoint=judge_endpoint,
     num_queries=20
 )
 
-result = await evaluator.evaluate()
+result = await pipeline.evaluate()
 ```
 
 
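The component-level snippets in Steps 1–3 call `await` at the top level, so outside a notebook they need an async entry point. Below is a minimal sketch of how those calls compose, assuming `generator`, `collector`, and `rubric_gen` have already been constructed as in the snippets above; the wrapper name `run_components` is hypothetical, not part of the cookbook API.

```python
import asyncio

async def run_components():
    # Chain the component calls from Steps 1-3: generate queries,
    # collect responses from the target endpoints, then derive rubrics
    # from a handful of sample queries.
    # Assumes `generator`, `collector`, and `rubric_gen` are already
    # configured as shown in the step-by-step examples.
    queries = await generator.generate()
    responses = await collector.collect(queries)
    rubrics = await rubric_gen.generate(
        sample_queries=[q.query for q in queries[:5]]
    )
    return queries, responses, rubrics

queries, responses, rubrics = asyncio.run(run_components())
```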