
Commit de4fc84

refactor: move rubric_generator to simple_rubric module and update docs
1 parent b1f5d98 commit de4fc84

7 files changed, +747 -47 lines changed


cookbooks/zero_shot_evaluation/zero_shot_pipeline.py

Lines changed: 5 additions & 5 deletions
@@ -34,7 +34,7 @@
 
 # OpenJudge core components
 from openjudge.analyzer import PairwiseAnalyzer, PairwiseAnalysisResult
-from openjudge.generator import RubricGenerationConfig, RubricGenerator
+from openjudge.generator.simple_rubric import RubricGenerationConfig, TaskBasedRubricGenerator
 from openjudge.graders.llm_grader import GraderMode, LLMGrader
 from openjudge.graders.schema import GraderResult
 from openjudge.models.openai_chat_model import OpenAIChatModel
@@ -288,7 +288,7 @@ class ZeroShotPipeline:
     5. Analyze results and rank models
 
     The pipeline integrates with OpenJudge's core components:
-    - Uses RubricGenerator from openjudge.generator for rubric generation
+    - Uses TaskBasedRubricGenerator from openjudge.generator.simple_rubric for rubric generation
     - Uses PairwiseAnalyzer from openjudge.analyzer for result analysis
     - Uses LLMGrader and GradingRunner for pairwise evaluation
 
@@ -408,18 +408,18 @@ async def generate_rubrics(
         self,
         sample_queries: Optional[List[str]] = None,
     ) -> List[str]:
-        """Step 3: Generate evaluation rubrics using OpenJudge's RubricGenerator."""
+        """Step 3: Generate evaluation rubrics using OpenJudge's TaskBasedRubricGenerator."""
         logger.info("Step 3: Generating evaluation rubrics...")
 
         if not sample_queries and self._queries:
             sample_queries = [q.query for q in self._queries[:5]]
 
-        # Use OpenJudge's RubricGenerator
+        # Use OpenJudge's TaskBasedRubricGenerator
         rubric_config = RubricGenerationConfig(
            task_description=self.config.task.description,
            scenario=self.config.task.scenario,
        )
-        generator = RubricGenerator(
+        generator = TaskBasedRubricGenerator(
            config=rubric_config,
            model=self._create_judge_model(),
        )
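
In short, the pipeline's rubric step now goes through the simple_rubric submodule. A minimal sketch of the migrated call path follows; the import paths and the config/model keyword arguments come from this diff, while the OpenAIChatModel constructor arguments and the example strings are illustrative assumptions.

```python
import asyncio
from typing import List

from openjudge.generator.simple_rubric import RubricGenerationConfig, TaskBasedRubricGenerator
from openjudge.models.openai_chat_model import OpenAIChatModel


async def generate_rubrics_demo() -> List[str]:
    # Mirrors ZeroShotPipeline.generate_rubrics after this commit.
    rubric_config = RubricGenerationConfig(
        task_description="Code review assistant",   # stands in for config.task.description
        scenario="Reviewing Python pull requests",  # stands in for config.task.scenario
    )
    generator = TaskBasedRubricGenerator(
        config=rubric_config,
        model=OpenAIChatModel(model="gpt-4o"),  # assumed constructor; the pipeline passes _create_judge_model()
    )
    return await generator.generate(sample_queries=["Review this function for bugs."])


if __name__ == "__main__":
    print(asyncio.run(generate_rubrics_demo()))
```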

docs/applications/zero_shot_evaluation.md

Lines changed: 18 additions & 12 deletions
@@ -33,9 +33,9 @@ Zero-shot evaluation automates the entire evaluation pipeline:
 |------|-----------|-------------|
 | 1 | `QueryGenerator` | Generate diverse test queries from task description |
 | 2 | `ResponseCollector` | Collect responses from all target endpoints |
-| 3 | `RubricGenerator` | Generate evaluation criteria for the task |
+| 3 | `TaskBasedRubricGenerator` | Generate evaluation criteria for the task |
 | 4 | `GradingRunner` | Run pairwise comparisons with judge model |
-| 5 | `ZeroShotEvaluator` | Analyze results and produce rankings |
+| 5 | `ZeroShotPipeline` | Analyze results and produce rankings |
 
 
 ## Quick Start
@@ -44,11 +44,11 @@ Zero-shot evaluation automates the entire evaluation pipeline:
 
 ```python
 import asyncio
-from cookbooks.zero_shot_evaluation import ZeroShotEvaluator
+from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline
 
 async def main():
-    evaluator = ZeroShotEvaluator.from_config("config.yaml")
-    result = await evaluator.evaluate()
+    pipeline = ZeroShotPipeline.from_config("config.yaml")
+    result = await pipeline.evaluate()
 
     print(f"Best Model: {result.best_pipeline}")
     for rank, (model, win_rate) in enumerate(result.rankings, 1):
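
Assembled into a script, the updated Quick Start would look roughly like this; it assumes a valid config.yaml in the working directory, and the per-model print format inside the loop is an assumption (the diff ends at the for line).

```python
import asyncio

from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline


async def main() -> None:
    # Task, target endpoints, and judge settings are read from YAML.
    pipeline = ZeroShotPipeline.from_config("config.yaml")
    result = await pipeline.evaluate()

    print(f"Best Model: {result.best_pipeline}")
    for rank, (model, win_rate) in enumerate(result.rankings, 1):
        print(f"{rank}. {model}: {win_rate:.1%}")  # formatting assumed


if __name__ == "__main__":
    asyncio.run(main())
```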
@@ -137,7 +137,8 @@ For fine-grained control, use individual components directly:
 ### Step 1: Generate Test Queries
 
 ```python
-from cookbooks.zero_shot_evaluation import QueryGenerator, TaskConfig, QueryGenerationConfig, OpenAIEndpoint
+from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator
+from cookbooks.zero_shot_evaluation.schema import TaskConfig, QueryGenerationConfig, OpenAIEndpoint
 
 # Configure task and endpoint
 task = TaskConfig(
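
The Step 1 snippet is cut off at task = TaskConfig(. A hedged completion is sketched below: the description/scenario fields and queries = await generator.generate() appear elsewhere in this commit, while the QueryGenerationConfig, OpenAIEndpoint, and QueryGenerator arguments are illustrative assumptions. Like the other snippets on this page, it is meant to run inside an async context.

```python
from cookbooks.zero_shot_evaluation.query_generator import QueryGenerator
from cookbooks.zero_shot_evaluation.schema import TaskConfig, QueryGenerationConfig, OpenAIEndpoint

# Configure task and endpoint
task = TaskConfig(
    description="Code review assistant",        # read later as task.description
    scenario="Reviewing Python pull requests",  # read later as task.scenario
)
query_config = QueryGenerationConfig(num_queries=20)  # field name assumed
judge_endpoint = OpenAIEndpoint(                      # field names assumed
    base_url="https://api.openai.com/v1",
    api_key="sk-...",
    model="gpt-4o",
)

generator = QueryGenerator(task=task, config=query_config, endpoint=judge_endpoint)  # arguments assumed
queries = await generator.generate()  # matches the context line of the Step 2 hunk
```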
@@ -171,7 +172,8 @@ queries = await generator.generate()
 ### Step 2: Collect Responses
 
 ```python
-from cookbooks.zero_shot_evaluation import ResponseCollector, EvaluationConfig
+from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector
+from cookbooks.zero_shot_evaluation.schema import EvaluationConfig
 
 collector = ResponseCollector(
     target_endpoints={
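
Step 2 is likewise truncated at target_endpoints={. Only ResponseCollector(target_endpoints={...}) and responses = await collector.collect(queries) are visible in this commit; the endpoint mapping and field names below are assumptions.

```python
from cookbooks.zero_shot_evaluation.response_collector import ResponseCollector
from cookbooks.zero_shot_evaluation.schema import OpenAIEndpoint

collector = ResponseCollector(
    target_endpoints={
        # Keys and endpoint fields are illustrative assumptions.
        "model_a": OpenAIEndpoint(base_url="https://api.openai.com/v1", api_key="sk-...", model="gpt-4o"),
        "model_b": OpenAIEndpoint(base_url="http://localhost:8000/v1", api_key="EMPTY", model="local-model"),
    },
)
responses = await collector.collect(queries)  # matches the context line of the Step 3 hunk
```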
@@ -187,9 +189,13 @@ responses = await collector.collect(queries)
 ### Step 3: Generate Evaluation Rubrics
 
 ```python
-from cookbooks.zero_shot_evaluation import RubricGenerator
+from openjudge.generator.simple_rubric import TaskBasedRubricGenerator, RubricGenerationConfig
 
-rubric_gen = RubricGenerator(judge_endpoint, task)
+rubric_config = RubricGenerationConfig(
+    task_description=task.description,
+    scenario=task.scenario,
+)
+rubric_gen = TaskBasedRubricGenerator(config=rubric_config, model=judge_model)
 rubrics = await rubric_gen.generate(
     sample_queries=[q.query for q in queries[:5]]
 )
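
Note the interface change in Step 3: the positional RubricGenerator(judge_endpoint, task) becomes keyword-based, and the model argument is now a chat model rather than an endpoint config (the pipeline passes _create_judge_model(), an OpenAIChatModel). Per the pipeline's generate_rubrics signature, generate() resolves to a list of rubric strings, so the result can be inspected directly; the loop below is illustrative, not from the docs.

```python
# Continues the Step 3 snippet above; rubrics is a List[str].
for i, rubric in enumerate(rubrics, 1):
    print(f"Rubric {i}: {rubric}")
```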
@@ -203,16 +209,16 @@ rubrics = await rubric_gen.generate(
 ### Step 4: Run Full Evaluation
 
 ```python
-from cookbooks.zero_shot_evaluation import ZeroShotEvaluator
+from cookbooks.zero_shot_evaluation.zero_shot_pipeline import ZeroShotPipeline
 
-evaluator = ZeroShotEvaluator(
+pipeline = ZeroShotPipeline(
     task_description="Code review assistant",
     target_endpoints=target_endpoints,
     judge_endpoint=judge_endpoint,
     num_queries=20
 )
 
-result = await evaluator.evaluate()
+result = await pipeline.evaluate()
 ```
 
 

openjudge/generator/__init__.py

Lines changed: 27 additions & 6 deletions
@@ -4,22 +4,43 @@
 This module provides generators for automatically creating graders and
 evaluation criteria based on data or task descriptions.
 
+Submodules:
+    simple_rubric: Task-description-based rubric generation (zero-shot)
+    iterative_rubric: Preference-data-based rubric generation (iterative refinement)
+
 Classes:
     BaseGraderGenerator: Abstract base class for grader generators
     GraderGeneratorConfig: Configuration for grader generation
-    RubricGenerator: Generator for evaluation rubrics
+    LLMGraderGenerator: Base class for LLM-based grader generators
+    LLMGraderGeneratorConfig: Configuration for LLM grader generation
+
+    # Simple rubric generation (from task description)
+    SimpleRubricsGenerator: Main generator for simple rubric-based graders
+    SimpleRubricsGeneratorConfig: Configuration for simple rubric generation
+    TaskBasedRubricGenerator: Core rubric generation logic
     RubricGenerationConfig: Configuration for rubric generation
 """
 
 from openjudge.generator.base_generator import BaseGraderGenerator, GraderGeneratorConfig
-from openjudge.generator.rubric_generator import RubricGenerationConfig, RubricGenerator
+from openjudge.generator.llm_grader_generator import LLMGraderGenerator, LLMGraderGeneratorConfig
+
+# Simple rubric generation
+from openjudge.generator.simple_rubric import (
+    RubricGenerationConfig,
+    SimpleRubricsGenerator,
+    SimpleRubricsGeneratorConfig,
+    TaskBasedRubricGenerator,
+)
 
 __all__ = [
-    # Grader Generator
+    # Base classes
     "BaseGraderGenerator",
     "GraderGeneratorConfig",
-    # Rubric Generator
-    "RubricGenerator",
+    "LLMGraderGenerator",
+    "LLMGraderGeneratorConfig",
+    # Simple rubric generation
+    "SimpleRubricsGenerator",
+    "SimpleRubricsGeneratorConfig",
+    "TaskBasedRubricGenerator",
     "RubricGenerationConfig",
 ]
-
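
Because the package __init__ re-exports the simple_rubric names, both import paths resolve to the same objects, so existing call sites that import from openjudge.generator keep working. A quick sanity check:

```python
from openjudge.generator import TaskBasedRubricGenerator as FromRoot
from openjudge.generator.simple_rubric import TaskBasedRubricGenerator as FromSubmodule

# The root package re-exports the submodule's class, so these are the same object.
assert FromRoot is FromSubmodule
```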
openjudge/generator/simple_rubric/__init__.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+"""Simple rubric generator module for automatic evaluation criteria generation.
+
+This module provides a simple, task-description-based approach to generating
+evaluation rubrics. It generates rubrics from task descriptions and sample
+queries, without requiring labeled training data.
+
+This is in contrast to the iterative_rubric module which learns rubrics from
+preference data through an iterative refinement process.
+
+Classes:
+    SimpleRubricsGenerator: Main generator class that creates LLMGrader instances
+    SimpleRubricsGeneratorConfig: Configuration for the generator
+    TaskBasedRubricGenerator: Core rubric generation logic
+    RubricGenerationConfig: Configuration for rubric generation
+"""
+
+from openjudge.generator.simple_rubric.generator import (
+    SimpleRubricsGenerator,
+    SimpleRubricsGeneratorConfig,
+)
+from openjudge.generator.simple_rubric.rubric_generator import (
+    RubricGenerationConfig,
+    TaskBasedRubricGenerator,
+)
+
+__all__ = [
+    # Main generator (creates LLMGrader)
+    "SimpleRubricsGenerator",
+    "SimpleRubricsGeneratorConfig",
+    # Core rubric generation logic
+    "TaskBasedRubricGenerator",
+    "RubricGenerationConfig",
+]
+
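
A small check of the new module's public surface, taken from the __all__ list above (the printed order follows the list as defined):

```python
import openjudge.generator.simple_rubric as simple_rubric

print(simple_rubric.__all__)
# ['SimpleRubricsGenerator', 'SimpleRubricsGeneratorConfig',
#  'TaskBasedRubricGenerator', 'RubricGenerationConfig']
```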
