
Commit 52eb155

refactor: simplify TaskBasedRubricGenerator API and update docs

- Remove RubricGenerationConfig class, pass parameters directly to constructor
- Add DEFAULT_RUBRICS constant for fallback rubrics
- Update zero_shot_evaluation docs with pre-defined queries usage guide
- Simplify related tests

1 parent de4fc84 commit 52eb155

File tree

7 files changed: +102 -209 lines changed

cookbooks/zero_shot_evaluation/zero_shot_pipeline.py

Lines changed: 3 additions & 6 deletions

```diff
@@ -34,7 +34,7 @@
 
 # OpenJudge core components
 from openjudge.analyzer import PairwiseAnalyzer, PairwiseAnalysisResult
-from openjudge.generator.simple_rubric import RubricGenerationConfig, TaskBasedRubricGenerator
+from openjudge.generator.simple_rubric import TaskBasedRubricGenerator
 from openjudge.graders.llm_grader import GraderMode, LLMGrader
 from openjudge.graders.schema import GraderResult
 from openjudge.models.openai_chat_model import OpenAIChatModel
@@ -415,13 +415,10 @@ async def generate_rubrics(
         sample_queries = [q.query for q in self._queries[:5]]
 
         # Use OpenJudge's TaskBasedRubricGenerator
-        rubric_config = RubricGenerationConfig(
-            task_description=self.config.task.description,
-            scenario=self.config.task.scenario,
-        )
         generator = TaskBasedRubricGenerator(
-            config=rubric_config,
             model=self._create_judge_model(),
+            task_description=self.config.task.description,
+            scenario=self.config.task.scenario,
         )
         self._rubrics = await generator.generate(sample_queries)
         return self._rubrics
```
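For reference, a minimal standalone sketch of the new call pattern shown in the hunk above. The `OpenAIChatModel` constructor arguments, the task description, and the scenario strings here are placeholder assumptions, not values from the cookbook.

```python
import asyncio

from openjudge.generator.simple_rubric import TaskBasedRubricGenerator
from openjudge.models.openai_chat_model import OpenAIChatModel


async def main() -> None:
    # Placeholder judge model settings; match your own deployment.
    judge_model = OpenAIChatModel(model="gpt-4o-mini")

    # Parameters are now passed directly to the constructor instead of
    # through a separate RubricGenerationConfig object.
    generator = TaskBasedRubricGenerator(
        model=judge_model,
        task_description="Translate English sentences into French",  # placeholder
        scenario="Customer-facing translation assistant",  # placeholder
    )

    # Sample queries give the generator context; it returns a list of rubric strings.
    rubrics = await generator.generate(["Translate: The weather is nice today."])
    print(rubrics)


if __name__ == "__main__":
    asyncio.run(main())
```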

docs/applications/zero_shot_evaluation.md

Lines changed: 33 additions & 3 deletions

````diff
@@ -73,6 +73,36 @@ python -m cookbooks.zero_shot_evaluation --config config.yaml --fresh --save
 python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save
 ```
 
+### Using Pre-defined Queries
+
+Skip query generation by providing your own queries file. This is useful when you want to evaluate models on a specific set of questions.
+
+**Create a queries file** (`queries.json`):
+
+```json
+[
+  {"query": "Translate: AI is transforming industries."},
+  {"query": "Translate: The weather is nice today."},
+  {"query": "Translate: How to learn programming effectively?"}
+]
+```
+
+The `category` and `difficulty` fields are optional:
+
+```json
+[
+  {"query": "Your question here", "category": "general", "difficulty": "easy"}
+]
+```
+
+**Run evaluation**:
+
+```bash
+python -m cookbooks.zero_shot_evaluation --config config.yaml --queries_file queries.json --save
+```
+
+The pipeline will skip query generation and directly use your queries for model comparison.
+
 
 ## Configuration
 
@@ -189,13 +219,13 @@ responses = await collector.collect(queries)
 ### Step 3: Generate Evaluation Rubrics
 
 ```python
-from openjudge.generator.simple_rubric import TaskBasedRubricGenerator, RubricGenerationConfig
+from openjudge.generator.simple_rubric import TaskBasedRubricGenerator
 
-rubric_config = RubricGenerationConfig(
+rubric_gen = TaskBasedRubricGenerator(
+    model=judge_model,
     task_description=task.description,
     scenario=task.scenario,
 )
-rubric_gen = TaskBasedRubricGenerator(config=rubric_config, model=judge_model)
 rubrics = await rubric_gen.generate(
    sample_queries=[q.query for q in queries[:5]]
 )
````
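Not part of the diff, but a minimal sketch of how a queries file in the documented format could be read before being handed to the pipeline. The `load_queries` helper and the default values for the optional fields are illustrative assumptions, not the cookbook's actual loader.

```python
import json
from pathlib import Path
from typing import Any, Dict, List


def load_queries(path: str) -> List[Dict[str, Any]]:
    """Read a queries.json file and normalize the optional fields."""
    entries = json.loads(Path(path).read_text(encoding="utf-8"))
    queries = []
    for entry in entries:
        if "query" not in entry:
            raise ValueError(f"Each entry needs a 'query' field: {entry!r}")
        queries.append(
            {
                "query": entry["query"],
                # 'category' and 'difficulty' are optional in the file format;
                # the fallback values here are assumptions.
                "category": entry.get("category", "general"),
                "difficulty": entry.get("difficulty", "medium"),
            }
        )
    return queries


if __name__ == "__main__":
    for q in load_queries("queries.json"):
        print(q["query"], "-", q["category"], "/", q["difficulty"])
```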

openjudge/generator/__init__.py

Lines changed: 5 additions & 3 deletions

```diff
@@ -18,15 +18,17 @@
     SimpleRubricsGenerator: Main generator for simple rubric-based graders
     SimpleRubricsGeneratorConfig: Configuration for simple rubric generation
     TaskBasedRubricGenerator: Core rubric generation logic
-    RubricGenerationConfig: Configuration for rubric generation
+
+Constants:
+    DEFAULT_RUBRICS: Default fallback rubrics if generation fails
 """
 
 from openjudge.generator.base_generator import BaseGraderGenerator, GraderGeneratorConfig
 from openjudge.generator.llm_grader_generator import LLMGraderGenerator, LLMGraderGeneratorConfig
 
 # Simple rubric generation
 from openjudge.generator.simple_rubric import (
-    RubricGenerationConfig,
+    DEFAULT_RUBRICS,
     SimpleRubricsGenerator,
     SimpleRubricsGeneratorConfig,
     TaskBasedRubricGenerator,
@@ -42,5 +44,5 @@
     "SimpleRubricsGenerator",
     "SimpleRubricsGeneratorConfig",
     "TaskBasedRubricGenerator",
-    "RubricGenerationConfig",
+    "DEFAULT_RUBRICS",
 ]
```
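A small sketch of how the newly exported constant might be used; the model settings, the task description, and the extra rubric text are illustrative assumptions, while the `default_rubrics` parameter itself is shown in the generator.py diff below.

```python
from openjudge.generator import DEFAULT_RUBRICS, TaskBasedRubricGenerator
from openjudge.models.openai_chat_model import OpenAIChatModel

# Inspect the packaged fallback rubrics (used when LLM generation fails).
for rubric in DEFAULT_RUBRICS:
    print(rubric)

# Pass a customized fallback list directly to the constructor.
generator = TaskBasedRubricGenerator(
    model=OpenAIChatModel(model="gpt-4o-mini"),  # placeholder model settings
    task_description="Summarize customer support tickets",  # placeholder task
    default_rubrics=DEFAULT_RUBRICS + ["Tone: Whether the response stays professional"],
)
```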

openjudge/generator/simple_rubric/__init__.py

Lines changed: 5 additions & 4 deletions

```diff
@@ -12,15 +12,17 @@
     SimpleRubricsGenerator: Main generator class that creates LLMGrader instances
     SimpleRubricsGeneratorConfig: Configuration for the generator
     TaskBasedRubricGenerator: Core rubric generation logic
-    RubricGenerationConfig: Configuration for rubric generation
+
+Constants:
+    DEFAULT_RUBRICS: Default fallback rubrics if generation fails
 """
 
 from openjudge.generator.simple_rubric.generator import (
     SimpleRubricsGenerator,
     SimpleRubricsGeneratorConfig,
 )
 from openjudge.generator.simple_rubric.rubric_generator import (
-    RubricGenerationConfig,
+    DEFAULT_RUBRICS,
     TaskBasedRubricGenerator,
 )
 
@@ -30,6 +32,5 @@
     "SimpleRubricsGeneratorConfig",
     # Core rubric generation logic
     "TaskBasedRubricGenerator",
-    "RubricGenerationConfig",
+    "DEFAULT_RUBRICS",
 ]
-
```
openjudge/generator/simple_rubric/generator.py

Lines changed: 12 additions & 74 deletions

```diff
@@ -36,12 +36,11 @@
     LLMGraderGeneratorConfig,
 )
 from openjudge.generator.simple_rubric.rubric_generator import (
-    RubricGenerationConfig,
+    DEFAULT_RUBRICS,
     TaskBasedRubricGenerator,
 )
 from openjudge.graders.llm_grader import LLMGrader
 from openjudge.graders.schema import GraderMode
-from openjudge.models.base_chat_model import BaseChatModel
 from openjudge.models.openai_chat_model import OpenAIChatModel
 from openjudge.models.schema.prompt_template import LanguageEnum
 
@@ -55,15 +54,12 @@ class SimpleRubricsGeneratorConfig(LLMGraderGeneratorConfig):
 
     Attributes:
         task_description: Description of the task for evaluation.
-            Should describe what kind of queries and responses are expected.
         scenario: Optional usage scenario for context.
-            Helps the generator understand the evaluation context.
         language: Language for prompts (ZH or EN). Defaults to EN.
         default_rubrics: Fallback rubrics if generation fails.
-            These are used when LLM generation fails.
-        max_retries: Maximum number of retry attempts for LLM calls. Defaults to 3.
-        min_score: Minimum score for pointwise evaluation. Defaults to 0.
-        max_score: Maximum score for pointwise evaluation. Defaults to 1.
+        max_retries: Maximum number of retry attempts for LLM calls.
+        min_score: Minimum score for pointwise evaluation.
+        max_score: Maximum score for pointwise evaluation.
 
     Inherited from LLMGraderGeneratorConfig:
@@ -72,24 +68,11 @@ class SimpleRubricsGeneratorConfig(LLMGraderGeneratorConfig):
         custom_evaluation_prompt: Custom template for evaluation.
     """
 
-    # Task description parameters
     task_description: str = ""
     scenario: Optional[str] = None
     language: LanguageEnum = LanguageEnum.EN
-
-    # Fallback configuration
-    default_rubrics: List[str] = field(
-        default_factory=lambda: [
-            "Accuracy: Whether the response is factually correct",
-            "Relevance: Whether the response addresses the query",
-            "Completeness: Whether the response is comprehensive",
-        ]
-    )
-
-    # Generation parameters
+    default_rubrics: List[str] = field(default_factory=lambda: DEFAULT_RUBRICS.copy())
     max_retries: int = 3
-
-    # Pointwise-specific parameters
     min_score: int = 0
     max_score: int = 1
 
@@ -107,14 +90,6 @@ class SimpleRubricsGenerator(LLMGraderGenerator):
     2. Uses an LLM to generate relevant evaluation criteria
     3. Creates an LLMGrader configured with these rubrics
 
-    This is suitable for scenarios where:
-    - You have a clear task description
-    - You don't have labeled preference data for rubric learning
-    - You want a quick way to set up evaluation
-
-    For more sophisticated rubric generation from preference data,
-    see the iterative_rubric module.
-
     Example:
         >>> config = SimpleRubricsGeneratorConfig(
         ...     grader_name="Medical QA Grader",
@@ -127,38 +102,25 @@ class SimpleRubricsGenerator(LLMGraderGenerator):
         ...     dataset=[],
         ...     sample_queries=["What are the symptoms of flu?"]
         ... )
-        >>> # Now use the grader to evaluate responses
-        >>> result = await grader.aevaluate(query="...", response="...")
     """
 
     def __init__(self, config: SimpleRubricsGeneratorConfig) -> None:
         """Initialize the simple rubrics generator.
 
         Args:
-            config: Configuration for rubric generation. Includes:
-                - grader_name: Name for the generated grader
-                - model: Language model for generation and evaluation
-                - task_description: Description of the evaluation task
-                - scenario: Optional usage scenario
-                - language: Language for prompts (ZH or EN)
-                - grader_mode: POINTWISE or LISTWISE
-                - default_rubrics: Fallback rubrics if generation fails
+            config: Configuration for rubric generation.
         """
         super().__init__(config)
         self.config: SimpleRubricsGeneratorConfig = config
 
-        # Initialize the rubric generator
-        rubric_config = RubricGenerationConfig(
+        self._rubric_generator = TaskBasedRubricGenerator(
+            model=config.model,
             task_description=config.task_description,
             scenario=config.scenario,
             language=config.language,
             default_rubrics=config.default_rubrics,
             max_retries=config.max_retries,
         )
-        self._rubric_generator = TaskBasedRubricGenerator(
-            config=rubric_config,
-            model=config.model,
-        )
 
     async def generate(
         self,
@@ -168,28 +130,20 @@
     ) -> LLMGrader:
         """Generate an LLMGrader with rubrics from task description.
 
-        This method generates evaluation rubrics based on the task description
-        and creates an LLMGrader instance configured with these rubrics.
-
         Args:
-            dataset: List of data dictionaries. For this generator, the dataset
-                is optional and only used to extract sample queries if
-                sample_queries is not provided.
+            dataset: List of data dictionaries (used to extract sample queries
+                if sample_queries is not provided).
             sample_queries: Optional list of sample queries for context.
-                If not provided, queries may be extracted from dataset.
             **kwargs: Additional arguments (currently unused).
 
         Returns:
            LLMGrader: Configured grader instance with generated rubrics.
         """
-        # Extract sample queries from dataset if not provided
         if sample_queries is None and dataset:
             sample_queries = [d.get("query", "") for d in dataset[:5] if d.get("query")]
 
-        # Generate rubrics
-        rubrics = await self._generate_rubrics(dataset, sample_queries=sample_queries, **kwargs)
+        rubrics = await self._generate_rubrics(sample_queries)
 
-        # Prepare grader kwargs
         grader_kwargs = {
             "name": self.config.grader_name,
             "model": self.config.model,
@@ -198,16 +152,13 @@
             "language": self.config.language,
         }
 
-        # Add min_score and max_score only for pointwise mode
         if self.config.grader_mode == GraderMode.POINTWISE:
             grader_kwargs["min_score"] = self.config.min_score
             grader_kwargs["max_score"] = self.config.max_score
 
-        # Add template: use custom if provided, otherwise use default based on mode
         if self.config.custom_evaluation_prompt is not None:
             grader_kwargs["template"] = self.config.custom_evaluation_prompt
         else:
-            # Use default evaluation template based on grader mode
             if self.config.grader_mode == GraderMode.POINTWISE:
                 grader_kwargs["template"] = POINTWISE_EVALUATION_TEMPLATE
             else:
@@ -217,35 +168,22 @@
 
     async def _generate_rubrics(
         self,
-        dataset: List[dict],
         sample_queries: Optional[List[str]] = None,
-        **kwargs,
     ) -> str:
         """Generate rubrics from task description.
 
-        This method uses the TaskBasedRubricGenerator to create rubrics
-        based on the task description and sample queries.
-
         Args:
-            dataset: List of data dictionaries (used for extracting sample queries
-                if sample_queries is not provided).
             sample_queries: Optional list of sample queries for context.
-            **kwargs: Additional arguments (currently unused).
 
         Returns:
             str: Formatted string containing evaluation rubrics.
         """
-        # Generate rubrics as list
-        rubrics_list = await self._rubric_generator.generate(
-            sample_queries=sample_queries,
-        )
+        rubrics_list = await self._rubric_generator.generate(sample_queries=sample_queries)
 
-        # Format rubrics into a string
         formatted_rubrics = "\n\n".join(
             [f"{i + 1}. {rubric}" for i, rubric in enumerate(rubrics_list)]
        )
 
         logger.info(f"Generated {len(rubrics_list)} rubrics from task description")
 
         return formatted_rubrics
-
```
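As a quick reference for the simplified flow, a hedged end-to-end sketch built around the docstring example shown in the diff above. The model construction and the task description string are placeholders, and the snippet assumes the remaining config fields inherited from LLMGraderGeneratorConfig have usable defaults.

```python
import asyncio

from openjudge.generator import SimpleRubricsGenerator, SimpleRubricsGeneratorConfig
from openjudge.models.openai_chat_model import OpenAIChatModel


async def main() -> None:
    # Placeholder judge model settings; use whatever your project configures.
    judge_model = OpenAIChatModel(model="gpt-4o-mini")

    config = SimpleRubricsGeneratorConfig(
        grader_name="Medical QA Grader",  # name taken from the docstring example
        model=judge_model,
        task_description="Answer medical questions accurately and safely",  # placeholder
        # default_rubrics falls back to DEFAULT_RUBRICS when not overridden.
    )
    generator = SimpleRubricsGenerator(config)

    # generate() builds rubrics from the task description and sample queries,
    # then returns an LLMGrader configured with them.
    grader = await generator.generate(
        dataset=[],
        sample_queries=["What are the symptoms of flu?"],
    )
    print(type(grader).__name__)


if __name__ == "__main__":
    asyncio.run(main())
```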
