Write custom evaluators for domain-specific logic, external integrations, or specialized metrics.
All evaluators inherit from [Evaluator][pydantic_evals.evaluators.Evaluator] and must implement `evaluate`:

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class ExactMatch(Evaluator):
    """Check if output exactly matches expected output."""

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return ctx.output == ctx.expected_output
```

Key Points:

- Use the `@dataclass` decorator (required)
- Inherit from `Evaluator`
- Implement `evaluate(self, ctx: EvaluatorContext) -> EvaluatorOutput`
- Return `bool`, `int`, `float`, `str`, [`EvaluationReason`][pydantic_evals.evaluators.EvaluationReason], or a `dict` of these

The context provides all information about the case execution:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class MyEvaluator(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        # Access case data
        ctx.name             # Case name
        ctx.inputs           # Task inputs
        ctx.metadata         # Case metadata
        ctx.expected_output  # Expected output (may be None)
        ctx.output           # Actual output

        # Performance data
        ctx.duration  # Task execution time (seconds)

        # Custom metrics/attributes (see metrics guide)
        ctx.metrics     # dict[str, int | float]
        ctx.attributes  # dict[str, Any]

        # OpenTelemetry spans (if logfire configured)
        ctx.span_tree  # SpanTree for behavioral checks

        return True
```

Add configurable parameters as dataclass fields:
```python
from dataclasses import dataclass

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class ContainsKeyword(Evaluator):
    keyword: str
    case_sensitive: bool = True

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        output = ctx.output
        keyword = self.keyword
        if not self.case_sensitive:
            output = output.lower()
            keyword = keyword.lower()
        return keyword in output


# Usage
dataset = Dataset(
    cases=[Case(name='test', inputs='This is important')],
    evaluators=[
        ContainsKeyword(keyword='important', case_sensitive=False),
    ],
)
```

Simple pass/fail checks:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class IsValidJSON(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        try:
            import json

            json.loads(ctx.output)
            return True
        except Exception:
            return False
```

Quality metrics:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class LengthScore(Evaluator):
    """Score based on output length (0.0 = too short, 1.0 = ideal)."""

    ideal_length: int = 100
    tolerance: int = 20

    def evaluate(self, ctx: EvaluatorContext) -> float:
        length = len(ctx.output)
        diff = abs(length - self.ideal_length)
        if diff <= self.tolerance:
            return 1.0
        else:
            # Decay score as we move away from ideal
            score = max(0.0, 1.0 - (diff - self.tolerance) / self.ideal_length)
            return score
```

Categorical classifications:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class SentimentClassifier(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> str:
        output_lower = ctx.output.lower()
        if any(word in output_lower for word in ['error', 'failed', 'wrong']):
            return 'negative'
        elif any(word in output_lower for word in ['success', 'correct', 'great']):
            return 'positive'
        else:
            return 'neutral'
```

Add explanations to any result:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext


@dataclass
class SmartCheck(Evaluator):
    threshold: float = 0.8

    def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
        score = self._calculate_score(ctx.output)
        if score >= self.threshold:
            return EvaluationReason(
                value=True,
                reason=f'Score {score:.2f} exceeds threshold {self.threshold}',
            )
        else:
            return EvaluationReason(
                value=False,
                reason=f'Score {score:.2f} below threshold {self.threshold}',
            )

    def _calculate_score(self, output: str) -> float:
        # Your scoring logic
        return 0.75
```

You can return multiple evaluations from one evaluator by returning a dictionary of key-value pairs:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import (
    EvaluationReason,
    Evaluator,
    EvaluatorContext,
    EvaluatorOutput,
)


@dataclass
class ComprehensiveCheck(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> EvaluatorOutput:
        format_valid = self._check_format(ctx.output)

        return {
            'valid_format': EvaluationReason(
                value=format_valid,
                reason='Valid JSON format' if format_valid else 'Invalid JSON format',
            ),
            'quality_score': self._score_quality(ctx.output),  # float
            'category': self._classify(ctx.output),  # str
        }

    def _check_format(self, output: str) -> bool:
        return output.startswith('{') and output.endswith('}')

    def _score_quality(self, output: str) -> float:
        return len(output) / 100.0

    def _classify(self, output: str) -> str:
        return 'short' if len(output) < 50 else 'long'
```

Each key in the returned dictionary becomes a separate result in the report. Values can be:

- Primitives (`bool`, `int`, `float`, `str`)
- [`EvaluationReason`][pydantic_evals.evaluators.EvaluationReason] (a value with an explanation)
- Nested dicts of these types

The [EvaluatorOutput][pydantic_evals.evaluators.evaluator.EvaluatorOutput] type represents all legal values that can be returned by an evaluator, and can be used as the return type annotation for your custom `evaluate` method.

Evaluators can dynamically choose whether to produce results for a given case by returning an empty dict when not applicable:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import (
    EvaluationReason,
    Evaluator,
    EvaluatorContext,
    EvaluatorOutput,
)


@dataclass
class SQLValidator(Evaluator):
    """Only evaluates SQL queries, skips other outputs."""

    def evaluate(self, ctx: EvaluatorContext) -> EvaluatorOutput:
        # Check if this case is relevant for SQL validation
        if not isinstance(ctx.output, str) or not ctx.output.strip().upper().startswith(
            ('SELECT', 'INSERT', 'UPDATE', 'DELETE')
        ):
            # Return empty dict - this evaluator doesn't apply to this case
            return {}

        # This is a SQL query, perform validation
        try:
            # In a real implementation, use sqlparse or similar
            is_valid = self._validate_sql(ctx.output)
            return {
                'sql_valid': is_valid,
                'sql_complexity': self._measure_complexity(ctx.output),
            }
        except Exception as e:
            return {'sql_valid': EvaluationReason(False, reason=f'Exception: {e}')}

    def _validate_sql(self, query: str) -> bool:
        # Simplified validation
        return 'FROM' in query.upper() or 'INTO' in query.upper()

    def _measure_complexity(self, query: str) -> str:
        joins = query.upper().count('JOIN')
        if joins == 0:
            return 'simple'
        elif joins <= 2:
            return 'moderate'
        else:
            return 'complex'
```

This pattern is useful when:

- An evaluator only applies to certain types of outputs (e.g., code validation only for code outputs)
- Validation depends on metadata tags (e.g., only evaluate cases marked with `language='python'`; see the sketch below)
- You want to run expensive checks conditionally based on other evaluator results

Key Points:

- Returning `{}` means "this evaluator doesn't apply here" - the case won't show results from this evaluator
- Returning `{'key': value}` means "this evaluator applies and here are the results"
- This is more practical than case-level evaluators when the evaluator applies to a large fraction of cases, or when the condition is based on the output itself
- The evaluator still runs for every case, but can short-circuit when not relevant

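Here is a minimal sketch of the metadata-gated variant mentioned above. It assumes case metadata is a dict, and the `language` tag and result key names are illustrative, not part of the library:

```python
import ast
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext, EvaluatorOutput


@dataclass
class PythonSyntaxCheck(Evaluator):
    """Hypothetical evaluator that only applies to cases tagged language='python'."""

    def evaluate(self, ctx: EvaluatorContext) -> EvaluatorOutput:
        # Skip cases whose metadata doesn't mark them as Python code
        if (
            not isinstance(ctx.output, str)
            or not isinstance(ctx.metadata, dict)
            or ctx.metadata.get('language') != 'python'
        ):
            return {}
        try:
            ast.parse(ctx.output)
            return {'python_syntax_valid': True}
        except SyntaxError as e:
            return {
                'python_syntax_valid': False,
                'syntax_error_line': e.lineno or 0,
            }
```
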
Use `async def` for I/O-bound operations:

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class APIValidator(Evaluator):
    api_url: str

    async def evaluate(self, ctx: EvaluatorContext) -> bool:
        import httpx

        async with httpx.AsyncClient() as client:
            response = await client.post(
                self.api_url,
                json={'output': ctx.output},
            )
            return response.json()['valid']
```

Pydantic Evals handles both sync and async evaluators automatically.

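For example, both kinds can live in the same dataset. A small sketch, assuming the `ContainsKeyword` and `APIValidator` evaluators defined above are in scope (the URL is a placeholder):

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(
    cases=[Case(name='greeting', inputs='hello')],
    evaluators=[
        ContainsKeyword(keyword='hello'),  # sync evaluator defined earlier
        APIValidator(api_url='https://example.com/validate'),  # async evaluator above
    ],
)


def task(inputs: str) -> str:
    return f'You said: {inputs}'


report = dataset.evaluate_sync(task)  # runs sync and async evaluators alike
```
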
Access case metadata for context-aware evaluation:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class DifficultyAwareScore(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> float:
        # Base score
        base_score = self._score_output(ctx.output)

        # Adjust based on difficulty from metadata
        if ctx.metadata and 'difficulty' in ctx.metadata:
            difficulty = ctx.metadata['difficulty']
            if difficulty == 'easy':
                # Easy questions: no boost applied
                return base_score
            elif difficulty == 'hard':
                # Be more lenient on hard questions
                return min(1.0, base_score * 1.2)

        return base_score

    def _score_output(self, output: str) -> float:
        # Your scoring logic
        return 0.8
```

Access custom metrics set during task execution:
```python
from dataclasses import dataclass

from pydantic_evals import increment_eval_metric, set_eval_attribute
from pydantic_evals.evaluators import Evaluator, EvaluatorContext


# In your task
def my_task(inputs: str) -> str:
    result = f'processed: {inputs}'

    # Record metrics
    increment_eval_metric('api_calls', 3)
    set_eval_attribute('used_cache', True)

    return result


# In your evaluator
@dataclass
class EfficiencyCheck(Evaluator):
    max_api_calls: int = 5

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        api_calls = ctx.metrics.get('api_calls', 0)
        return api_calls <= self.max_api_calls
```

See the Metrics & Attributes Guide for more.

Make evaluators type-safe with generics:
```python
from dataclasses import dataclass
from typing import TypeVar

from pydantic_evals.evaluators import Evaluator, EvaluatorContext

InputsT = TypeVar('InputsT')
OutputT = TypeVar('OutputT')


@dataclass
class TypedEvaluator(Evaluator[InputsT, OutputT, dict]):
    def evaluate(self, ctx: EvaluatorContext[InputsT, OutputT, dict]) -> bool:
        # ctx.inputs and ctx.output are now properly typed
        return True
```

Control how evaluations appear in reports:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class CustomNameEvaluator(Evaluator):
    check_type: str

    def get_default_evaluation_name(self) -> str:
        # Use check_type as the name instead of the class name
        return f'{self.check_type}_check'

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return True


# In reports, appears as "format_check" instead of "CustomNameEvaluator"
evaluator = CustomNameEvaluator(check_type='format')
```

Or use the `evaluation_name` field (if using the built-in pattern):
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class MyEvaluator(Evaluator):
    evaluation_name: str | None = None

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return True


# Usage
MyEvaluator(evaluation_name='my_custom_name')
```

A fuller example: validating generated SQL with `sqlparse`:

```python
from dataclasses import dataclass
from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext

@dataclass
class ValidSQL(Evaluator):
    dialect: str = 'postgresql'

    def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
        try:
            import sqlparse

            parsed = sqlparse.parse(ctx.output)
            if not parsed:
                return EvaluationReason(
                    value=False,
                    reason='Could not parse SQL',
                )

            # Check for dangerous operations
            sql_upper = ctx.output.upper()
            if 'DROP' in sql_upper or 'DELETE' in sql_upper:
                return EvaluationReason(
                    value=False,
                    reason='Contains dangerous operations (DROP/DELETE)',
                )

            return EvaluationReason(
                value=True,
                reason='Valid SQL syntax',
            )
        except Exception as e:
            return EvaluationReason(
                value=False,
                reason=f'SQL parsing error: {e}',
            )
```

Check that generated Python code actually executes, with a timeout:

```python
from dataclasses import dataclass
from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext

@dataclass
class ExecutablePython(Evaluator):
    timeout_seconds: float = 5.0

    async def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
        import asyncio
        import os
        import tempfile

        # Write code to temp file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            f.write(ctx.output)
            temp_path = f.name

        try:
            # Execute with timeout
            process = await asyncio.create_subprocess_exec(
                'python', temp_path,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            try:
                stdout, stderr = await asyncio.wait_for(
                    process.communicate(),
                    timeout=self.timeout_seconds,
                )
            except asyncio.TimeoutError:
                process.kill()
                return EvaluationReason(
                    value=False,
                    reason=f'Execution timeout after {self.timeout_seconds}s',
                )

            if process.returncode == 0:
                return EvaluationReason(
                    value=True,
                    reason='Code executed successfully',
                )
            else:
                return EvaluationReason(
                    value=False,
                    reason=f'Execution failed: {stderr.decode()}',
                )
        finally:
            os.unlink(temp_path)
```

Call an external validation API and report multiple results:

```python
from dataclasses import dataclass
from pydantic_evals.evaluators import Evaluator, EvaluatorContext

@dataclass
class APIResponseValid(Evaluator):
    api_endpoint: str
    api_key: str

    async def evaluate(self, ctx: EvaluatorContext) -> dict[str, bool | float]:
        import httpx

        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    self.api_endpoint,
                    headers={'Authorization': f'Bearer {self.api_key}'},
                    json={'data': ctx.output},
                    timeout=10.0,
                )
                result = response.json()
                return {
                    'api_reachable': True,
                    'validation_passed': result.get('valid', False),
                    'confidence_score': result.get('confidence', 0.0),
                }
        except Exception:
            return {
                'api_reachable': False,
                'validation_passed': False,
                'confidence_score': 0.0,
            }
```

Test evaluators like any other Python code:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class ExactMatch(Evaluator):
    """Check if output exactly matches expected output."""

    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return ctx.output == ctx.expected_output


def test_exact_match():
    evaluator = ExactMatch()

    # Test match
    ctx = EvaluatorContext(
        name='test',
        inputs='input',
        metadata=None,
        expected_output='expected',
        output='expected',
        duration=0.1,
        _span_tree=None,
        attributes={},
        metrics={},
    )
    assert evaluator.evaluate(ctx) is True

    # Test mismatch
    ctx.output = 'different'
    assert evaluator.evaluate(ctx) is False
```

Each evaluator should check one thing:
```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext


def check_format(output: str) -> bool:
    return output.startswith('{')


def check_content(output: str) -> bool:
    return len(output) > 10


def check_length(output: str) -> bool:
    return len(output) < 1000


def check_spelling(output: str) -> bool:
    return True  # Placeholder


# Bad: Doing too much
@dataclass
class EverythingChecker(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> dict:
        return {
            'format_valid': check_format(ctx.output),
            'content_good': check_content(ctx.output),
            'length_ok': check_length(ctx.output),
            'spelling_correct': check_spelling(ctx.output),
        }


# Good: Separate evaluators
@dataclass
class FormatValidator(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return check_format(ctx.output)


@dataclass
class ContentChecker(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return check_content(ctx.output)


@dataclass
class LengthChecker(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return check_length(ctx.output)


@dataclass
class SpellingChecker(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        return check_spelling(ctx.output)
```

Some exceptions to this:

- When there is a significant amount of shared computation or network request latency, it may be better to have a single evaluator calculate all dependent outputs together (see the sketch below).
- If multiple checks are tightly coupled or very closely related to each other, it may make sense to include all their logic in one evaluator.

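As a minimal sketch of the shared-computation case (the class and result key names are illustrative, not part of the library), the output is parsed once and several related results are derived from the parsed value:

```python
import json
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator, EvaluatorContext, EvaluatorOutput


@dataclass
class ParsedJSONChecks(Evaluator):
    """Parse the output once, then derive several related results from it."""

    required_keys: tuple[str, ...] = ('id', 'status')

    def evaluate(self, ctx: EvaluatorContext) -> EvaluatorOutput:
        try:
            parsed = json.loads(ctx.output)  # shared (potentially expensive) step
        except Exception:
            return {'json_valid': False, 'has_required_keys': False, 'field_count': 0}
        if not isinstance(parsed, dict):
            return {'json_valid': True, 'has_required_keys': False, 'field_count': 0}
        return {
            'json_valid': True,
            'has_required_keys': all(k in parsed for k in self.required_keys),
            'field_count': len(parsed),
        }
```
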
Handle missing expected output gracefully:

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext


@dataclass
class SafeEvaluator(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
        if ctx.expected_output is None:
            return EvaluationReason(
                value=True,
                reason='Skipped: no expected output provided',
            )

        # Your evaluation logic
        ...
```

Give specific, informative reasons rather than generic ones:

```python
from dataclasses import dataclass
from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext

@dataclass
class HelpfulEvaluator(Evaluator):
    def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
        # Bad
        return EvaluationReason(value=False, reason='Failed')

        # Good
        return EvaluationReason(
            value=False,
            reason=f'Expected {ctx.expected_output!r}, got {ctx.output!r}',
        )
```

Set timeouts on external calls so a slow dependency can't stall the evaluation:

```python
from dataclasses import dataclass
from pydantic_evals.evaluators import Evaluator, EvaluatorContext

@dataclass
class APIEvaluator(Evaluator):
    timeout: float = 10.0

    async def _call_api(self, output: str) -> bool:
        # Placeholder for API call
        return True

    async def evaluate(self, ctx: EvaluatorContext) -> bool:
        import asyncio

        try:
            return await asyncio.wait_for(
                self._call_api(ctx.output),
                timeout=self.timeout,
            )
        except asyncio.TimeoutError:
            return False
```

- Span-Based Evaluation - Using OpenTelemetry spans
- Examples - Practical examples