docs/evals/how-to/metrics-attributes.md (264 additions, 13 deletions)
@@ -418,34 +418,285 @@ class QualityEvaluator(Evaluator):
```


## Experiment-Level Metadata

In addition to case-level metadata, you can also pass experiment-level metadata when calling [`evaluate()`][pydantic_evals.Dataset.evaluate]:

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(
cases=[
Case(
inputs='test',
metadata={'difficulty': 'easy'}, # Case-level metadata
)
]
)


async def task(inputs: str) -> str:
return f'Result: {inputs}'


# Pass experiment-level metadata
async def main():
report = await dataset.evaluate(
task,
metadata={
'model': 'gpt-4o',
'prompt_version': 'v2.1',
'temperature': 0.7,
},
)

# Access experiment metadata in the report
print(report.experiment_metadata)
#> {'model': 'gpt-4o', 'prompt_version': 'v2.1', 'temperature': 0.7}
```
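The `evaluate()` call added in this PR also accepts a `tags` sequence, which is attached to the experiment span rather than stored on the report (see the `dataset.py` changes below). A minimal sketch, reusing the `dataset` and `task` above; the tag values are purely illustrative:

```python
async def main():
    await dataset.evaluate(
        task,
        metadata={'model': 'gpt-4o', 'prompt_version': 'v2.1'},
        tags=['nightly', 'baseline'],  # illustrative tags for filtering experiment spans
    )
```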

### When to Use Experiment Metadata

Experiment metadata is useful for tracking configuration that applies to the entire evaluation run:

- **Model configuration**: Model name, version, parameters
- **Prompt versioning**: Which prompt template was used
- **Infrastructure**: Deployment environment, region
- **Experiment context**: Developer name, feature branch, commit hash (see the sketch below)

This metadata is especially valuable when:

- Comparing multiple evaluation runs over time
- Tracking which configuration produced which results
- Reproducing evaluation results from historical data
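
For example, here is a minimal sketch of capturing the git-based experiment context mentioned above, assuming the evaluation runs from inside a git checkout and reusing the `dataset` and `task` from the earlier example (the `git_experiment_context` helper is hypothetical):

```python
import subprocess


def git_experiment_context() -> dict[str, str]:
    """Collect the current branch and commit hash from the local git checkout."""

    def git(*args: str) -> str:
        return subprocess.run(
            ['git', *args], capture_output=True, text=True, check=True
        ).stdout.strip()

    return {
        'git_branch': git('rev-parse', '--abbrev-ref', 'HEAD'),
        'git_commit': git('rev-parse', '--short', 'HEAD'),
    }


async def main():
    await dataset.evaluate(
        task,
        metadata={'model': 'gpt-4o', **git_experiment_context()},
    )
```

Recording the commit hash this way makes it straightforward to check out the exact code that produced a historical report.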

### Viewing in Reports

Experiment metadata appears at the top of printed reports:

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(cases=[Case(inputs='hello', expected_output='HELLO')])


async def task(text: str) -> str:
return text.upper()


async def main():
report = await dataset.evaluate(
task,
metadata={'model': 'gpt-4o', 'version': 'v1.0'},
)

print(report.render())
"""
╭─ Evaluation Summary: task ─╮
│ model: gpt-4o │
│ version: v1.0 │
╰────────────────────────────╯
┏━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID ┃ Duration ┃
┡━━━━━━━━━━╇━━━━━━━━━━┩
│ Case 1 │ 10ms │
├──────────┼──────────┤
│ Averages │ 10ms │
└──────────┴──────────┘
"""
```

## Synchronization between Tasks and Experiment Metadata

Experiment metadata is for *recording* configuration, not *configuring* the task.
The metadata dict doesn't automatically configure your task's behavior; you must ensure the values in the metadata dict match what your task actually uses.
For example, it's easy to accidentally have metadata claim `temperature: 0.7` while your task actually uses `temperature: 1.0`, leading to incorrect experiment tracking and unreproducible results.

To avoid this problem, we recommend establishing a single source of truth for configuration that both your task and metadata reference.
Below are a few suggested patterns for achieving this synchronization.

### Pattern 1: Shared Module Constants

For simpler cases, use module-level constants:

```python
from pydantic_ai import Agent
from pydantic_evals import Case, Dataset

# Module constants as single source of truth
MODEL_NAME = 'openai:gpt-5-mini'
TEMPERATURE = 0.7
SYSTEM_PROMPT = 'You are a helpful assistant.'

agent = Agent(MODEL_NAME, model_settings={'temperature': TEMPERATURE}, system_prompt=SYSTEM_PROMPT)


async def task(inputs: str) -> str:
result = await agent.run(inputs)
return result.output


async def main():
dataset = Dataset(cases=[Case(inputs='What is the capital of France?')])

# Metadata references same constants
await dataset.evaluate(
task,
metadata={
'model': MODEL_NAME,
'temperature': TEMPERATURE,
'system_prompt': SYSTEM_PROMPT,
},
)
```

### Pattern 2: Configuration Object (Recommended)

Define configuration once and use it everywhere:

```python
from dataclasses import asdict, dataclass

from pydantic_ai import Agent
from pydantic_evals import Case, Dataset


@dataclass
class TaskConfig:
"""Single source of truth for task configuration.

Includes all variables you'd like to see in experiment metadata.
"""

model: str
temperature: float
max_tokens: int
prompt_version: str


# Define configuration once
config = TaskConfig(
model='openai:gpt-5-mini',
temperature=0.7,
max_tokens=500,
prompt_version='v2.1',
)

# Use config in task
agent = Agent(
config.model,
model_settings={'temperature': config.temperature, 'max_tokens': config.max_tokens},
)


async def task(inputs: str) -> str:
"""Task uses the same config that's recorded in metadata."""
result = await agent.run(inputs)
return result.output


# Evaluate with metadata derived from the same config
async def main():
dataset = Dataset(cases=[Case(inputs='What is the capital of France?')])

report = await dataset.evaluate(
task,
metadata=asdict(config), # Guaranteed to match task behavior
)

print(report.experiment_metadata)
"""
{
'model': 'openai:gpt-5-mini',
'temperature': 0.7,
'max_tokens': 500,
'prompt_version': 'v2.1',
}
"""
```

If a global task configuration is problematic, you can instead create the `TaskConfig` object at the task call site and pass it to the agent via `deps` or similar. In that case you still need to guarantee that it is the same object passed as `metadata` in the call to `Dataset.evaluate`; one way to do this is sketched below.
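
A minimal sketch of this call-site approach, using a closure rather than `deps` and assuming `Agent.run` accepts per-run `model` and `model_settings` overrides:

```python
from dataclasses import asdict, dataclass

from pydantic_ai import Agent
from pydantic_evals import Case, Dataset


@dataclass
class TaskConfig:
    model: str
    temperature: float


agent = Agent()  # model and settings are supplied per run below


def make_task(config: TaskConfig):
    """Build a task bound to one specific config object."""

    async def task(inputs: str) -> str:
        result = await agent.run(
            inputs,
            model=config.model,
            model_settings={'temperature': config.temperature},
        )
        return result.output

    return task


async def main():
    config = TaskConfig(model='openai:gpt-5-mini', temperature=0.7)
    dataset = Dataset(cases=[Case(inputs='What is the capital of France?')])

    # The same object drives both task behavior and the recorded metadata
    await dataset.evaluate(make_task(config), metadata=asdict(config))
```

Because `make_task` and `metadata` consume the same `config` instance, the recorded metadata cannot silently diverge from the task's actual behavior.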

### Anti-Pattern: Duplicate Configuration

**Avoid this common mistake**:

```python
from pydantic_ai import Agent
from pydantic_evals import Case, Dataset

# ❌ BAD: Configuration defined in multiple places
agent = Agent('openai:gpt-5-mini', model_settings={'temperature': 0.7})


async def task(inputs: str) -> str:
result = await agent.run(inputs)
return result.output


async def main():
dataset = Dataset(cases=[Case(inputs='test')])

# ❌ BAD: Metadata manually typed - easy to get out of sync
await dataset.evaluate(
task,
metadata={
'model': 'openai:gpt-5-mini', # Duplicated! Could diverge from agent definition
'temperature': 0.8, # ⚠️ WRONG! Task actually uses 0.7
},
)
```

In this anti-pattern, the metadata claims `temperature: 0.8` but the task uses `0.7`. This leads to:

- Incorrect experiment tracking
- Inability to reproduce results
- Confusion when comparing runs
- Wasted time debugging "why results differ"

## Metrics vs Attributes vs Metadata

Understanding the differences:

| Feature | Metrics | Attributes | Case Metadata | Experiment Metadata |
|---------|---------|------------|---------------|---------------------|
| **Set in** | Task execution | Task execution | Case definition | `evaluate()` call |
| **Type** | int, float | Any | Any | Any |
| **Purpose** | Quantitative | Qualitative | Test data | Experiment config |
| **Used for** | Aggregation | Context | Input to task | Tracking runs |
| **Available to** | Evaluators | Evaluators | Task & Evaluators | Report only |
| **Scope** | Per case | Per case | Per case | Per experiment |

```python
from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute

# Case Metadata: Defined in case (before execution)
case = Case(
    inputs='question',
    metadata={'difficulty': 'hard', 'category': 'math'},  # Per-case metadata
)

dataset = Dataset(cases=[case])


# Metrics & Attributes: Recorded during execution
async def task(inputs):
    # These are recorded during execution for each case
    increment_eval_metric('tokens', 100)
    set_eval_attribute('model', 'gpt-4o')
    return f'Result: {inputs}'


async def main():
    # Experiment Metadata: Defined at evaluation time
    await dataset.evaluate(
        task,
        metadata={  # Experiment-level metadata
            'prompt_version': 'v2.1',
            'temperature': 0.7,
        },
    )
```

## Troubleshooting
pydantic_evals/pydantic_evals/dataset.py (32 additions, 6 deletions)
@@ -265,6 +265,8 @@ async def evaluate(
retry_evaluators: RetryConfig | None = None,
*,
task_name: str | None = None,
metadata: dict[str, Any] | None = None,
tags: Sequence[str] | None = None,
) -> EvaluationReport[InputsT, OutputT, MetadataT]:
"""Evaluates the test cases in the dataset using the given task.

@@ -283,6 +285,8 @@ async def evaluate(
retry_evaluators: Optional retry configuration for evaluator execution.
task_name: Optional override to the name of the task being executed, otherwise the name of the task
function will be used.
metadata: Optional dict of experiment metadata.
tags: Optional sequence of tags to add to the experiment span.

Returns:
A report containing the results of the evaluation.
@@ -294,14 +298,18 @@

limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
if metadata is not None:
extra_attributes['metadata'] = metadata
with (
logfire_span(
'evaluate {name}',
name=name,
task_name=task_name,
dataset_name=self.name,
n_cases=len(self.cases),
**extra_attributes,
_tags=tags,
) as eval_span,
progress_bar or nullcontext(),
):
@@ -339,13 +347,20 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name
name=name,
cases=cases,
failures=failures,
experiment_metadata=metadata,
span_id=span_id,
trace_id=trace_id,
)
full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
if metadata is not None:
full_experiment_metadata['metadata'] = metadata
if tags is not None:
full_experiment_metadata['tags'] = tags
if (averages := report.averages()) is not None:
full_experiment_metadata['averages'] = averages
if averages.assertions is not None:
eval_span.set_attribute('assertion_pass_rate', averages.assertions)
eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
return report

def evaluate_sync(
Expand All @@ -356,6 +371,10 @@ def evaluate_sync(
progress: bool = True,
retry_task: RetryConfig | None = None,
retry_evaluators: RetryConfig | None = None,
*,
task_name: str | None = None,
metadata: dict[str, Any] | None = None,
tags: Sequence[str] | None = None,
) -> EvaluationReport[InputsT, OutputT, MetadataT]:
"""Evaluates the test cases in the dataset using the given task.

Expand All @@ -371,18 +390,25 @@ def evaluate_sync(
progress: Whether to show a progress bar for the evaluation. Defaults to True.
retry_task: Optional retry configuration for the task execution.
retry_evaluators: Optional retry configuration for evaluator execution.
task_name: Optional override to the name of the task being executed, otherwise the name of the task
function will be used.
metadata: Optional dict of experiment metadata.
tags: Optional sequence of tags to add to the experiment span.

Returns:
A report containing the results of the evaluation.
"""
return get_event_loop().run_until_complete(
self.evaluate(
task,
name=name,
max_concurrency=max_concurrency,
progress=progress,
retry_task=retry_task,
retry_evaluators=retry_evaluators,
task_name=task_name,
metadata=metadata,
tags=tags,
)
)
