docs/evals/how-to/metrics-attributes.md (264 additions, 13 deletions)
@@ -418,34 +418,285 @@ class QualityEvaluator(Evaluator):
```


## Experiment-Level Metadata

In addition to case-level metadata, you can also pass experiment-level metadata when calling [`evaluate()`][pydantic_evals.Dataset.evaluate]:

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(
cases=[
Case(
inputs='test',
metadata={'difficulty': 'easy'}, # Case-level metadata
)
]
)


async def task(inputs: str) -> str:
return f'Result: {inputs}'


# Pass experiment-level metadata
async def main():
report = await dataset.evaluate(
task,
metadata={
'model': 'gpt-4o',
'prompt_version': 'v2.1',
'temperature': 0.7,
},
)

# Access experiment metadata in the report
print(report.experiment_metadata)
#> {'model': 'gpt-4o', 'prompt_version': 'v2.1', 'temperature': 0.7}
```
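The `evaluate()` call added in this PR also accepts a `tags` sequence, which is attached to the experiment span rather than stored on the report (see the `dataset.py` changes below). A minimal sketch, reusing the `dataset` and `task` above; the tag values are purely illustrative:

```python
async def main():
    await dataset.evaluate(
        task,
        metadata={'model': 'gpt-4o', 'prompt_version': 'v2.1'},
        tags=['nightly', 'baseline'],  # illustrative tags for filtering experiment spans
    )
```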

### When to Use Experiment Metadata

Experiment metadata is useful for tracking configuration that applies to the entire evaluation run:

- **Model configuration**: Model name, version, parameters
- **Prompt versioning**: Which prompt template was used
- **Infrastructure**: Deployment environment, region
- **Experiment context**: Developer name, feature branch, commit hash (see the sketch below)

This metadata is especially valuable when:

- Comparing multiple evaluation runs over time
- Tracking which configuration produced which results
- Reproducing evaluation results from historical data
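
For example, here is a minimal sketch of capturing the git-based experiment context mentioned above, assuming the evaluation runs from inside a git checkout and reusing the `dataset` and `task` from the earlier example (the `git_experiment_context` helper is hypothetical):

```python
import subprocess


def git_experiment_context() -> dict[str, str]:
    """Collect the current branch and commit hash from the local git checkout."""

    def git(*args: str) -> str:
        return subprocess.run(
            ['git', *args], capture_output=True, text=True, check=True
        ).stdout.strip()

    return {
        'git_branch': git('rev-parse', '--abbrev-ref', 'HEAD'),
        'git_commit': git('rev-parse', '--short', 'HEAD'),
    }


async def main():
    await dataset.evaluate(
        task,
        metadata={'model': 'gpt-4o', **git_experiment_context()},
    )
```

Recording the commit hash this way makes it straightforward to check out the exact code that produced a historical report.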

### Viewing in Reports

Experiment metadata appears at the top of printed reports:

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(cases=[Case(inputs='hello', expected_output='HELLO')])


async def task(text: str) -> str:
return text.upper()


async def main():
report = await dataset.evaluate(
task,
metadata={'model': 'gpt-4o', 'version': 'v1.0'},
)

print(report.render())
"""
╭─ Evaluation Summary: task ─╮
│ model: gpt-4o │
│ version: v1.0 │
╰────────────────────────────╯
┏━━━━━━━━━━┳━━━━━━━━━━┓
┃ Case ID ┃ Duration ┃
┡━━━━━━━━━━╇━━━━━━━━━━┩
│ Case 1 │ 10ms │
├──────────┼──────────┤
│ Averages │ 10ms │
└──────────┴──────────┘
"""
```

## Synchronization between Tasks and Experiment Metadata

Experiment metadata is for *recording* configuration, not *configuring* the task.
The metadata dict doesn't automatically configure your task's behavior; you must ensure the values in the metadata dict match what your task actually uses.
For example, it's easy to accidentally have metadata claim `temperature: 0.7` while your task actually uses `temperature: 1.0`, leading to incorrect experiment tracking and unreproducible results.

To avoid this problem, we recommend establishing a single source of truth for configuration that both your task and metadata reference.
Below are a few suggested patterns for achieving this synchronization.

### Pattern 1: Shared Module Constants

For simpler cases, use module-level constants:

```python
from pydantic_ai import Agent
from pydantic_evals import Case, Dataset

# Module constants as single source of truth
MODEL_NAME = 'openai:gpt-5-mini'
TEMPERATURE = 0.7
SYSTEM_PROMPT = 'You are a helpful assistant.'

agent = Agent(MODEL_NAME, model_settings={'temperature': TEMPERATURE}, system_prompt=SYSTEM_PROMPT)


async def task(inputs: str) -> str:
result = await agent.run(inputs)
return result.output


async def main():
dataset = Dataset(cases=[Case(inputs='What is the capital of France?')])

# Metadata references same constants
await dataset.evaluate(
task,
metadata={
'model': MODEL_NAME,
'temperature': TEMPERATURE,
'system_prompt': SYSTEM_PROMPT,
},
)
```

### Pattern 2: Configuration Object (Recommended)

Define configuration once and use it everywhere:

```python
from dataclasses import asdict, dataclass

from pydantic_ai import Agent
from pydantic_evals import Case, Dataset


@dataclass
class TaskConfig:
"""Single source of truth for task configuration.

Includes all variables you'd like to see in experiment metadata.
"""

model: str
temperature: float
max_tokens: int
prompt_version: str


# Define configuration once
config = TaskConfig(
model='openai:gpt-5-mini',
temperature=0.7,
max_tokens=500,
prompt_version='v2.1',
)

# Use config in task
agent = Agent(
config.model,
model_settings={'temperature': config.temperature, 'max_tokens': config.max_tokens},
)


async def task(inputs: str) -> str:
"""Task uses the same config that's recorded in metadata."""
result = await agent.run(inputs)
return result.output


# Evaluate with metadata derived from the same config
async def main():
dataset = Dataset(cases=[Case(inputs='What is the capital of France?')])

report = await dataset.evaluate(
task,
metadata=asdict(config), # Guaranteed to match task behavior
)

print(report.experiment_metadata)
"""
{
'model': 'openai:gpt-5-mini',
'temperature': 0.7,
'max_tokens': 500,
'prompt_version': 'v2.1',
}
"""
```

If a global task configuration is problematic, you can instead create the `TaskConfig` object at the task call site and pass it to the agent via `deps` or similar. In that case you still need to guarantee that it is the same object passed as `metadata` in the call to `Dataset.evaluate`; one way to do this is sketched below.
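
A minimal sketch of this call-site approach, using a closure rather than `deps` and assuming `Agent.run` accepts per-run `model` and `model_settings` overrides:

```python
from dataclasses import asdict, dataclass

from pydantic_ai import Agent
from pydantic_evals import Case, Dataset


@dataclass
class TaskConfig:
    model: str
    temperature: float


agent = Agent()  # model and settings are supplied per run below


def make_task(config: TaskConfig):
    """Build a task bound to one specific config object."""

    async def task(inputs: str) -> str:
        result = await agent.run(
            inputs,
            model=config.model,
            model_settings={'temperature': config.temperature},
        )
        return result.output

    return task


async def main():
    config = TaskConfig(model='openai:gpt-5-mini', temperature=0.7)
    dataset = Dataset(cases=[Case(inputs='What is the capital of France?')])

    # The same object drives both task behavior and the recorded metadata
    await dataset.evaluate(make_task(config), metadata=asdict(config))
```

Because `make_task` and `metadata` consume the same `config` instance, the recorded metadata cannot silently diverge from the task's actual behavior.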

### Anti-Pattern: Duplicate Configuration

**Avoid this common mistake**:

```python
from pydantic_ai import Agent
from pydantic_evals import Case, Dataset

# ❌ BAD: Configuration defined in multiple places
agent = Agent('openai:gpt-5-mini', model_settings={'temperature': 0.7})


async def task(inputs: str) -> str:
result = await agent.run(inputs)
return result.output


async def main():
dataset = Dataset(cases=[Case(inputs='test')])

# ❌ BAD: Metadata manually typed - easy to get out of sync
await dataset.evaluate(
task,
metadata={
'model': 'openai:gpt-5-mini', # Duplicated! Could diverge from agent definition
'temperature': 0.8, # ⚠️ WRONG! Task actually uses 0.7
},
)
```

In this anti-pattern, the metadata claims `temperature: 0.8` but the task uses `0.7`. This leads to:

- Incorrect experiment tracking
- Inability to reproduce results
- Confusion when comparing runs
- Wasted time debugging "why results differ"

## Metrics vs Attributes vs Metadata

Understanding the differences:

| Feature | Metrics | Attributes | Case Metadata | Experiment Metadata |
|---------|---------|------------|---------------|---------------------|
| **Set in** | Task execution | Task execution | Case definition | `evaluate()` call |
| **Type** | int, float | Any | Any | Any |
| **Purpose** | Quantitative | Qualitative | Test data | Experiment config |
| **Used for** | Aggregation | Context | Input to task | Tracking runs |
| **Available to** | Evaluators | Evaluators | Task & Evaluators | Report only |
| **Scope** | Per case | Per case | Per case | Per experiment |

```python
from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute

# Case Metadata: Defined in case (before execution)
case = Case(
    inputs='question',
    metadata={'difficulty': 'hard', 'category': 'math'},  # Per-case metadata
)

dataset = Dataset(cases=[case])


# Metrics & Attributes: Recorded during execution
async def task(inputs):
    # These are recorded during execution for each case
    increment_eval_metric('tokens', 100)
    set_eval_attribute('model', 'gpt-4o')
    return f'Result: {inputs}'


async def main():
    # Experiment Metadata: Defined at evaluation time
    await dataset.evaluate(
        task,
        metadata={  # Experiment-level metadata
            'prompt_version': 'v2.1',
            'temperature': 0.7,
        },
    )
```

## Troubleshooting
pydantic_evals/pydantic_evals/dataset.py (32 additions, 6 deletions)
@@ -265,6 +265,8 @@ async def evaluate(
retry_evaluators: RetryConfig | None = None,
*,
task_name: str | None = None,
metadata: dict[str, Any] | None = None,
tags: Sequence[str] | None = None,
) -> EvaluationReport[InputsT, OutputT, MetadataT]:
"""Evaluates the test cases in the dataset using the given task.

@@ -283,6 +285,8 @@ async def evaluate(
retry_evaluators: Optional retry configuration for evaluator execution.
task_name: Optional override to the name of the task being executed, otherwise the name of the task
function will be used.
metadata: Optional dict of experiment metadata.
tags: Optional sequence of tags to add to the experiment span.

Returns:
A report containing the results of the evaluation.
@@ -294,14 +298,18 @@

limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
if metadata is not None:
extra_attributes['metadata'] = metadata
with (
logfire_span(
'evaluate {name}',
name=name,
task_name=task_name,
dataset_name=self.name,
n_cases=len(self.cases),
**extra_attributes,
_tags=tags,
) as eval_span,
progress_bar or nullcontext(),
):
@@ -339,13 +347,20 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name
name=name,
cases=cases,
failures=failures,
experiment_metadata=metadata,
span_id=span_id,
trace_id=trace_id,
)
full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
if metadata is not None:
full_experiment_metadata['metadata'] = metadata
if tags is not None:
full_experiment_metadata['tags'] = tags
if (averages := report.averages()) is not None:
full_experiment_metadata['averages'] = averages
if averages.assertions is not None:
eval_span.set_attribute('assertion_pass_rate', averages.assertions)
eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
return report

def evaluate_sync(
Expand All @@ -356,6 +371,10 @@ def evaluate_sync(
progress: bool = True,
retry_task: RetryConfig | None = None,
retry_evaluators: RetryConfig | None = None,
*,
task_name: str | None = None,
metadata: dict[str, Any] | None = None,
tags: Sequence[str] | None = None,
) -> EvaluationReport[InputsT, OutputT, MetadataT]:
"""Evaluates the test cases in the dataset using the given task.

Expand All @@ -371,18 +390,25 @@ def evaluate_sync(
progress: Whether to show a progress bar for the evaluation. Defaults to True.
retry_task: Optional retry configuration for the task execution.
retry_evaluators: Optional retry configuration for evaluator execution.
task_name: Optional override to the name of the task being executed, otherwise the name of the task
function will be used.
metadata: Optional dict of experiment metadata.
tags: Optional sequence of tags to add to the experiment span.

Returns:
A report containing the results of the evaluation.
"""
return get_event_loop().run_until_complete(
self.evaluate(
task,
name=name,
max_concurrency=max_concurrency,
progress=progress,
retry_task=retry_task,
retry_evaluators=retry_evaluators,
task_name=task_name,
metadata=metadata,
tags=tags,
)
)
