```


## Experiment-Level Metadata

In addition to case-level metadata, you can also pass experiment-level metadata when calling [`evaluate()`][pydantic_evals.Dataset.evaluate]:

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(
    cases=[
        Case(
            inputs='test',
            metadata={'difficulty': 'easy'},  # Case-level metadata
        )
    ]
)


async def task(inputs: str) -> str:
    return f'Result: {inputs}'


# Pass experiment-level metadata
async def main():
    report = await dataset.evaluate(
        task,
        metadata={
            'model': 'gpt-4o',
            'prompt_version': 'v2.1',
            'temperature': 0.7,
        },
    )

    # Access experiment metadata in the report
    print(report.experiment_metadata)
    #> {'model': 'gpt-4o', 'prompt_version': 'v2.1', 'temperature': 0.7}
```

### When to Use Experiment Metadata

Experiment metadata is useful for tracking configuration that applies to the entire evaluation run:

- **Model configuration**: Model name, version, parameters
- **Prompt versioning**: Which prompt template was used
- **Infrastructure**: Deployment environment, region
- **Experiment context**: Developer name, feature branch, commit hash (see the sketch below)

This metadata is especially valuable when:

- Comparing multiple evaluation runs over time
- Tracking which configuration produced which results
- Reproducing evaluation results from historical data

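As a concrete example, here is a minimal sketch of recording experiment context alongside model configuration. The `git_revision()` helper is purely illustrative (it assumes the evaluation runs inside a Git checkout); only the `metadata` argument to `evaluate()` comes from Pydantic Evals itself:

```python
import subprocess

from pydantic_evals import Case, Dataset

dataset = Dataset(cases=[Case(inputs='test')])


def git_revision() -> str:
    """Illustrative helper: return the current commit hash, or 'unknown' outside a Git checkout."""
    try:
        return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], text=True).strip()
    except (OSError, subprocess.CalledProcessError):
        return 'unknown'


async def task(inputs: str) -> str:
    return f'Result: {inputs}'


async def main():
    await dataset.evaluate(
        task,
        metadata={
            'model': 'gpt-4o',  # Model configuration
            'prompt_version': 'v2.1',  # Prompt versioning
            'commit': git_revision(),  # Experiment context
        },
    )
```
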
### Viewing in Reports

Experiment metadata appears at the top of printed reports:

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(cases=[Case(inputs='hello', expected_output='HELLO')])


async def task(text: str) -> str:
    return text.upper()


async def main():
    report = await dataset.evaluate(
        task,
        metadata={'model': 'gpt-4o', 'version': 'v1.0'},
    )

    print(report.render())
    """
    ╭─ Evaluation Summary: task ─╮
    │ model: gpt-4o              │
    │ version: v1.0              │
    ╰────────────────────────────╯
    ┏━━━━━━━━━━┳━━━━━━━━━━┓
    ┃ Case ID  ┃ Duration ┃
    ┡━━━━━━━━━━╇━━━━━━━━━━┩
    │ Case 1   │     10ms │
    ├──────────┼──────────┤
    │ Averages │     10ms │
    └──────────┴──────────┘
    """
```

## Synchronization between Tasks and Experiment Metadata

Experiment metadata is for *recording* configuration, not *configuring* the task.
The metadata dict doesn't automatically configure your task's behavior; you must ensure the values in the metadata dict match what your task actually uses.
For example, it's easy to accidentally have metadata claim `temperature: 0.7` while your task actually uses `temperature: 1.0`, leading to incorrect experiment tracking and unreproducible results.

To avoid this problem, we recommend establishing a single source of truth for configuration that both your task and metadata reference.
Below are a few suggested patterns for achieving this synchronization.

### Pattern 1: Shared Module Constants

For simpler cases, use module-level constants:

```python
from pydantic_ai import Agent
from pydantic_evals import Case, Dataset

# Module constants as single source of truth
MODEL_NAME = 'openai:gpt-5-mini'
TEMPERATURE = 0.7
SYSTEM_PROMPT = 'You are a helpful assistant.'

agent = Agent(MODEL_NAME, model_settings={'temperature': TEMPERATURE}, system_prompt=SYSTEM_PROMPT)


async def task(inputs: str) -> str:
    result = await agent.run(inputs)
    return result.output


async def main():
    dataset = Dataset(cases=[Case(inputs='What is the capital of France?')])

    # Metadata references the same constants
    await dataset.evaluate(
        task,
        metadata={
            'model': MODEL_NAME,
            'temperature': TEMPERATURE,
            'system_prompt': SYSTEM_PROMPT,
        },
    )
```

### Pattern 2: Configuration Object (Recommended)

Define configuration once and use it everywhere:

```python
from dataclasses import asdict, dataclass

from pydantic_ai import Agent
from pydantic_evals import Case, Dataset


@dataclass
class TaskConfig:
    """Single source of truth for task configuration.

    Includes all variables you'd like to see in experiment metadata.
    """

    model: str
    temperature: float
    max_tokens: int
    prompt_version: str


# Define configuration once
config = TaskConfig(
    model='openai:gpt-5-mini',
    temperature=0.7,
    max_tokens=500,
    prompt_version='v2.1',
)

# Use config in task
agent = Agent(
    config.model,
    model_settings={'temperature': config.temperature, 'max_tokens': config.max_tokens},
)


async def task(inputs: str) -> str:
    """Task uses the same config that's recorded in metadata."""
    result = await agent.run(inputs)
    return result.output


# Evaluate with metadata derived from the same config
async def main():
    dataset = Dataset(cases=[Case(inputs='What is the capital of France?')])

    report = await dataset.evaluate(
        task,
        metadata=asdict(config),  # Guaranteed to match task behavior
    )

    print(report.experiment_metadata)
    """
    {
        'model': 'openai:gpt-5-mini',
        'temperature': 0.7,
        'max_tokens': 500,
        'prompt_version': 'v2.1',
    }
    """
```

If a global task configuration is problematic, you can instead create the `TaskConfig` object at the task
call-site and pass it to the agent via `deps` or similar. In that case you still need to guarantee that the
object your task uses is the same one passed as `metadata` in the call to `Dataset.evaluate`.

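As a rough sketch of that approach (assuming the system prompt is supplied through `deps` and the model and its settings are passed per run; adapt this to however your task already receives its configuration):

```python
from dataclasses import asdict, dataclass

from pydantic_ai import Agent, RunContext
from pydantic_evals import Case, Dataset


@dataclass
class TaskConfig:
    model: str
    temperature: float
    system_prompt: str


agent = Agent(deps_type=TaskConfig)


@agent.system_prompt
def system_prompt(ctx: RunContext[TaskConfig]) -> str:
    # The prompt comes from the same config object that gets recorded as metadata
    return ctx.deps.system_prompt


async def main():
    # Config is created at the call site rather than at module level
    config = TaskConfig(
        model='openai:gpt-5-mini',
        temperature=0.7,
        system_prompt='You are a helpful assistant.',
    )

    async def task(inputs: str) -> str:
        result = await agent.run(
            inputs,
            deps=config,
            model=config.model,
            model_settings={'temperature': config.temperature},
        )
        return result.output

    dataset = Dataset(cases=[Case(inputs='What is the capital of France?')])
    await dataset.evaluate(task, metadata=asdict(config))  # Same object in both places
```
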
### Anti-Pattern: Duplicate Configuration

**Avoid this common mistake**:

```python
from pydantic_ai import Agent
from pydantic_evals import Case, Dataset

# ❌ BAD: Configuration defined in multiple places
agent = Agent('openai:gpt-5-mini', model_settings={'temperature': 0.7})


async def task(inputs: str) -> str:
    result = await agent.run(inputs)
    return result.output


async def main():
    dataset = Dataset(cases=[Case(inputs='test')])

    # ❌ BAD: Metadata manually typed - easy to get out of sync
    await dataset.evaluate(
        task,
        metadata={
            'model': 'openai:gpt-5-mini',  # Duplicated! Could diverge from agent definition
            'temperature': 0.8,  # ⚠️ WRONG! Task actually uses 0.7
        },
    )
```

In this anti-pattern, the metadata claims `temperature: 0.8` but the task uses `0.7`. This leads to:

- Incorrect experiment tracking
- Inability to reproduce results
- Confusion when comparing runs
- Wasted time debugging "why results differ"

## Metrics vs Attributes vs Metadata

Understanding the differences:

| Feature | Metrics | Attributes | Case Metadata | Experiment Metadata |
|---------|---------|------------|---------------|---------------------|
| **Set in** | Task execution | Task execution | Case definition | `evaluate()` call |
| **Type** | int, float | Any | Any | Any |
| **Purpose** | Quantitative | Qualitative | Test data | Experiment config |
| **Used for** | Aggregation | Context | Input to task | Tracking runs |
| **Available to** | Evaluators | Evaluators | Task & Evaluators | Report only |
| **Scope** | Per case | Per case | Per case | Per experiment |

```python
from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute

# Case Metadata: Defined in case (before execution)
case = Case(
    inputs='question',
    metadata={'difficulty': 'hard', 'category': 'math'},  # Per-case metadata
)

dataset = Dataset(cases=[case])


# Metrics & Attributes: Recorded during execution
async def task(inputs):
    # These are recorded during execution for each case
    increment_eval_metric('tokens', 100)
    set_eval_attribute('model', 'gpt-4o')
    return f'Result: {inputs}'


async def main():
    # Experiment Metadata: Defined at evaluation time
    await dataset.evaluate(
        task,
        metadata={  # Experiment-level metadata
            'prompt_version': 'v2.1',
            'temperature': 0.7,
        },
    )
```

## Troubleshooting