Skip to content

Commit a5a471e

Browse files
dmontagu and Kludex authored
Add more examples and make various improvements to evals (#1323)
Co-authored-by: Marcelo Trylesinski <[email protected]>
1 parent 07e31e0 commit a5a471e

21 files changed

+1557
-390
lines changed

docs/evals.md

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,6 @@ cases:
282282
dietary_restriction: vegetarian
283283
metadata:
284284
focus: vegetarian
285-
expected_output: null
286285
evaluators:
287286
- LLMJudge: Recipe should not contain meat or animal products
288287
- name: gluten_free_recipe
@@ -291,7 +290,6 @@ cases:
291290
dietary_restriction: gluten-free
292291
metadata:
293292
focus: gluten-free
294-
expected_output: null
295293
evaluators:
296294
- LLMJudge: Recipe should not contain gluten or wheat products
297295
evaluators:
@@ -537,6 +535,7 @@ from pathlib import Path
537535

538536
from pydantic import BaseModel, Field
539537

538+
from pydantic_evals import Dataset
540539
from pydantic_evals.generation import generate_dataset
541540

542541

@@ -569,9 +568,7 @@ class MetadataType(BaseModel, use_attribute_docstrings=True): # (3)!
569568

570569
async def main():
571570
dataset = await generate_dataset( # (4)!
572-
inputs_type=QuestionInputs,
573-
output_type=AnswerOutput,
574-
metadata_type=MetadataType,
571+
dataset_type=Dataset[QuestionInputs, AnswerOutput, MetadataType],
575572
n_examples=2,
576573
extra_instructions="""
577574
Generate question-answer pairs about world capitals and landmarks.
@@ -624,14 +621,13 @@ from pathlib import Path
624621

625622
from generate_dataset_example import AnswerOutput, MetadataType, QuestionInputs
626623

624+
from pydantic_evals import Dataset
627625
from pydantic_evals.generation import generate_dataset
628626

629627

630628
async def main():
631629
dataset = await generate_dataset( # (1)!
632-
inputs_type=QuestionInputs,
633-
output_type=AnswerOutput,
634-
metadata_type=MetadataType,
630+
dataset_type=Dataset[QuestionInputs, AnswerOutput, MetadataType],
635631
n_examples=2,
636632
extra_instructions="""
637633
Generate question-answer pairs about world capitals and landmarks.
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
from dataclasses import dataclass
2+
from datetime import timedelta
3+
4+
from pydantic_evals.evaluators import (
5+
Evaluator,
6+
EvaluatorContext,
7+
EvaluatorOutput,
8+
)
9+
from pydantic_evals.otel import SpanQuery
10+
11+
from pydantic_ai_examples.evals.models import (
12+
TimeRangeBuilderSuccess,
13+
TimeRangeInputs,
14+
TimeRangeResponse,
15+
)
16+
17+
18+
@dataclass
class ValidateTimeRange(Evaluator[TimeRangeInputs, TimeRangeResponse]):
    """Validate a successful time-range result against basic sanity rules.

    Produces two boolean scores: the window must span at most 30 days, and it
    must not extend past the caller-supplied current time (``inputs['now']``).
    Error outputs are not scored.
    """

    def evaluate(
        self, ctx: EvaluatorContext[TimeRangeInputs, TimeRangeResponse]
    ) -> EvaluatorOutput:
        # Only successful results carry a time window to validate.
        if not isinstance(ctx.output, TimeRangeBuilderSuccess):
            return {}  # No evaluation needed for errors

        start = ctx.output.min_timestamp_with_offset
        end = ctx.output.max_timestamp_with_offset
        return {
            'window_is_not_too_long': end - start <= timedelta(days=30),
            'window_is_not_in_the_future': end <= ctx.inputs['now'],
        }
32+
33+
34+
@dataclass
class UserMessageIsConcise(Evaluator[TimeRangeInputs, TimeRangeResponse]):
    """Score whether the user-facing text is under 50 words.

    Uses the success ``explanation`` or the error ``error_message``, whichever
    the output carries; when that text is absent, nothing is scored.
    """

    async def evaluate(
        self,
        ctx: EvaluatorContext[TimeRangeInputs, TimeRangeResponse],
    ) -> EvaluatorOutput:
        # Pick whichever message would actually be shown to the user.
        message = (
            ctx.output.explanation
            if isinstance(ctx.output, TimeRangeBuilderSuccess)
            else ctx.output.error_message
        )
        if message is None:
            return {}  # Nothing user-facing to measure.
        return len(message.split()) < 50
49+
50+
51+
@dataclass
class AgentCalledTool(Evaluator[object, object, object]):
    """Check that the named agent invoked the named tool during the run.

    Searches the recorded span tree for an 'agent run' span belonging to
    ``agent_name`` that has a descendant 'running tool' span for ``tool_name``.
    """

    agent_name: str
    tool_name: str

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:
        # The descendant span emitted when the tool itself executes.
        tool_query = SpanQuery(
            name_equals='running tool',
            has_attributes={'gen_ai.tool.name': self.tool_name},
        )
        # Match this agent's run, without descending into nested agent runs,
        # and require the tool span somewhere beneath it.
        agent_query = SpanQuery(
            name_equals='agent run',
            has_attributes={'agent_name': self.agent_name},
            stop_recursing_when=SpanQuery(name_equals='agent run'),
            some_descendant_has=tool_query,
        )
        return ctx.span_tree.any(agent_query)
68+
69+
70+
# The custom evaluator classes defined in this module, exported as one tuple.
# NOTE(review): presumably passed as the custom evaluator types when loading
# the evals dataset — confirm against the loader call site.
CUSTOM_EVALUATOR_TYPES = (ValidateTimeRange, UserMessageIsConcise, AgentCalledTool)
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# yaml-language-server: $schema=time_range_v1_schema.json
2+
cases:
3+
- name: Single day mention
4+
inputs:
5+
prompt: I want to see logs from 2021-05-08
6+
now: '2023-10-28T09:30:00Z'
7+
expected_output:
8+
min_timestamp_with_offset: '2021-05-08T00:00:00Z'
9+
max_timestamp_with_offset: '2021-05-08T23:59:59Z'
10+
explanation: You mentioned a single day (2021-05-08). The entire day is used.
11+
evaluators:
12+
- IsInstance: TimeRangeBuilderSuccess
13+
- name: Ambiguous mention
14+
inputs:
15+
prompt: Check logs from last week or so, around early May
16+
now: '2023-10-28T09:30:00Z'
17+
expected_output:
18+
min_timestamp_with_offset: '2023-10-21T09:30:00Z'
19+
max_timestamp_with_offset: '2023-10-28T09:30:00Z'
20+
explanation: We interpret the mention of early May as extraneous, focusing on
21+
'last week or so' from the current time.
22+
evaluators:
23+
- IsInstance: TimeRangeBuilderSuccess
24+
- LLMJudge: We want to interpret conflicting references by default to the more recent
25+
timeframe; confirm the explanation addresses ignoring early May.
26+
- name: Single datetime mention
27+
inputs:
28+
prompt: Show me the logs at 2023-10-27 2:00pm
29+
now: '2023-10-28T09:30:00Z'
30+
expected_output:
31+
min_timestamp_with_offset: '2023-10-27T13:50:00Z'
32+
max_timestamp_with_offset: '2023-10-27T14:10:00Z'
33+
explanation: You only mentioned a single point in time, so a 10-minute window
34+
around that time is used.
35+
evaluators:
36+
- IsInstance: TimeRangeBuilderSuccess
37+
- name: Relative mention without date
38+
inputs:
39+
prompt: Check logs from 2 hours ago
40+
now: '2023-10-28T09:30:00Z'
41+
expected_output:
42+
min_timestamp_with_offset: '2023-10-28T07:30:00Z'
43+
max_timestamp_with_offset: '2023-10-28T09:30:00Z'
44+
explanation: You requested logs starting from 2 hours prior to the current time.
45+
evaluators:
46+
- IsInstance: TimeRangeBuilderSuccess
47+
- name: Impossible range
48+
inputs:
49+
prompt: Check logs from 2025, but make sure they are also from 2020
50+
now: '2023-10-28T09:30:00Z'
51+
expected_output:
52+
error_message: 'Conflicting time instructions: 2025 and 2020 cannot both apply.'
53+
evaluators:
54+
- IsInstance: TimeRangeBuilderError
55+
- name: No mention
56+
inputs:
57+
prompt: Show me some logs
58+
now: '2023-10-28T09:30:00Z'
59+
expected_output:
60+
error_message: No timeframe could be inferred from your request.
61+
evaluators:
62+
- IsInstance: TimeRangeBuilderError
63+
- name: Ambiguous elliptical mention
64+
inputs:
65+
prompt: Check logs from around the start of last quarter
66+
now: '2023-07-15T08:00:00Z'
67+
expected_output:
68+
min_timestamp_with_offset: '2023-04-01T00:00:00Z'
69+
max_timestamp_with_offset: '2023-04-05T23:59:59Z'
70+
explanation: We interpret 'around the start of last quarter' as the first few
71+
days of Q2 2023.
72+
evaluators:
73+
- IsInstance: TimeRangeBuilderSuccess
74+
- name: Far future mention
75+
inputs:
76+
prompt: Check logs from January 3050
77+
now: '2023-10-28T09:30:00Z'
78+
expected_output:
79+
min_timestamp_with_offset: '3050-01-01T00:00:00Z'
80+
max_timestamp_with_offset: '3050-01-31T23:59:59Z'
81+
explanation: You requested logs from January 3050. The entire month is used.
82+
evaluators:
83+
- IsInstance: TimeRangeBuilderSuccess
84+
- name: Confusing relative references
85+
inputs:
86+
prompt: Check logs from yesterday but also last year
87+
now: '2023-10-28T09:30:00Z'
88+
expected_output:
89+
error_message: 'Conflicting instructions: ''yesterday'' versus ''last year'' could
90+
not be reconciled.'
91+
evaluators:
92+
- IsInstance: TimeRangeBuilderError
93+
- name: Range from speech
94+
inputs:
95+
prompt: I want the logs from December 25th to December 26th, so I can see what
96+
happened on Christmas day. But also it might be earlier.
97+
now: '2023-10-28T09:30:00Z'
98+
expected_output:
99+
min_timestamp_with_offset: '2023-12-25T00:00:00Z'
100+
max_timestamp_with_offset: '2023-12-26T23:59:59Z'
101+
explanation: You asked specifically for December 25th to December 26th. The mention
102+
of an earlier date is ignored since a range was provided.
103+
evaluators:
104+
- IsInstance: TimeRangeBuilderSuccess
105+
evaluators:
106+
- LLMJudge: Ensure the explanation or error_message fields are truly appropriate for
107+
user display, in a second-person or friendly style.

0 commit comments

Comments
 (0)