Skip to content

Commit a5a471e

Browse files
dmontagu and Kludex authored
Add more examples and make various improvements to evals (#1323)
Co-authored-by: Marcelo Trylesinski <[email protected]>
1 parent 07e31e0 commit a5a471e

21 files changed

+1557
-390
lines changed

docs/evals.md

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,6 @@ cases:
282282
dietary_restriction: vegetarian
283283
metadata:
284284
focus: vegetarian
285-
expected_output: null
286285
evaluators:
287286
- LLMJudge: Recipe should not contain meat or animal products
288287
- name: gluten_free_recipe
@@ -291,7 +290,6 @@ cases:
291290
dietary_restriction: gluten-free
292291
metadata:
293292
focus: gluten-free
294-
expected_output: null
295293
evaluators:
296294
- LLMJudge: Recipe should not contain gluten or wheat products
297295
evaluators:
@@ -537,6 +535,7 @@ from pathlib import Path
537535

538536
from pydantic import BaseModel, Field
539537

538+
from pydantic_evals import Dataset
540539
from pydantic_evals.generation import generate_dataset
541540

542541

@@ -569,9 +568,7 @@ class MetadataType(BaseModel, use_attribute_docstrings=True): # (3)!
569568

570569
async def main():
571570
dataset = await generate_dataset( # (4)!
572-
inputs_type=QuestionInputs,
573-
output_type=AnswerOutput,
574-
metadata_type=MetadataType,
571+
dataset_type=Dataset[QuestionInputs, AnswerOutput, MetadataType],
575572
n_examples=2,
576573
extra_instructions="""
577574
Generate question-answer pairs about world capitals and landmarks.
@@ -624,14 +621,13 @@ from pathlib import Path
624621

625622
from generate_dataset_example import AnswerOutput, MetadataType, QuestionInputs
626623

624+
from pydantic_evals import Dataset
627625
from pydantic_evals.generation import generate_dataset
628626

629627

630628
async def main():
631629
dataset = await generate_dataset( # (1)!
632-
inputs_type=QuestionInputs,
633-
output_type=AnswerOutput,
634-
metadata_type=MetadataType,
630+
dataset_type=Dataset[QuestionInputs, AnswerOutput, MetadataType],
635631
n_examples=2,
636632
extra_instructions="""
637633
Generate question-answer pairs about world capitals and landmarks.
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
from dataclasses import dataclass
2+
from datetime import timedelta
3+
4+
from pydantic_evals.evaluators import (
5+
Evaluator,
6+
EvaluatorContext,
7+
EvaluatorOutput,
8+
)
9+
from pydantic_evals.otel import SpanQuery
10+
11+
from pydantic_ai_examples.evals.models import (
12+
TimeRangeBuilderSuccess,
13+
TimeRangeInputs,
14+
TimeRangeResponse,
15+
)
16+
17+
18+
@dataclass
class ValidateTimeRange(Evaluator[TimeRangeInputs, TimeRangeResponse]):
    """Validate a successful time-range result against basic sanity rules.

    Produces two boolean scores: the window must span at most 30 days, and it
    must not extend past the caller-supplied current time (``inputs['now']``).
    Error outputs are not scored.
    """

    def evaluate(
        self, ctx: EvaluatorContext[TimeRangeInputs, TimeRangeResponse]
    ) -> EvaluatorOutput:
        # Only successful results carry a time window to validate.
        if not isinstance(ctx.output, TimeRangeBuilderSuccess):
            return {}  # No evaluation needed for errors

        start = ctx.output.min_timestamp_with_offset
        end = ctx.output.max_timestamp_with_offset
        return {
            'window_is_not_too_long': end - start <= timedelta(days=30),
            'window_is_not_in_the_future': end <= ctx.inputs['now'],
        }
32+
33+
34+
@dataclass
class UserMessageIsConcise(Evaluator[TimeRangeInputs, TimeRangeResponse]):
    """Score whether the user-facing text is under 50 words.

    Uses the success ``explanation`` or the error ``error_message``, whichever
    the output carries; when that text is absent, nothing is scored.
    """

    async def evaluate(
        self,
        ctx: EvaluatorContext[TimeRangeInputs, TimeRangeResponse],
    ) -> EvaluatorOutput:
        # Pick whichever message would actually be shown to the user.
        message = (
            ctx.output.explanation
            if isinstance(ctx.output, TimeRangeBuilderSuccess)
            else ctx.output.error_message
        )
        if message is None:
            return {}  # Nothing user-facing to measure.
        return len(message.split()) < 50
49+
50+
51+
@dataclass
class AgentCalledTool(Evaluator[object, object, object]):
    """Check that the named agent invoked the named tool during the run.

    Searches the recorded span tree for an 'agent run' span belonging to
    ``agent_name`` that has a descendant 'running tool' span for ``tool_name``.
    """

    agent_name: str
    tool_name: str

    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> bool:
        # The descendant span emitted when the tool itself executes.
        tool_query = SpanQuery(
            name_equals='running tool',
            has_attributes={'gen_ai.tool.name': self.tool_name},
        )
        # Match this agent's run, without descending into nested agent runs,
        # and require the tool span somewhere beneath it.
        agent_query = SpanQuery(
            name_equals='agent run',
            has_attributes={'agent_name': self.agent_name},
            stop_recursing_when=SpanQuery(name_equals='agent run'),
            some_descendant_has=tool_query,
        )
        return ctx.span_tree.any(agent_query)
68+
69+
70+
# The custom evaluator classes defined in this module, exported as one tuple.
# NOTE(review): presumably passed as the custom evaluator types when loading
# the evals dataset — confirm against the loader call site.
CUSTOM_EVALUATOR_TYPES = (ValidateTimeRange, UserMessageIsConcise, AgentCalledTool)
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# yaml-language-server: $schema=time_range_v1_schema.json
2+
cases:
3+
- name: Single day mention
4+
inputs:
5+
prompt: I want to see logs from 2021-05-08
6+
now: '2023-10-28T09:30:00Z'
7+
expected_output:
8+
min_timestamp_with_offset: '2021-05-08T00:00:00Z'
9+
max_timestamp_with_offset: '2021-05-08T23:59:59Z'
10+
explanation: You mentioned a single day (2021-05-08). The entire day is used.
11+
evaluators:
12+
- IsInstance: TimeRangeBuilderSuccess
13+
- name: Ambiguous mention
14+
inputs:
15+
prompt: Check logs from last week or so, around early May
16+
now: '2023-10-28T09:30:00Z'
17+
expected_output:
18+
min_timestamp_with_offset: '2023-10-21T09:30:00Z'
19+
max_timestamp_with_offset: '2023-10-28T09:30:00Z'
20+
explanation: We interpret the mention of early May as extraneous, focusing on
21+
'last week or so' from the current time.
22+
evaluators:
23+
- IsInstance: TimeRangeBuilderSuccess
24+
- LLMJudge: We want to interpret conflicting references by default to the more recent
25+
timeframe; confirm the explanation addresses ignoring early May.
26+
- name: Single datetime mention
27+
inputs:
28+
prompt: Show me the logs at 2023-10-27 2:00pm
29+
now: '2023-10-28T09:30:00Z'
30+
expected_output:
31+
min_timestamp_with_offset: '2023-10-27T13:50:00Z'
32+
max_timestamp_with_offset: '2023-10-27T14:10:00Z'
33+
explanation: You only mentioned a single point in time, so a 10-minute window
34+
around that time is used.
35+
evaluators:
36+
- IsInstance: TimeRangeBuilderSuccess
37+
- name: Relative mention without date
38+
inputs:
39+
prompt: Check logs from 2 hours ago
40+
now: '2023-10-28T09:30:00Z'
41+
expected_output:
42+
min_timestamp_with_offset: '2023-10-28T07:30:00Z'
43+
max_timestamp_with_offset: '2023-10-28T09:30:00Z'
44+
explanation: You requested logs starting from 2 hours prior to the current time.
45+
evaluators:
46+
- IsInstance: TimeRangeBuilderSuccess
47+
- name: Impossible range
48+
inputs:
49+
prompt: Check logs from 2025, but make sure they are also from 2020
50+
now: '2023-10-28T09:30:00Z'
51+
expected_output:
52+
error_message: 'Conflicting time instructions: 2025 and 2020 cannot both apply.'
53+
evaluators:
54+
- IsInstance: TimeRangeBuilderError
55+
- name: No mention
56+
inputs:
57+
prompt: Show me some logs
58+
now: '2023-10-28T09:30:00Z'
59+
expected_output:
60+
error_message: No timeframe could be inferred from your request.
61+
evaluators:
62+
- IsInstance: TimeRangeBuilderError
63+
- name: Ambiguous elliptical mention
64+
inputs:
65+
prompt: Check logs from around the start of last quarter
66+
now: '2023-07-15T08:00:00Z'
67+
expected_output:
68+
min_timestamp_with_offset: '2023-04-01T00:00:00Z'
69+
max_timestamp_with_offset: '2023-04-05T23:59:59Z'
70+
explanation: We interpret 'around the start of last quarter' as the first few
71+
days of Q2 2023.
72+
evaluators:
73+
- IsInstance: TimeRangeBuilderSuccess
74+
- name: Far future mention
75+
inputs:
76+
prompt: Check logs from January 3050
77+
now: '2023-10-28T09:30:00Z'
78+
expected_output:
79+
min_timestamp_with_offset: '3050-01-01T00:00:00Z'
80+
max_timestamp_with_offset: '3050-01-31T23:59:59Z'
81+
explanation: You requested logs from January 3050. The entire month is used.
82+
evaluators:
83+
- IsInstance: TimeRangeBuilderSuccess
84+
- name: Confusing relative references
85+
inputs:
86+
prompt: Check logs from yesterday but also last year
87+
now: '2023-10-28T09:30:00Z'
88+
expected_output:
89+
error_message: 'Conflicting instructions: ''yesterday'' versus ''last year'' could
90+
not be reconciled.'
91+
evaluators:
92+
- IsInstance: TimeRangeBuilderError
93+
- name: Range from speech
94+
inputs:
95+
prompt: I want the logs from December 25th to December 26th, so I can see what
96+
happened on Christmas day. But also it might be earlier.
97+
now: '2023-10-28T09:30:00Z'
98+
expected_output:
99+
min_timestamp_with_offset: '2023-12-25T00:00:00Z'
100+
max_timestamp_with_offset: '2023-12-26T23:59:59Z'
101+
explanation: You asked specifically for December 25th to December 26th. The mention
102+
of an earlier date is ignored since a range was provided.
103+
evaluators:
104+
- IsInstance: TimeRangeBuilderSuccess
105+
evaluators:
106+
- LLMJudge: Ensure the explanation or error_message fields are truly appropriate for
107+
user display, in a second-person or friendly style.

0 commit comments

Comments
 (0)