Skip to content

Commit 1e1a6d5

Browse files
dmontagu and DouweM authored
Expand docs for pydantic-evals (#3213)
Co-authored-by: Douwe Maan <[email protected]>
1 parent a253fad commit 1e1a6d5

File tree

20 files changed

+6723
-648
lines changed

20 files changed

+6723
-648
lines changed

docs/evals.md

Lines changed: 75 additions & 629 deletions
Large diffs are not rendered by default.

docs/evals/core-concepts.md

Lines changed: 493 additions & 0 deletions
Large diffs are not rendered by default.

docs/evals/evaluators/built-in.md

Lines changed: 456 additions & 0 deletions
Large diffs are not rendered by default.

docs/evals/evaluators/custom.md

Lines changed: 806 additions & 0 deletions
Large diffs are not rendered by default.

docs/evals/evaluators/llm-judge.md

Lines changed: 678 additions & 0 deletions
Large diffs are not rendered by default.

docs/evals/evaluators/overview.md

Lines changed: 438 additions & 0 deletions
Large diffs are not rendered by default.

docs/evals/evaluators/span-based.md

Lines changed: 567 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
# Example: Simple Validation
2+
3+
A proof of concept example of evaluating a simple text transformation function with deterministic checks.
4+
5+
## Scenario
6+
7+
We're testing a function that converts text to title case. We want to verify:
8+
9+
- Output is always a string
10+
- Output matches expected format
11+
- Function handles edge cases correctly
12+
- Performance meets requirements
13+
14+
## Complete Example
15+
16+
```python
17+
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import (
    Contains,
    EqualsExpected,
    IsInstance,
    MaxDuration,
)


# The function we're testing — a thin wrapper around str.title().
def to_title_case(text: str) -> str:
    """Convert text to title case."""
    return text.title()


# Create evaluation dataset: basic functionality plus edge cases
# (empty input, punctuation, digits, apostrophes).
dataset = Dataset(
    name='title_case_validation',
    cases=[
        # Basic functionality
        Case(
            name='basic_lowercase',
            inputs='hello world',
            expected_output='Hello World',
        ),
        Case(
            name='basic_uppercase',
            inputs='HELLO WORLD',
            expected_output='Hello World',
        ),
        Case(
            name='mixed_case',
            inputs='HeLLo WoRLd',
            expected_output='Hello World',
        ),
        # Edge cases
        Case(
            name='empty_string',
            inputs='',
            expected_output='',
        ),
        Case(
            name='single_word',
            inputs='hello',
            expected_output='Hello',
        ),
        Case(
            name='with_punctuation',
            inputs='hello, world!',
            expected_output='Hello, World!',
        ),
        Case(
            name='with_numbers',
            inputs='hello 123 world',
            expected_output='Hello 123 World',
        ),
        Case(
            name='apostrophes',
            inputs="don't stop believin'",
            # str.title() capitalizes the letter after an apostrophe —
            # this expected value intentionally documents that quirk.
            expected_output="Don'T Stop Believin'",
        ),
    ],
    evaluators=[
        # Always returns a string
        IsInstance(type_name='str'),

        # Matches expected output
        EqualsExpected(),

        # Output should contain capital letters
        Contains(value='H', evaluation_name='has_capitals'),

        # Should be fast (under 1ms)
        MaxDuration(seconds=0.001),
    ],
)


# Run evaluation
if __name__ == '__main__':
    report = dataset.evaluate_sync(to_title_case)

    # Print results
    report.print(include_input=True, include_output=True)
    """
    Evaluation Summary: to_title_case
    ┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
    ┃ Case ID ┃ Inputs ┃ Outputs ┃ Assertions ┃ Duration ┃
    ┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
    │ basic_lowercase │ hello world │ Hello World │ ✔✔✔✗ │ 10ms │
    ├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
    │ basic_uppercase │ HELLO WORLD │ Hello World │ ✔✔✔✗ │ 10ms │
    ├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
    │ mixed_case │ HeLLo WoRLd │ Hello World │ ✔✔✔✗ │ 10ms │
    ├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
    │ empty_string │ - │ - │ ✔✔✗✗ │ 10ms │
    ├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
    │ single_word │ hello │ Hello │ ✔✔✔✗ │ 10ms │
    ├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
    │ with_punctuation │ hello, world! │ Hello, World! │ ✔✔✔✗ │ 10ms │
    ├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
    │ with_numbers │ hello 123 world │ Hello 123 World │ ✔✔✔✗ │ 10ms │
    ├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
    │ apostrophes │ don't stop believin' │ Don'T Stop Believin' │ ✔✔✗✗ │ 10ms │
    ├──────────────────┼──────────────────────┼──────────────────────┼────────────┼──────────┤
    │ Averages │ │ │ 68.8% ✔ │ 10ms │
    └──────────────────┴──────────────────────┴──────────────────────┴────────────┴──────────┘
    """
    # Check if all passed. averages() may return None, so guard before
    # reading .assertions — the original else branch would raise
    # AttributeError in exactly the case it was meant to report.
    avg = report.averages()
    if avg and avg.assertions == 1.0:
        print('\n✅ All tests passed!')
    elif avg is not None and avg.assertions is not None:
        print(f'\n❌ Some tests failed (pass rate: {avg.assertions:.1%})')
    else:
        print('\n❌ No assertion results available')
    """
    ❌ Some tests failed (pass rate: 68.8%)
    """
135+
```
136+
137+
## Expected Output
138+
139+
```
140+
Evaluation Summary: to_title_case
141+
┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
142+
┃ Case ID ┃ Inputs ┃ Outputs ┃ Assertions ┃ Duration ┃
143+
┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
144+
│ basic_lowercase │ hello world │ Hello World │ ✔✔✔✔ │ <1ms│
145+
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
146+
│ basic_uppercase │ HELLO WORLD │ Hello World │ ✔✔✔✔ │ <1ms│
147+
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
148+
│ mixed_case │ HeLLo WoRLd │ Hello World │ ✔✔✔✔ │ <1ms│
149+
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
150+
│ empty_string │ │ │ ✔✔✗✔ │ <1ms│
151+
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
152+
│ single_word │ hello │ Hello │ ✔✔✔✔ │ <1ms│
153+
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
154+
│ with_punctuation │ hello, world! │ Hello, World! │ ✔✔✔✔ │ <1ms│
155+
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
156+
│ with_numbers │ hello 123 world │ Hello 123 World │ ✔✔✔✔ │ <1ms│
157+
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
158+
│ apostrophes │ don't stop believin' │ Don'T Stop Believin' │ ✔✔✔✔ │ <1ms│
159+
├───────────────────┼──────────────────────┼───────────────────────┼────────────┼──────────┤
160+
│ Averages │ │ │ 96.9% ✔ │ <1ms│
161+
└───────────────────┴──────────────────────┴───────────────────────┴────────────┴──────────┘
162+
163+
✅ All tests passed!
164+
```
165+
166+
Note: The `empty_string` case has one failed assertion (`has_capitals`) because an empty string contains no capital letters.
167+
168+
## Saving and Loading
169+
170+
Save the dataset for future use:
171+
172+
```python {test="skip"}
173+
from typing import Any

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected


# The function we're testing
def to_title_case(text: str) -> str:
    """Convert text to title case."""
    return text.title()


# Build a minimal dataset: one case, one exact-match evaluator.
single_case = Case(inputs='test', expected_output='Test')
dataset: Dataset[str, str, Any] = Dataset(
    cases=[single_case],
    evaluators=[EqualsExpected()],
)

# Persist the dataset to YAML so it can be reused across runs.
dataset.to_file('title_case_tests.yaml')

# Reload it later and evaluate the task function against it.
dataset = Dataset.from_file('title_case_tests.yaml')
report = dataset.evaluate_sync(to_title_case)
197+
```
198+
199+
## Adding More Cases
200+
201+
As you find bugs or edge cases, add them to the dataset:
202+
203+
```python {test="skip"}
204+
from pydantic_evals import Dataset

# Load the previously saved dataset from disk.
dataset = Dataset.from_file('title_case_tests.yaml')

# New cases discovered in the field, as (name, input, expected) tuples:
# - unicode_chars: regression found with accented characters
# - acronyms: str.title() lower-cases all-caps words like USA/FBI
# - long_input: stress case with a very long string
new_cases = [
    ('unicode_chars', 'café résumé', 'Café Résumé'),
    ('acronyms', 'the USA and FBI', 'The Usa And Fbi'),  # Python's title() behavior
    ('long_input', ' '.join(['word'] * 1000), ' '.join(['Word'] * 1000)),
]
for case_name, case_input, expected in new_cases:
    dataset.add_case(
        name=case_name,
        inputs=case_input,
        expected_output=expected,
    )

# Write the updated dataset back to disk.
dataset.to_file('title_case_tests.yaml')
232+
```
233+
234+
## Using with pytest
235+
236+
Integrate with pytest for CI/CD:
237+
238+
```python
239+
import pytest

from pydantic_evals import Dataset


# The function we're testing
def to_title_case(text: str) -> str:
    """Convert text to title case."""
    return text.title()


@pytest.fixture
def title_case_dataset():
    """Load the saved evaluation dataset from disk."""
    return Dataset.from_file('title_case_tests.yaml')


def test_title_case_evaluation(title_case_dataset):
    """Every case must pass all of its assertions."""
    report = title_case_dataset.evaluate_sync(to_title_case)

    avg = report.averages()
    assert avg is not None
    assert avg.assertions == 1.0, f'Some tests failed (pass rate: {avg.assertions:.1%})'


def test_title_case_performance(title_case_dataset):
    """Each case must complete within the 1ms budget."""
    report = title_case_dataset.evaluate_sync(to_title_case)

    for report_case in report.cases:
        duration = report_case.task_duration
        assert duration < 0.001, f'{report_case.name} took {duration}s'
272+
```
273+
274+
## Next Steps
275+
276+
- **[Built-in Evaluators](../evaluators/built-in.md)** - Explore all available evaluators
277+
- **[Custom Evaluators](../evaluators/custom.md)** - Write your own evaluation logic
278+
- **[Dataset Management](../how-to/dataset-management.md)** - Save, load, and manage datasets
279+
- **[Concurrency & Performance](../how-to/concurrency.md)** - Optimize evaluation performance

0 commit comments

Comments
 (0)