
Commit 4c8a2e6

add EvalGame
1 parent faec697 commit 4c8a2e6

4 files changed: +262 -12 lines

pydantic_evals/tournament.py

Lines changed: 90 additions & 0 deletions

@@ -1,10 +1,100 @@
 from __future__ import annotations as _annotations
 
+import textwrap
+from enum import Enum
+
 from pydantic import BaseModel, Field
 
+from pydantic_ai import Agent
+from pydantic_ai.settings import ModelSettings
+
+EVALUATION_INSTRUCTIONS = """
+You are presented with a question and two possible answers A and B. Evaluate carefully whether answer A or answer B is the better reply.
+You have only these two options. Your evaluations contribute to Bradley-Terry scores across multiple items. Consistency and
+objectivity are critical for reliable rankings. Each comparison should be independent but internally consistent.
+
+<EXAMPLES>
+Example 1:
+<QUESTION> Which of the two ice cream flavours below is more creative? </QUESTION>
+<A> Vanilla </A>
+<B> Pickled Citrus Ribbon </B>
+Expected output:
+{
+  "response": "B"
+}
+
+Example 2:
+<QUESTION> Which search query shows more genuine curiosity? </QUESTION>
+<A> effect of ocean acidification feedback loops on Arctic methane release </A>
+<B> climate change effects </B>
+Expected output:
+{
+  "response": "A"
+}
+
+Example 3:
+<QUESTION> Which reply is more insulting? </QUESTION>
+<A> Your argument lacks logical coherence and fails to address the core issue at hand. </A>
+<B> That's an interesting perspective, though I see it differently. </B>
+Expected output:
+{
+  "response": "A"
+}
+</EXAMPLES>
+
+<REQUIREMENTS>
+1. Consider the question carefully. What aspects are important for the answer?
+2. Think about answer A. Is it a good answer to the question? Why (not)?
+3. Think about answer B. Is it a good answer to the question? Why (not)?
+4. Make a decision based on your analysis.
+</REQUIREMENTS>
+
+<OUTPUT_FORMAT>
+You must respond with valid JSON containing exactly one field called "response" with value "A" or "B":
+
+{
+  "response": "A"
+}
+or
+{
+  "response": "B"
+}
+
+Do NOT include explanations, reasoning, or any other fields.
+</OUTPUT_FORMAT>
+"""
 
 class EvalPlayer(BaseModel):
     """Player in a Bradley-Terry tournament."""
     idx: int = Field(..., description='unique identifier for the player')
     item: str = Field(..., description='item to be scored')
     score: float | None = Field(default=None, description='Bradley-Terry strength score for the item')
+
+
+class GameResult(str, Enum):
+    """Possible results of an evaluation game."""
+    A = 'A'
+    B = 'B'
+
+
+class EvalGame(BaseModel):
+    """Represents a game between two players in the evaluation tournament."""
+    criterion: str = Field(..., description='evaluation criterion on which players should be judged')
+
+    async def run(
+        self,
+        players: tuple[EvalPlayer, EvalPlayer],
+        agent: Agent[None, GameResult],
+        model_settings: ModelSettings,
+    ) -> tuple[int, int]:
+        prompt = textwrap.dedent(f"""
+            <QUESTION> {self.criterion} </QUESTION>
+            <A> {players[0].item} </A>
+            <B> {players[1].item} </B>
+            """)
+
+        async with agent:
+            result = await agent.run(
+                user_prompt=prompt,
+                model_settings=model_settings,
+            )
+
+        if result.output == GameResult.A:
+            return (players[0].idx, players[1].idx)
+        else:
+            return (players[1].idx, players[0].idx)
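EvalGame.run returns an ordered (winner_idx, loser_idx) tuple, so a caller only has to schedule pairings and collect the results. The commit does not include a tournament driver yet; the sketch below is a hypothetical round_robin helper (name and signature assumed, not part of this commit), importing from the same module path the tests use:

    from itertools import combinations

    from pydantic_ai import Agent
    from pydantic_ai.settings import ModelSettings
    from pydantic_evals.tournament import EvalGame, EvalPlayer, GameResult


    async def round_robin(
        players: list[EvalPlayer],
        game: EvalGame,
        agent: Agent[None, GameResult],
        model_settings: ModelSettings,
    ) -> list[tuple[int, int]]:
        """Play one game per unordered pair; collect (winner_idx, loser_idx) tuples."""
        results: list[tuple[int, int]] = []
        for a, b in combinations(players, 2):
            # Each pair is played once in a single A/B order; playing the swapped
            # order as well would average out any position bias in the judge.
            results.append(await game.run(players=(a, b), agent=agent, model_settings=model_settings))
        return results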

tests/conftest.py

Lines changed: 1 addition & 1 deletion

@@ -296,7 +296,7 @@ def mock_vcr_aiohttp_content(mocker: MockerFixture):
 @pytest.fixture(scope='module')
 def vcr_config():
     return {
-        'ignore_localhost': True,
+        'ignore_localhost': False,
         # Note: additional header filtering is done inside the serializer
         'filter_headers': ['authorization', 'x-api-key'],
         'decode_compressed_response': True,

With ignore_localhost set to True, VCR skips localhost traffic entirely; it is flipped to False here so the new tournament test's requests to the local Ollama server at localhost:11434 are recorded to, and replayed from, the cassette below.
Lines changed: 99 additions & 0 deletions

interactions:
- request:
    headers:
      accept:
      - application/json
      accept-encoding:
      - gzip, deflate
      connection:
      - keep-alive
      content-length:
      - '2432'
      content-type:
      - application/json
      host:
      - localhost:11434
    method: POST
    parsed_body:
      messages:
      - content: "\nYou are presented with a question and two possible answers A and B. Evaluate carefully whether answer
          A or answer B is the better reply.\nYou have only these two options. Your evaluations contribute to Bradley-Terry
          scores across multiple items. Consistency and\nobjectivity are critical for reliable rankings. Each comparison
          should be independent but internally consistent.\n\n<EXAMPLES>\nExample 1:\n<QUESTION> Which of the two ice cream
          flavours below is more creative? </QUESTION>\n<A> Vanilla </A>\n<B> Pickled Citrus Ribbon </B>\nExpected
          output:\n{\n  \"response\": \"B\"\n}\n\nExample 2:\n<QUESTION> Which search query shows more genuine curiosity?
          </QUESTION>\n<A> effect of ocean acidification feedback loops on Arctic methane release </A>\n<B> climate change
          effects </B>\nExpected output:\n{\n  \"response\": \"A\"\n}\n\nExample 3:\n<QUESTION> Which reply is more insulting?
          </QUESTION>\n<A> Your argument lacks logical coherence and fails to address the core issue at hand. </A>\n<B>
          That's an interesting perspective, though I see it differently. </B>\nExpected output:\n{\n  \"response\":
          \"A\"\n}\n</EXAMPLES>\n\n<REQUIREMENTS>\n1. Consider the question carefully. What aspects are important for the
          answer?\n2. Think about answer A. Is it a good answer to the question? Why (not)?\n3. Think about answer B. Is
          it a good answer to the question? Why (not)?\n4. Make a decision based on your analysis.\n</REQUIREMENTS>\n\n<OUTPUT_FORMAT>\nYou
          must respond with valid JSON containing exactly one field called \"response\" with value \"A\" or \"B\":\n\n{\n
          \ \"response\": \"A\"\n}\nor\n{\n  \"response\": \"B\"\n}\n\nDo NOT include explanations, reasoning, or any other
          fields.\n</OUTPUT_FORMAT>\n"
        role: system
      - content: |2

          <QUESTION> Which of the two ice cream flavours A or B is more creative? </QUESTION>
          <A> vanilla </A>
          <B> toasted rice & miso caramel ice cream </B>
        role: user
      model: qwen2.5:72b
      stream: false
      temperature: 0.0
      tool_choice: required
      tools:
      - function:
          description: The final response which ends this conversation
          name: final_result
          parameters:
            $defs:
              GameResult:
                description: Possible results of an evaluation game.
                enum:
                - A
                - B
                type: string
            additionalProperties: false
            properties:
              response:
                $ref: '#/$defs/GameResult'
            required:
            - response
            type: object
          strict: true
        type: function
    uri: http://localhost:11434/v1/chat/completions
  response:
    headers:
      content-length:
      - '430'
      content-type:
      - application/json
    parsed_body:
      choices:
      - finish_reason: tool_calls
        index: 0
        message:
          content: ''
          role: assistant
          tool_calls:
          - function:
              arguments: '{"response":"B"}'
              name: final_result
            id: call_x9801jnh
            index: 0
            type: function
      created: 1761389257
      id: chatcmpl-608
      model: qwen2.5:72b
      object: chat.completion
      system_fingerprint: fp_ollama
      usage:
        completion_tokens: 20
        prompt_tokens: 584
        total_tokens: 604
    status:
      code: 200
      message: OK
version: 1
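The cassette captures the structured-output round trip: pydantic-ai exposes the GameResult output type to the model as a final_result tool (its JSON schema is the parameters block above), and the model answers with the tool-call arguments {"response":"B"}, which validate back into the enum. A minimal stand-alone sketch of that validation step, where FinalResult is a hypothetical stand-in for the wrapper model pydantic-ai generates internally:

    from enum import Enum

    from pydantic import BaseModel, ConfigDict


    class GameResult(str, Enum):
        """Possible results of an evaluation game."""
        A = 'A'
        B = 'B'


    class FinalResult(BaseModel):
        """Hypothetical mirror of the generated `final_result` tool schema."""
        model_config = ConfigDict(extra='forbid')  # additionalProperties: false
        response: GameResult


    # The recorded tool-call arguments validate straight into the enum:
    assert FinalResult.model_validate_json('{"response":"B"}').response is GameResult.B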

tests/evals/test_tournament.py

Lines changed: 72 additions & 11 deletions

@@ -1,19 +1,80 @@
 from __future__ import annotations as _annotations
 
+import pytest
+
 from ..conftest import try_import
 
 with try_import() as imports_successful:
-    from pydantic_evals.tournament import EvalPlayer
+    from pydantic_ai import Agent
+    from pydantic_ai.models.openai import OpenAIChatModel
+    from pydantic_ai.providers.openai import OpenAIProvider
+    from pydantic_ai.settings import ModelSettings
+    from pydantic_evals.tournament import EVALUATION_INSTRUCTIONS, EvalGame, EvalPlayer, GameResult
+
+pytestmark = [
+    pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'),
+    pytest.mark.anyio,
+]
+
+MODEL_SETTINGS = ModelSettings(
+    temperature=0.0,  # Model needs to be deterministic for VCR recording to work.
+    timeout=300,
+)
+
+
+@pytest.fixture
+def evaluation_agent() -> Agent[None, GameResult]:
+    """Create a test evaluation agent for tournament games."""
+    return Agent(
+        model=OpenAIChatModel(
+            model_name='qwen2.5:72b',
+            provider=OpenAIProvider(base_url='http://localhost:11434/v1'),
+        ),
+        output_type=GameResult,
+        system_prompt=EVALUATION_INSTRUCTIONS,
+        retries=5,
+        instrument=True,
+    )
+
+
+@pytest.fixture
+def ice_cream_players() -> list[EvalPlayer]:
+    """Provide a list of EvalPlayer instances with ice cream flavours."""
+    return [
+        EvalPlayer(idx=0, item='vanilla'),
+        EvalPlayer(idx=1, item='chocolate'),
+        EvalPlayer(idx=2, item='strawberry'),
+        EvalPlayer(idx=3, item='peach'),
+        EvalPlayer(idx=4, item='toasted rice & miso caramel ice cream'),
+    ]
+
+
+def test_evalplayer() -> None:
+    """Test the EvalPlayer class."""
+    player = EvalPlayer(
+        idx=42,
+        item='toasted rice & miso caramel ice cream',
+    )
+    assert player.idx == 42
+    assert player.item == 'toasted rice & miso caramel ice cream'
+
+
+@pytest.mark.vcr
+async def test_evalgame(
+    ice_cream_players: list[EvalPlayer],
+    evaluation_agent: Agent[None, GameResult],
+    allow_model_requests: None,
+) -> None:
+    """Test the EvalGame class."""
 
+    game = EvalGame(criterion='Which of the two ice cream flavours A or B is more creative?')
+    assert game.criterion == 'Which of the two ice cream flavours A or B is more creative?'
 
-def test_evalplayer() -> None:
-    """
-    Test the EvalPlayer class.
-    """
-    player = EvalPlayer(
-        idx=42,
-        item='toasted rice & miso caramel ice cream',
-    )
-    assert player.idx == 42
-    assert player.item == 'toasted rice & miso caramel ice cream'
+    result = await game.run(
+        players=(ice_cream_players[0], ice_cream_players[4]),
+        agent=evaluation_agent,
+        model_settings=MODEL_SETTINGS,
+    )
 
+    assert isinstance(result, tuple)
+    assert len(result) == 2
+    assert all(isinstance(r, int) for r in result)
+    assert result[0] in {0, 4} and result[1] in {0, 4}
+    assert result[0] != result[1]
+    assert result[0] == 4  # Toasted rice & miso caramel ice cream flavour is more creative.
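The ordered tuples that game.run returns are the raw material for the Bradley-Terry strengths that the prompt alludes to and that EvalPlayer.score is reserved for. The fitting step is not part of this commit; below is a minimal sketch of the classic minorization-maximization update for Bradley-Terry scores, assuming every player wins at least one game (a player with no wins collapses to a strength of zero):

    from collections import Counter


    def bradley_terry_scores(results: list[tuple[int, int]], iters: int = 100) -> dict[int, float]:
        """Fit Bradley-Terry strengths from (winner_idx, loser_idx) pairs via MM updates."""
        ids = sorted({idx for pair in results for idx in pair})
        wins = Counter(winner for winner, _ in results)       # W_i: total wins per player
        games = Counter(frozenset(pair) for pair in results)  # n_ij: games per unordered pair
        p = dict.fromkeys(ids, 1.0)                           # start from uniform strengths
        for _ in range(iters):
            # MM update: p_i <- W_i / sum_j(n_ij / (p_i + p_j)), then renormalize.
            p = {
                i: wins[i] / sum(n / (p[i] + p[j]) for pair, n in games.items() if i in pair for j in pair - {i})
                for i in ids
            }
            total = sum(p.values())
            p = {i: v / total for i, v in p.items()}
        return p


    # e.g. after a round robin: scores = bradley_terry_scores(results)
    # for player in players: player.score = scores.get(player.idx)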
