
Commit 4c8a2e6

add EvalGame
1 parent faec697 commit 4c8a2e6

4 files changed: +262 -12 lines

pydantic_evals/tournament.py

Lines changed: 90 additions & 0 deletions

@@ -1,10 +1,100 @@
 from __future__ import annotations as _annotations
 
+import textwrap
+from enum import Enum
+
 from pydantic import BaseModel, Field
 
+from pydantic_ai import Agent
+from pydantic_ai.settings import ModelSettings
+
+EVALUATION_INSTRUCTIONS = """
+You are presented with a question and two possible answers A and B. Evaluate carefully whether answer A or answer B is the better reply.
+You have only these two options. Your evaluations contribute to Bradley-Terry scores across multiple items. Consistency and
+objectivity are critical for reliable rankings. Each comparison should be independent but internally consistent.
+
+<EXAMPLES>
+Example 1:
+<QUESTION> Which of the two ice cream flavours below is more creative? </QUESTION>
+<A> Vanilla </A>
+<B> Pickled Citrus Ribbon </B>
+Expected output:
+{
+  "response": "B"
+}
+
+Example 2:
+<QUESTION> Which search query shows more genuine curiosity? </QUESTION>
+<A> effect of ocean acidification feedback loops on Arctic methane release </A>
+<B> climate change effects </B>
+Expected output:
+{
+  "response": "A"
+}
+
+Example 3:
+<QUESTION> Which reply is more insulting? </QUESTION>
+<A> Your argument lacks logical coherence and fails to address the core issue at hand. </A>
+<B> That's an interesting perspective, though I see it differently. </B>
+Expected output:
+{
+  "response": "A"
+}
+</EXAMPLES>
+
+<REQUIREMENTS>
+1. Consider the question carefully. What aspects are important for the answer?
+2. Think about answer A. Is it a good answer to the question? Why (not)?
+3. Think about answer B. Is it a good answer to the question? Why (not)?
+4. Make a decision based on your analysis.
+</REQUIREMENTS>
+
+<OUTPUT_FORMAT>
+You must respond with valid JSON containing exactly one field called "response" with value "A" or "B":
+
+{
+  "response": "A"
+}
+or
+{
+  "response": "B"
+}
+
+Do NOT include explanations, reasoning, or any other fields.
+</OUTPUT_FORMAT>
+"""
 
 class EvalPlayer(BaseModel):
     """Player in a Bradley-Terry tournament."""
     idx: int = Field(..., description='unique identifier for the player')
     item: str = Field(..., description='item to be scored')
     score: float | None = Field(default=None, description='Bradley-Terry strength score for the item')
+
+
+class GameResult(str, Enum):
+    """Possible results of an evaluation game."""
+    A = 'A'
+    B = 'B'
+
+
+class EvalGame(BaseModel):
+    """Represents a game between two players in the evaluation tournament."""
+    criterion: str = Field(..., description='evaluation criterion on which players should be judged')
+
+    async def run(
+        self,
+        players: tuple[EvalPlayer, EvalPlayer],
+        agent: Agent[None, GameResult],
+        model_settings: ModelSettings,
+    ) -> tuple[int, int]:
+        prompt = textwrap.dedent(f"""
+            <QUESTION> {self.criterion} </QUESTION>
+            <A> {players[0].item} </A>
+            <B> {players[1].item} </B>
+            """)
+
+        async with agent:
+            result = await agent.run(
+                user_prompt=prompt,
+                model_settings=model_settings,
+            )
+
+        if result.output == GameResult.A:
+            return (players[0].idx, players[1].idx)
+        else:
+            return (players[1].idx, players[0].idx)
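EvalGame.run returns an ordered (winner_idx, loser_idx) tuple, so a caller only has to schedule pairings and collect the results. The commit does not include a tournament driver yet; the sketch below is a hypothetical round_robin helper (name and signature assumed, not part of this commit), importing from the same module path the tests use:

    from itertools import combinations

    from pydantic_ai import Agent
    from pydantic_ai.settings import ModelSettings
    from pydantic_evals.tournament import EvalGame, EvalPlayer, GameResult


    async def round_robin(
        players: list[EvalPlayer],
        game: EvalGame,
        agent: Agent[None, GameResult],
        model_settings: ModelSettings,
    ) -> list[tuple[int, int]]:
        """Play one game per unordered pair; collect (winner_idx, loser_idx) tuples."""
        results: list[tuple[int, int]] = []
        for a, b in combinations(players, 2):
            # Each pair is played once in a single A/B order; playing the swapped
            # order as well would average out any position bias in the judge.
            results.append(await game.run(players=(a, b), agent=agent, model_settings=model_settings))
        return results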

tests/conftest.py

Lines changed: 1 addition & 1 deletion

@@ -296,7 +296,7 @@ def mock_vcr_aiohttp_content(mocker: MockerFixture):
 @pytest.fixture(scope='module')
 def vcr_config():
     return {
-        'ignore_localhost': True,
+        'ignore_localhost': False,
         # Note: additional header filtering is done inside the serializer
         'filter_headers': ['authorization', 'x-api-key'],
         'decode_compressed_response': True,

With ignore_localhost set to True, VCR skips localhost traffic entirely; it is flipped to False here so the new tournament test's requests to the local Ollama server at localhost:11434 are recorded to, and replayed from, the cassette below.
Lines changed: 99 additions & 0 deletions

interactions:
- request:
    headers:
      accept:
      - application/json
      accept-encoding:
      - gzip, deflate
      connection:
      - keep-alive
      content-length:
      - '2432'
      content-type:
      - application/json
      host:
      - localhost:11434
    method: POST
    parsed_body:
      messages:
      - content: "\nYou are presented with a question and two possible answers A and B. Evaluate carefully whether answer
          A or answer B is the better reply.\nYou have only these two options. Your evaluations contribute to Bradley-Terry
          scores across multiple items. Consistency and\nobjectivity are critical for reliable rankings. Each comparison
          should be independent but internally consistent.\n\n<EXAMPLES>\nExample 1:\n<QUESTION> Which of the two ice cream
          flavours below is more creative? </QUESTION>\n<A> Vanilla </A>\n<B> Pickled Citrus Ribbon </B>\nExpected
          output:\n{\n  \"response\": \"B\"\n}\n\nExample 2:\n<QUESTION> Which search query shows more genuine curiosity?
          </QUESTION>\n<A> effect of ocean acidification feedback loops on Arctic methane release </A>\n<B> climate change
          effects </B>\nExpected output:\n{\n  \"response\": \"A\"\n}\n\nExample 3:\n<QUESTION> Which reply is more insulting?
          </QUESTION>\n<A> Your argument lacks logical coherence and fails to address the core issue at hand. </A>\n<B>
          That's an interesting perspective, though I see it differently. </B>\nExpected output:\n{\n  \"response\":
          \"A\"\n}\n</EXAMPLES>\n\n<REQUIREMENTS>\n1. Consider the question carefully. What aspects are important for the
          answer?\n2. Think about answer A. Is it a good answer to the question? Why (not)?\n3. Think about answer B. Is
          it a good answer to the question? Why (not)?\n4. Make a decision based on your analysis.\n</REQUIREMENTS>\n\n<OUTPUT_FORMAT>\nYou
          must respond with valid JSON containing exactly one field called \"response\" with value \"A\" or \"B\":\n\n{\n
          \ \"response\": \"A\"\n}\nor\n{\n  \"response\": \"B\"\n}\n\nDo NOT include explanations, reasoning, or any other
          fields.\n</OUTPUT_FORMAT>\n"
        role: system
      - content: |2

          <QUESTION> Which of the two ice cream flavours A or B is more creative? </QUESTION>
          <A> vanilla </A>
          <B> toasted rice & miso caramel ice cream </B>
        role: user
      model: qwen2.5:72b
      stream: false
      temperature: 0.0
      tool_choice: required
      tools:
      - function:
          description: The final response which ends this conversation
          name: final_result
          parameters:
            $defs:
              GameResult:
                description: Possible results of an evaluation game.
                enum:
                - A
                - B
                type: string
            additionalProperties: false
            properties:
              response:
                $ref: '#/$defs/GameResult'
            required:
            - response
            type: object
          strict: true
        type: function
    uri: http://localhost:11434/v1/chat/completions
  response:
    headers:
      content-length:
      - '430'
      content-type:
      - application/json
    parsed_body:
      choices:
      - finish_reason: tool_calls
        index: 0
        message:
          content: ''
          role: assistant
          tool_calls:
          - function:
              arguments: '{"response":"B"}'
              name: final_result
            id: call_x9801jnh
            index: 0
            type: function
      created: 1761389257
      id: chatcmpl-608
      model: qwen2.5:72b
      object: chat.completion
      system_fingerprint: fp_ollama
      usage:
        completion_tokens: 20
        prompt_tokens: 584
        total_tokens: 604
    status:
      code: 200
      message: OK
version: 1
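The cassette captures the structured-output round trip: pydantic-ai exposes the GameResult output type to the model as a final_result tool (its JSON schema is the parameters block above), and the model answers with the tool-call arguments {"response":"B"}, which validate back into the enum. A minimal stand-alone sketch of that validation step, where FinalResult is a hypothetical stand-in for the wrapper model pydantic-ai generates internally:

    from enum import Enum

    from pydantic import BaseModel, ConfigDict


    class GameResult(str, Enum):
        """Possible results of an evaluation game."""
        A = 'A'
        B = 'B'


    class FinalResult(BaseModel):
        """Hypothetical mirror of the generated `final_result` tool schema."""
        model_config = ConfigDict(extra='forbid')  # additionalProperties: false
        response: GameResult


    # The recorded tool-call arguments validate straight into the enum:
    assert FinalResult.model_validate_json('{"response":"B"}').response is GameResult.B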

tests/evals/test_tournament.py

Lines changed: 72 additions & 11 deletions

@@ -1,19 +1,80 @@
 from __future__ import annotations as _annotations
 
+import pytest
+
 from ..conftest import try_import
 
 with try_import() as imports_successful:
-    from pydantic_evals.tournament import EvalPlayer
+    from pydantic_ai import Agent
+    from pydantic_ai.models.openai import OpenAIChatModel
+    from pydantic_ai.providers.openai import OpenAIProvider
+    from pydantic_ai.settings import ModelSettings
+    from pydantic_evals.tournament import EVALUATION_INSTRUCTIONS, EvalGame, EvalPlayer, GameResult
+
+pytestmark = [
+    pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'),
+    pytest.mark.anyio,
+]
+
+MODEL_SETTINGS = ModelSettings(
+    temperature=0.0,  # Model needs to be deterministic for VCR recording to work.
+    timeout=300,
+)
+
+
+@pytest.fixture
+def evaluation_agent() -> Agent[None, GameResult]:
+    """Create a test evaluation agent for tournament games."""
+    return Agent(
+        model=OpenAIChatModel(
+            model_name='qwen2.5:72b',
+            provider=OpenAIProvider(base_url='http://localhost:11434/v1'),
+        ),
+        output_type=GameResult,
+        system_prompt=EVALUATION_INSTRUCTIONS,
+        retries=5,
+        instrument=True,
+    )
+
+
+@pytest.fixture
+def ice_cream_players() -> list[EvalPlayer]:
+    """Provide a list of EvalPlayer instances with ice cream flavours."""
+    return [
+        EvalPlayer(idx=0, item='vanilla'),
+        EvalPlayer(idx=1, item='chocolate'),
+        EvalPlayer(idx=2, item='strawberry'),
+        EvalPlayer(idx=3, item='peach'),
+        EvalPlayer(idx=4, item='toasted rice & miso caramel ice cream'),
+    ]
+
+
+def test_evalplayer() -> None:
+    """Test the EvalPlayer class."""
+    player = EvalPlayer(
+        idx=42,
+        item='toasted rice & miso caramel ice cream',
+    )
+    assert player.idx == 42
+    assert player.item == 'toasted rice & miso caramel ice cream'
+
+
+@pytest.mark.vcr
+async def test_evalgame(
+    ice_cream_players: list[EvalPlayer],
+    evaluation_agent: Agent[None, GameResult],
+    allow_model_requests: None,
+) -> None:
+    """Test the EvalGame class."""
 
+    game = EvalGame(criterion='Which of the two ice cream flavours A or B is more creative?')
+    assert game.criterion == 'Which of the two ice cream flavours A or B is more creative?'
 
-def test_evalplayer() -> None:
-    """
-    Test the EvalPlayer class.
-    """
-    player = EvalPlayer(
-        idx=42,
-        item='toasted rice & miso caramel ice cream',
-    )
-    assert player.idx == 42
-    assert player.item == 'toasted rice & miso caramel ice cream'
+    result = await game.run(
+        players=(ice_cream_players[0], ice_cream_players[4]),
+        agent=evaluation_agent,
+        model_settings=MODEL_SETTINGS,
+    )
 
+    assert isinstance(result, tuple)
+    assert len(result) == 2
+    assert all(isinstance(r, int) for r in result)
+    assert result[0] in {0, 4} and result[1] in {0, 4}
+    assert result[0] != result[1]
+    assert result[0] == 4  # Toasted rice & miso caramel ice cream flavour is more creative.
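The ordered tuples that game.run returns are the raw material for the Bradley-Terry strengths that the prompt alludes to and that EvalPlayer.score is reserved for. The fitting step is not part of this commit; below is a minimal sketch of the classic minorization-maximization update for Bradley-Terry scores, assuming every player wins at least one game (a player with no wins collapses to a strength of zero):

    from collections import Counter


    def bradley_terry_scores(results: list[tuple[int, int]], iters: int = 100) -> dict[int, float]:
        """Fit Bradley-Terry strengths from (winner_idx, loser_idx) pairs via MM updates."""
        ids = sorted({idx for pair in results for idx in pair})
        wins = Counter(winner for winner, _ in results)       # W_i: total wins per player
        games = Counter(frozenset(pair) for pair in results)  # n_ij: games per unordered pair
        p = dict.fromkeys(ids, 1.0)                           # start from uniform strengths
        for _ in range(iters):
            # MM update: p_i <- W_i / sum_j(n_ij / (p_i + p_j)), then renormalize.
            p = {
                i: wins[i] / sum(n / (p[i] + p[j]) for pair, n in games.items() if i in pair for j in pair - {i})
                for i in ids
            }
            total = sum(p.values())
            p = {i: v / total for i, v in p.items()}
        return p


    # e.g. after a round robin: scores = bradley_terry_scores(results)
    # for player in players: player.score = scores.get(player.idx)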
