Commit 02fad97

add strategies
1 parent 4c8a2e6 commit 02fad97

File tree

5 files changed: +4295 -3 lines changed

pydantic_evals/pydantic_evals/tournament.py

Lines changed: 220 additions & 0 deletions
@@ -1,8 +1,12 @@
 from __future__ import annotations as _annotations

+import math
+import random
 import textwrap
 from enum import Enum

+import choix
+import numpy as np
 from pydantic import BaseModel, Field

 from pydantic_ai import Agent
@@ -98,3 +102,219 @@ async def run(self, players: tuple[EvalPlayer, EvalPlayer], agent: Agent[None, G
            return (players[0].idx, players[1].idx)
        else:
            return (players[1].idx, players[0].idx)


async def random_sampling_strategy(
    players: list[EvalPlayer],
    game: EvalGame,
    agent: Agent,
    model_settings: ModelSettings,
    fraction_of_games: float | None = None,
) -> list[EvalPlayer]:
    """Random sampling tournament strategy.

    In a tournament with n players, there are n*(n-1) possible pairwise games. We treat
    (i, j) and (j, i) as different games to ensure that the evaluation agent does not
    introduce any ordering bias. The strategy plays all possible games in random order.
    It is simple but not efficient; when all games are played, however, it returns the
    best possible scores.

    Args:
        players: List of players in the tournament.
        game: Game defining the pairwise comparisons.
        agent: Agent for the game.
        model_settings: Model settings for the game.
        fraction_of_games: Fraction of all possible games to play, between 0 and 1. If None, all games are played.

    Returns:
        List of players with Bradley-Terry scores.
    """
    scoreboard: list[tuple[int, int]] = []

    # Generate all possible games in random order.
    n = len(players)
    matches = [(i, j) for i in range(n) for j in range(n) if i != j]
    random.shuffle(matches)
    if fraction_of_games is not None and 0 < fraction_of_games <= 1:
        number_of_games = int(len(matches) * fraction_of_games)
        matches = matches[:number_of_games]

    # Play all games.
    for match in matches:
        player_1, player_2 = players[match[0]], players[match[1]]

        result = await game.run(
            players=(player_1, player_2),
            agent=agent,
            model_settings=model_settings,
        )
        scoreboard.append(result)

    # Calculate Bradley-Terry scores and update players.
    scores = choix.ilsr_pairwise(len(players), scoreboard, alpha=0.01)
    for i, player in enumerate(players):
        player.score = float(scores[i])

    return players
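
For intuition on the scoring step shared by these strategies: a minimal sketch of feeding a scoreboard to choix, assuming a hypothetical 3-player tournament (the results below are illustrative, not data from this commit):

    import choix

    # Each scoreboard entry is (winner_idx, loser_idx), matching what EvalGame.run returns.
    scoreboard = [(0, 1), (0, 2), (1, 2), (0, 1)]

    # alpha adds light regularization so the scores stay finite even for
    # players who win or lose every game they play.
    scores = choix.ilsr_pairwise(3, scoreboard, alpha=0.01)
    print(scores)  # length-3 array of Bradley-Terry strengths; player 0 ranks highest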


async def round_robin_strategy(
    players: list[EvalPlayer],
    game: EvalGame,
    agent: Agent,
    model_settings: ModelSettings,
    number_of_rounds: int = 2,
) -> list[EvalPlayer]:
    """Round-robin tournament strategy.

    Each player plays against a randomly selected opponent for a given number of rounds.
    The scores are calculated from the game outcomes using the Bradley-Terry algorithm.
    The strategy ensures that each player plays at least number_of_rounds games. It is
    simple but not efficient.

    Args:
        players: List of players in the tournament.
        game: Game defining the pairwise comparisons.
        agent: Agent for the game.
        model_settings: Model settings for the game.
        number_of_rounds: Number of rounds.

    Returns:
        List of players with Bradley-Terry scores.
    """
    scoreboard: list[tuple[int, int]] = []

    for _ in range(number_of_rounds):
        for player in players:
            # Pick a random opponent (excluding self).
            idx = random.randrange(len(players))
            while idx == player.idx:
                idx = random.randrange(len(players))
            player_2 = players[idx]

            # Play the game.
            result = await game.run(
                players=(player, player_2),
                agent=agent,
                model_settings=model_settings,
            )
            scoreboard.append(result)

    # Calculate Bradley-Terry scores and update players.
    scores = choix.ilsr_pairwise(len(players), scoreboard, alpha=0.01)
    for i, player in enumerate(players):
        player.score = float(scores[i])

    return players
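
To make the "simple but not efficient" caveat concrete, a back-of-the-envelope comparison of game counts (the tournament size is hypothetical; the formulas follow from the loops above):

    n = 10                                    # hypothetical number of players
    number_of_rounds = 2                      # default used above

    games_random_sampling = n * (n - 1)       # every ordered pair: 90 games
    games_round_robin = number_of_rounds * n  # one game initiated per player per round: 20 games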


async def adaptive_uncertainty_strategy(
    players: list[EvalPlayer],
    game: EvalGame,
    agent: Agent,
    model_settings: ModelSettings,
    max_standard_deviation: float = 2.0,
    alpha: float = 0.1,
) -> list[EvalPlayer]:
    """Adaptive uncertainty tournament strategy.

    The strategy consists of two phases:
    (1) Bootstrap phase: The Bradley-Terry model requires the comparison graph to be
        strongly connected, i.e. there must be a path between any two players. We
        therefore start by playing n/2 * log(n) random games (with a floor of 2*n games),
        where n is the number of players. With fewer games, any scores are likely to be
        unreliable.
    (2) Optimization phase: In this phase, we iteratively calculate the Bradley-Terry
        scores and their covariance matrix, and play the game for which the player
        scores are the most uncertain.

    Let s_i and s_j be the Bradley-Terry scores of players i and j respectively. The
    uncertainty in their relative strength is then given by

        Var(s_i - s_j) = Var(s_i) + Var(s_j) - 2*Cov(s_i, s_j)

    We stop when the standard deviation sqrt(Var(s_i - s_j)) of the most uncertain pair
    drops below the threshold max_standard_deviation, or when all possible pairs have
    been played.

    Comment on the max_standard_deviation parameter:
    Typically, a standard deviation below 1.0 is a good stopping condition. However, the
    uncertainty depends greatly on the evaluation problem. For a problem such as "Which
    of the following ice cream flavours is the most creative one? Vanilla or Chocolate
    or Strawberry?", the uncertainty will remain high even after many games.

    Comment on the alpha parameter:
    The alpha parameter is the prior strength for the Bradley-Terry model. A higher
    alpha (e.g. 0.8) is a strong prior towards equal player strengths: the games have a
    smaller influence on the scores, and the scores remain close to the mean of 0. A
    lower alpha (e.g. 0.1), on the other hand, lets the games influence the scores more
    strongly. However, for a sparse comparison graph, the scores can become less stable.
    Typical values are between 0.1 and 0.3.

    Args:
        players: List of players in the tournament.
        game: Game defining the pairwise comparisons.
        agent: Agent for the game.
        model_settings: Model settings for the game.
        max_standard_deviation: Maximum standard deviation for the most uncertain pair. See also above.
        alpha: Prior strength for the Bradley-Terry model. Between 0 and 1. See also above.

    Returns:
        List of players with Bradley-Terry scores.
    """
    scoreboard: list[tuple[int, int]] = []
    n = len(players)

    # (1) Bootstrap phase
    number_of_bootstrap_games = max(2 * n, int(n / 2 * np.log(n)))
    matches = [(i, j) for i in range(n) for j in range(n) if i != j]
    random.shuffle(matches)
    matches = matches[:number_of_bootstrap_games]
    for match in matches:
        player_1, player_2 = players[match[0]], players[match[1]]

        result = await game.run(
            players=(player_1, player_2),
            agent=agent,
            model_settings=model_settings,
        )
        scoreboard.append(result)

    # (2) Optimization phase
    max_number_of_games = n * (n - 1) // 2
    for _ in range(max_number_of_games):
        # Calculate the Bradley-Terry scores and covariance matrix.
        _, cov_matrix = choix.ep_pairwise(n_items=n, data=scoreboard, alpha=alpha, model='logit')

        # Find the most uncertain pair that has not yet been played.
        max_uncertainty = -1.0
        next_pair: tuple[int, int] | None = None
        for i in range(n):
            for j in range(i + 1, n):
                # Check if the pair has already been played. Here we assume that games
                # are symmetric, which is not quite correct but good enough.
                if (players[i].idx, players[j].idx) in scoreboard or (players[j].idx, players[i].idx) in scoreboard:
                    continue

                # Uncertainty of the pair: Var(s_i - s_j).
                uncertainty = cov_matrix[i, i] + cov_matrix[j, j] - 2 * cov_matrix[i, j]
                if uncertainty > max_uncertainty:
                    max_uncertainty = uncertainty
                    next_pair = (i, j)

        # Terminate the optimization phase?
        if next_pair is None:
            break
        if math.sqrt(max_uncertainty) < max_standard_deviation:
            break

        # Play the most uncertain pair.
        player_1, player_2 = players[next_pair[0]], players[next_pair[1]]
        result = await game.run(
            players=(player_1, player_2),
            agent=agent,
            model_settings=model_settings,
        )
        scoreboard.append(result)

    # Final calculation of Bradley-Terry scores; update players.
    scores, _ = choix.ep_pairwise(n_items=n, data=scoreboard, alpha=alpha, model='logit')
    for i, player in enumerate(players):
        player.score = float(scores[i])

    return players
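
The pair selection in the optimization phase maps directly onto choix's EP estimator. A minimal sketch of the uncertainty computation, assuming a hypothetical 4-player scoreboard (indices and alpha are illustrative, not from this commit):

    import itertools
    import math

    import choix

    # Hypothetical (winner_idx, loser_idx) results after a bootstrap phase.
    scoreboard = [(0, 1), (1, 2), (2, 3), (3, 0), (0, 2), (1, 3)]

    # ep_pairwise returns the score estimates and their covariance matrix.
    scores, cov = choix.ep_pairwise(n_items=4, data=scoreboard, alpha=0.1, model='logit')

    # Var(s_i - s_j) = Var(s_i) + Var(s_j) - 2*Cov(s_i, s_j); the pair with the
    # largest standard deviation is the most informative one to play next.
    for i, j in itertools.combinations(range(4), 2):
        sd = math.sqrt(cov[i, i] + cov[j, j] - 2 * cov[i, j])
        print(f'sd(s_{i} - s_{j}) = {sd:.2f}')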
