
Commit 87871b3

adtyavrdhn and DouweM authored
Fix LLMJudge input handling to preserve BinaryContent as separate message part instead of stringifying (#2173)
Co-authored-by: Douwe Maan <[email protected]>
1 parent 091c499 commit 87871b3

File tree

2 files changed: +234, -59 lines
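In practical terms, multimodal values such as `BinaryContent` passed as evaluator inputs now reach the judge model as real message parts instead of being collapsed into repr() text. A minimal sketch of the new behavior (not part of this commit; the placeholder image bytes and the `GradingOutput` field accessed at the end are assumptions):

from pydantic_ai.messages import BinaryContent
from pydantic_evals.evaluators.llm_as_a_judge import judge_input_output


async def grade_image_answer():
    # Placeholder bytes; a real call would pass actual PNG data.
    image = BinaryContent(data=b'...png bytes...', media_type='image/png')
    # Before this fix, `image` was stringified into the prompt text; now it
    # is forwarded to the judge model as its own user-content part.
    grading = await judge_input_output(
        inputs=['What is in this picture?', image],
        output='A cat sitting on a mat.',
        rubric='The response accurately describes the image.',
    )
    print(grading.reason)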

pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py

Lines changed: 45 additions & 52 deletions
@@ -1,12 +1,14 @@
 from __future__ import annotations
 
+from collections.abc import Sequence
 from textwrap import dedent
 from typing import Any
 
 from pydantic import BaseModel, Field
 from pydantic_core import to_json
 
 from pydantic_ai import Agent, models
+from pydantic_ai.messages import MultiModalContentTypes, UserContent
 from pydantic_ai.settings import ModelSettings
 
 __all__ = (
@@ -62,16 +64,7 @@ async def judge_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric)
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
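All four judge functions now delegate prompt construction to the new private `_build_prompt` helper added at the bottom of this file. For text-only calls it produces the same tagged sections the old inline template did, joined into a single string. An illustrative call (the compact JSON rendering comes from `pydantic_core.to_json` via `_stringify`; the exact whitespace is an assumption):

prompt = _build_prompt(output={'answer': 42}, rubric='Answer must be 42.')
assert prompt == '<Output>\n{"answer":42}\n</Output>\n\n<Rubric>\nAnswer must be 42.\n</Rubric>'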
@@ -112,19 +105,8 @@ async def judge_input_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric)
+
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
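The `str | Sequence[str | UserContent]` return type is what makes the fix work end to end: `Agent.run` accepts either a plain string or a sequence of user-content parts, so the judge agents can pass `_build_prompt`'s result through unchanged. A sketch (model name and bytes are placeholders):

from pydantic_ai import Agent
from pydantic_ai.messages import BinaryContent

agent = Agent('openai:gpt-4o')
parts = ['<Input>\n', BinaryContent(data=b'...', media_type='image/png'), '</Input>']
# Each element becomes its own part of the user message:
# result = await agent.run(parts)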
@@ -168,22 +150,7 @@ async def judge_input_output_expected(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric, expected_output=expected_output)
 
     return (
         await _judge_input_output_expected_agent.run(
@@ -227,19 +194,7 @@ async def judge_output_expected(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric, expected_output=expected_output)
     return (
         await _judge_output_expected_agent.run(
             user_prompt, model=model or _default_model, model_settings=model_settings
@@ -265,3 +220,41 @@ def _stringify(value: Any) -> str:
         return to_json(value).decode()
     except Exception:
         return repr(value)
+
+
+def _build_prompt(
+    output: Any,
+    rubric: str,
+    inputs: Any | None = None,
+    expected_output: Any | None = None,
+) -> str | Sequence[str | UserContent]:
+    """Build a prompt that includes input, output, and rubric."""
+    sections: list[str | UserContent] = []
+
+    if inputs is not None:
+        if isinstance(inputs, str):
+            sections.append(f'<Input>\n{inputs}\n</Input>')
+        else:
+            sections.append('<Input>\n')
+            if isinstance(inputs, Sequence):
+                for item in inputs:  # type: ignore
+                    if isinstance(item, (str, MultiModalContentTypes)):
+                        sections.append(item)
+                    else:
+                        sections.append(_stringify(item))
+            elif isinstance(inputs, MultiModalContentTypes):
+                sections.append(inputs)
+            else:
+                sections.append(_stringify(inputs))
+            sections.append('</Input>')
+
+    sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
+    sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+
+    if expected_output is not None:
+        sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+
+    if inputs is None or isinstance(inputs, str):
+        return '\n\n'.join(sections)  # type: ignore[arg-type]
+    else:
+        return sections
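Taken together, `_build_prompt` has two return shapes: string (or absent) inputs keep the old single-string prompt, while any other inputs produce a list in which multimodal items survive untouched. A sketch of both paths (the helper is private, so these calls are illustrative only):

image = BinaryContent(data=b'...', media_type='image/png')

text_prompt = _build_prompt(inputs='2 + 2', output='4', rubric='Must be correct.')
assert isinstance(text_prompt, str)  # old single-string behavior preserved

parts = _build_prompt(inputs=['Describe:', image], output='A cat.', rubric='Accurate?')
assert isinstance(parts, list)
assert image in parts  # BinaryContent kept as a separate part, not repr() text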
