OpenJury/openjury/evaluate.py at ba6022b0ffda875f2e0090635c2b7e9fd952aa7d · OpenEuroLLM/OpenJury · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
import json
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
from langchain.prompts import ChatPromptTemplate
from langchain_core.language_models.llms import LLM

from openjury.instruction_dataset import load_instructions
from openjury.repro import write_run_metadata, _to_jsonable
from openjury.utils import (
    read_df,
    data_root,
    download_hf,
    do_inference,
)


class PairScore:
    def __init__(self):
        super(PairScore).__init__()
        self.temperature = 0.3

    def preference_from_scores(self, score_a: float, score_b: float) -> float:
        return 1 - np.exp(self.temperature * score_a) / (
            np.exp(self.temperature * np.array([score_a, score_b])).sum()
        )

    def parse_model_raw(self, judge_completion: str) -> float | None:
        # lower case to avoid confusion, e.g. when "a" is used instead of "A"
        score_a = self.get_regexp_match(
            judge_completion.lower(), r'score.*?a[": *\n]*(-?\d+)'
        )
        score_b = self.get_regexp_match(
            judge_completion.lower(), r'score.*?b[": *\n]*(-?\d+)'
        )
        if score_a is None or score_b is None:
            return None
        else:
            return float(self.preference_from_scores(score_a, score_b))

    def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
        m = re.search(re.compile(regex), s)
        if m is None:
            return None
        else:
            return float(m.group(group_index).strip(" "))


def load_judge_system_and_user_prompt(
    provide_explanation: bool = True,
) -> tuple[str, str]:
    # Prepare judge
    with open(Path(__file__).parent / "prompts" / "system-prompt.txt", "r") as f:
        system_prompt = str(f.read())

    prompt_filename = (
        "prompt-with-explanation.txt" if provide_explanation else "prompt.txt"
    )
    with open(Path(__file__).parent / "prompts" / prompt_filename, "r") as f:
        user_prompt_template = str(f.read())

    return system_prompt, user_prompt_template


def resolve_judge_prompts(
    *,
    provide_explanation: bool,
    system_prompt: str | None = None,
    user_prompt_template: str | None = None,
) -> tuple[str, str]:
    default_system_prompt, default_user_prompt_template = (
        load_judge_system_and_user_prompt(provide_explanation=provide_explanation)
    )
    return (
        system_prompt if system_prompt is not None else default_system_prompt,
        (
            user_prompt_template
            if user_prompt_template is not None
            else default_user_prompt_template
        ),
    )


def evaluate_completions(
    dataset: str = "alpaca-eval",
    judge_chat_model: LLM = None,
    method_A: str = "gpt4_1106_preview",
    method_B: str = "llama-2-70b-chat-hf",
    num_annotations: int | None = 50,
    use_tqdm: bool = False,
    truncate_input_chars: int | None = 8192,
    provide_explanation: bool = False,
):
    """
    :param dataset:
    :param judge_chat_model:
    :param method_A: one method to evaluate, can be a method existing in `dataset` or a local path to the completion
    of a local method. The path should be a dataframe ending with ".csv.zip" or ".parquet", have columns
    "instruction_index" and "output" and should contains all the instruction of `dataset`.
    :param method_B: another method to evaluate against `method_A`
    :param num_annotations: if specified will do at most `num_annotations` annotations
    :param use_tqdm:
    :param truncate_input_chars: if specified, truncates the length of completion, useful to save cost and avoid
    exceeding context limit
    :return:
    """
    run_started_at = datetime.now(timezone.utc)
    local_path_tables = data_root / "tables"
    download_hf(name=dataset, local_path=local_path_tables)

    instructions = load_instructions(
        dataset=dataset,
    ).loc[:, "instruction"]

    # A bit ugly, only loads if local path exist as we do not have a local path of completion for cases such as
    # m-arena-hard.
    dataset_output_path = local_path_tables / "model_outputs" / f"{dataset}.csv.zip"
    if dataset_output_path.exists():
        df_outputs = read_df(dataset_output_path)
        # empty strings are encoded as Nan in csv
        df_outputs.loc[:, "output"] = df_outputs.loc[:, "output"].fillna("")
        df_outputs = df_outputs.pivot_table(
            index="instruction_index", columns="model", values="output", aggfunc="last"
        ).sort_index()
        df_outputs = df_outputs.loc[instructions.index]
    else:
        df_outputs = None

    def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
        if Path(method).exists():
            print(f"Path {method} exists, loads local model completions.")
            df = read_df(Path(method)).set_index("instruction_index").sort_index()
            print(f"Loaded {len(df)} completions.")
            df.loc[:, "output"] = df.loc[:, "output"].fillna("")
            return df.loc[:, "output"]
        else:
            print(f"Loading {method} from {dataset} dataset.")
            assert (
                method in df_outputs.columns
            ), f"Method {method} not present, pick among {df_outputs.columns.tolist()}"
            return df_outputs.loc[:, method].sort_index()

    completions_A = get_output(df_outputs=df_outputs, dataset=dataset, method=method_A)
    completions_B = get_output(df_outputs=df_outputs, dataset=dataset, method=method_B)
    if num_annotations is not None:
        instructions = instructions.head(num_annotations)
        completions_A = completions_A.head(num_annotations)
        completions_B = completions_B.head(num_annotations)
    assert (
        completions_A.index.tolist() == completions_B.index.tolist()
    ), f"Index mismatch between methods {method_A} and {method_B}."

    if judge_chat_model is None:
        from langchain_together.llms import Together

        judge_chat_model = Together(model="meta-llama/Llama-3.3-70B-Instruct-Turbo")

    (
        judge_system_prompt,
        judge_user_prompt_template,
    ) = resolve_judge_prompts(provide_explanation=provide_explanation)

    annotations = annotate_battles(
        judge_chat_model=judge_chat_model,
        instructions=instructions.tolist(),
        completions_A=completions_A.loc[instructions.index].tolist(),
        completions_B=completions_B.loc[instructions.index].tolist(),
        use_tqdm=use_tqdm,
        truncate_input_chars=truncate_input_chars,
        provide_explanation=provide_explanation,
    )

    # print("--------\n".join([str(x) for x in annotations]))
    # print results in term of 1) winrate 2) number of win/loss
    prefs = pd.Series([annotation.preference for annotation in annotations])
    num_wins = sum(prefs < 0.5)
    num_losses = sum(prefs > 0.5)
    num_ties = sum([1 if not x or x == 0.5 or x == np.nan else 0 for x in prefs])
    num_battles = len(prefs)
    winrate = float((num_wins + 0.5 * num_ties) / (num_ties + num_wins + num_losses))

    results = {
        "num_battles": num_battles,
        "winrate": winrate,
        "num_wins": num_wins,
        "num_losses": num_losses,
        "num_ties": num_ties,
    }

    print(f"{method_A} against {method_B}:\n{results}")
    print([annotation.preference for annotation in annotations])

    unique_string = dataset + "-" + datetime.now().strftime("%Y%m%d_%H%M%S")
    output_folder = data_root / "judge-evals" / unique_string
    print(f"Saving results in {output_folder}")
    output_folder.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(annotations).to_csv(output_folder / "annotations.csv", index=False)
    with open(output_folder / "results.json", "w") as f:
        json.dump(_to_jsonable(results), f, allow_nan=False)

    try:
        write_run_metadata(
            output_dir=output_folder,
            entrypoint="openjury.evaluate.evaluate_completions",
            run={
                "dataset": dataset,
                "method_A": method_A,
                "method_B": method_B,
                "num_annotations": num_annotations,
                "n_annotations": len(instructions),
                "use_tqdm": use_tqdm,
                "truncate_input_chars": truncate_input_chars,
                "provide_explanation": provide_explanation,
            },
            results=results,
            input_payloads={
                "instruction_index": instructions.index.tolist(),
                "instructions": instructions.tolist(),
                "completions_A": completions_A.loc[instructions.index].tolist(),
                "completions_B": completions_B.loc[instructions.index].tolist(),
            },
            extras={
                "files": {
                    "annotations": "annotations.csv",
                    "results": "results.json",
                }
            },
            judge_system_prompt=judge_system_prompt,
            judge_user_prompt_template=judge_user_prompt_template,
            started_at_utc=run_started_at,
        )
    except Exception as e:
        print(f"Warning: failed to write run metadata: {e}")


@dataclass
class JudgeAnnotation:
    judge_completion: str
    instruction: str
    completion_A: str
    completion_B: str


def annotate_battles(
    judge_chat_model,
    instructions: list[str],
    completions_A: list[str],
    completions_B: list[str],
    system_prompt: str | None = None,
    user_prompt_template: str = None,
    truncate_input_chars: int | None = 8192,
    use_tqdm: bool = False,
    provide_explanation: bool = False,
) -> list[JudgeAnnotation]:
    """
    Directly evaluate from list of instructions and completions
    Can also pass custom LLM judge prompts, if not passed uses defaults
    `system_prompt, user_prompt_template = load_judge_system_and_user_prompt()`
    Example usage:
    ```python
    annotations = annotate_battles(
        # can be any langchain ChatModel, supports OpenAI, Together, vLLM, ...
        judge_chat_model=Together(model="meta-llama/Llama-3.3-70B-Instruct-Turbo"),
        # the instructions we want to evaluate
        user_prompts=["Write numbers between 1 and 5."],
        # the completions we want to evaluate for the first model
        completions_A=["1 2 3 4 5."],
        # the completions we want to evaluate for the second model
        completions_B=["No"],
    )
    ```
    :param provide_explanation:
    :param judge_chat_model:
    :param instructions:
    :param completions_A:
    :param completions_B:
    :param system_prompt:
    :param user_prompt_template:
    :param truncate_input_chars: Max characters to truncate completions before sending to judge.
    :param use_tqdm:
    :return:
    """
    # alternatively pass list of tuples
    assert len(instructions) == len(completions_A) == len(completions_B)

    system_prompt, user_prompt_template = resolve_judge_prompts(
        provide_explanation=provide_explanation,
        system_prompt=system_prompt,
        user_prompt_template=user_prompt_template,
    )

    prompt_template = ChatPromptTemplate.from_messages(
        [("system", system_prompt), ("user", user_prompt_template)]
    )

    def truncate(s: str, max_len: int | None = None):
        if not isinstance(s, str):
            return ""
        if max_len is not None:
            return s[:max_len]
        else:
            return s

    inputs = prompt_template.batch(
        [
            {
                "user_prompt": user_prompt,
                "completion_A": truncate(completion_A, max_len=truncate_input_chars),
                "completion_B": truncate(completion_B, max_len=truncate_input_chars),
            }
            for user_prompt, completion_A, completion_B in zip(
                instructions, completions_A, completions_B
            )
        ]
    )
    print(f"Start LLM judge annotation ({len(inputs)} annotations).")
    judge_completions = do_inference(
        chat_model=judge_chat_model,
        inputs=inputs,
        use_tqdm=use_tqdm,
    )

    annotations = []
    for judge_completion, instruction, completion_A, completion_B in zip(
        judge_completions, instructions, completions_A, completions_B
    ):
        annotations.append(
            JudgeAnnotation(
                judge_completion=judge_completion,
                instruction=instruction,
                completion_A=completion_A,
                completion_B=completion_B,
            )
        )
    return annotations