Open
Changes from 31 of 36 commits
- `ba4220d`: Add llamacpp dependency and update gitignore with generated directories (ErlisLushtaku, Feb 14, 2026)
- `d2a5a42`: Add documentation for llamacpp in Readme (ErlisLushtaku, Feb 14, 2026)
- `a828adb`: Document direnv usage for environment variables management (ErlisLushtaku, Feb 15, 2026)
- `0dcebf9`: narrow down transformers dependency to fix version mismatch (ErlisLushtaku, Feb 15, 2026)
- `d60073b`: Add max_model_len param for VLLM in order to prevent OOM errors (ErlisLushtaku, Feb 15, 2026)
- `38f63ee`: Fix completion loading and EuroLLM-9B example (ErlisLushtaku, Feb 15, 2026)
- `6f5e0fc`: Remove `direnv` documentation (ErlisLushtaku, Feb 17, 2026)
- `42ff2ae`: Revert stylistic (formatting) changes and add more documentation for … (ErlisLushtaku, Feb 17, 2026)
- `8fcb032`: Rename OPENJURY_EVAL_DATA to OPENJURY_DATA (ErlisLushtaku, Feb 17, 2026)
- `df958af`: Merge main (ErlisLushtaku, Feb 21, 2026)
- `35856f2`: Revert changes in gitignore (ErlisLushtaku, Feb 21, 2026)
- `6a11182`: Handle models with max_position_embeddings when we pass max_model_len (ErlisLushtaku, Feb 21, 2026)
- `fecd3ed`: Revert EuroLLM-9B-Instruct to EuroLLM-9B since there is a default cha… (ErlisLushtaku, Feb 21, 2026)
- `0b4eaec`: fix tests (ErlisLushtaku, Feb 22, 2026)
- `29340b0`: Change test github workflow to use uv instead of pip for a more robus… (ErlisLushtaku, Feb 22, 2026)
- `2c294f1`: Move dev dependencies to dependency-group (ErlisLushtaku, Feb 22, 2026)
- `4be61bf`: Revert comment removal (ErlisLushtaku, Feb 22, 2026)
- `51d2597`: Add pre-commit hook (ErlisLushtaku, Feb 22, 2026)
- `8dee7b2`: add project scripts and move slurmpilot to dev group (ErlisLushtaku, Feb 23, 2026)
- `fdc9410`: fix LlamaCpp bug with ChatTemplate (ErlisLushtaku, Mar 2, 2026)
- `48c5373`: Add MT-Bench multi-turn evaluation support (ErlisLushtaku, Mar 2, 2026)
- `648a9be`: Merge branch 'main' into erlislushtaku/feat/add-mt-bench-support (ErlisLushtaku, Mar 2, 2026)
- `14f747e`: fix result formatting (ErlisLushtaku, Mar 2, 2026)
- `e67ea79`: remove double environment variable (ErlisLushtaku, Mar 2, 2026)
- `4089be8`: remove accidental duplications (ErlisLushtaku, Mar 2, 2026)
- `03f5cce`: Refactor (ErlisLushtaku, Mar 4, 2026)
- `8ffe3a6`: Remove duplication between prompt templates (ErlisLushtaku, Mar 4, 2026)
- `b877f11`: add temperature argument (ErlisLushtaku, Mar 9, 2026)
- `c2056b5`: add option for making mt-bench consistent with the original one from … (ErlisLushtaku, Mar 9, 2026)
- `41cd15d`: Merge branch 'main' into erlislushtaku/feat/add-mt-bench-support (ErlisLushtaku, Mar 9, 2026)
- `0ca66c5`: remove redundant print statement (ErlisLushtaku, Mar 10, 2026)
- `a295305`: move mt-bench logic from the entrypoint (ErlisLushtaku, Mar 17, 2026)
- `0fb9700`: Remove stale unused entries for fastchat mode (ErlisLushtaku, Mar 17, 2026)
- `e5670ea`: Merge origin/main into erlislushtaku/feat/add-mt-bench-support (ErlisLushtaku, Mar 17, 2026)
- `6dd78fd`: Refactor mt-bench eval helpers into shared runtime module (ErlisLushtaku, Mar 17, 2026)
- `0094eea`: move cli args and parsing to separate util to remove dependencies on … (ErlisLushtaku, Mar 18, 2026)
21 changes: 20 additions & 1 deletion README.md
@@ -22,7 +22,7 @@ Compared to other libraries, here is a breakdown of features:
| **Arena-Hard-Auto** | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| **Lighteval** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| **Evalchemy** | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-| **OpenJury** | 🔜 | ✅ | ✅ | ✅ | ✅ | ✅ |
+| **OpenJury** | | ✅ | ✅ | ✅ | ✅ | ✅ |
> **Collaborator:** 💪
The table was compiled in October 2025; if some libraries have since implemented missing features, please open an issue
or send a PR and we will be happy to update the information.
@@ -191,10 +191,29 @@ python openjury/generate_and_evaluate.py \

This override applies to all vLLM models in the run. For remote providers (OpenAI, Together, OpenRouter), the flag is ignored since they handle templates server-side.

### MT-Bench (Multi-Turn Evaluation)

MT-Bench evaluates multi-turn conversation ability using 80 two-turn questions across 8 categories
(writing, roleplay, reasoning, math, coding, extraction, STEM, humanities).
It uses category-dependent judge prompts and reference answers for math/reasoning/coding.
Questions are automatically downloaded from the [LMSYS MT-Bench HuggingFace space](https://huggingface.co/spaces/lmsys/mt-bench).

```bash
uv run python openjury/generate_and_evaluate.py \
--dataset mt-bench \
--model_A VLLM/Qwen/Qwen2.5-7B-Instruct \
--model_B OpenRouter/openai/gpt-4o \
--judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
--n_instructions 10
```

Results include per-category and per-turn win rate breakdowns. Use `--swap_mode both` to correct for judge position bias.
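When reproducing the original MT-Bench setup, the sampling temperature varies by question category. The mapping below is a sketch of such a category-to-temperature config; the values follow FastChat's reference implementation as far as we know and should be treated as assumptions to verify upstream:

```python
# Per-category sampling temperatures, assumed from FastChat's MT-Bench
# reference implementation (verify against the upstream repo).
MT_BENCH_TEMPERATURES = {
    "writing": 0.7,
    "roleplay": 0.7,
    "extraction": 0.0,
    "math": 0.0,
    "coding": 0.0,
    "reasoning": 0.0,
    "stem": 0.1,
    "humanities": 0.1,
}


def temperature_for(category: str, default: float = 0.7) -> float:
    """Look up the sampling temperature for a category, falling back to a default."""
    return MT_BENCH_TEMPERATURES.get(category, default)
```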

## 📊 Supported Datasets

| Dataset | Description |
|-----------------------|------------------------------------------------------------------------------------------------|
| `mt-bench` | 80 multi-turn (2-turn) questions across 8 categories ([LMSYS MT-Bench](https://arxiv.org/abs/2306.05685)) |
| `alpaca-eval` | General instruction-following benchmark |
| `arena-hard` | More challenging evaluation suite |
| `m-arena-hard` | Translated version of Arena-Hard in 23 languages |
34 changes: 19 additions & 15 deletions openjury/evaluate.py
@@ -15,6 +15,7 @@
     data_root,
     download_hf,
     do_inference,
+    truncate,
 )
data_root,
download_hf,
do_inference,
truncate,
)


@@ -49,18 +50,29 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
         return float(m.group(group_index).strip(" "))


+_COMPLETION_LABEL_SINGLE = "Answer"
+_COMPLETION_LABEL_MULTI_TURN = "Conversation with User"
+_EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement"
+_SCORE_FENCE = "\n```"
+
+
 def load_judge_system_and_user_prompt(
     provide_explanation: bool = True,
+    multi_turn: bool = False,
 ) -> tuple[str, str]:
-    # Prepare judge
-    with open(Path(__file__).parent / "prompts" / "system-prompt.txt", "r") as f:
-        system_prompt = str(f.read())
+    prompts_dir = Path(__file__).parent / "prompts"
+
+    system_prompt = (prompts_dir / "system-prompt.txt").read_text()

-    prompt_filename = (
-        "prompt-with-explanation.txt" if provide_explanation else "prompt.txt"
-    )
-    with open(Path(__file__).parent / "prompts" / prompt_filename, "r") as f:
-        user_prompt_template = str(f.read())
+    user_prompt_template = (prompts_dir / "prompt.txt").read_text()
+    user_prompt_template = user_prompt_template.replace(
+        "{completion_label}",
+        _COMPLETION_LABEL_MULTI_TURN if multi_turn else _COMPLETION_LABEL_SINGLE,
+    )
+    user_prompt_template = user_prompt_template.replace(
+        "{explanation_suffix}",
+        _EXPLANATION_SUFFIX if provide_explanation else _SCORE_FENCE,
+    )

     return system_prompt, user_prompt_template
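The two placeholders let a single `prompt.txt` serve both the single-turn and multi-turn judge modes. A minimal sketch of the substitution, using a hypothetical template string rather than the real prompt file:

```python
# Hypothetical stand-in for prompts/prompt.txt (the real file ships with the repo).
TEMPLATE = "Compare the {completion_label} of both assistants{explanation_suffix}."

_COMPLETION_LABEL_SINGLE = "Answer"
_COMPLETION_LABEL_MULTI_TURN = "Conversation with User"
_EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement"
_SCORE_FENCE = "\n" + "`" * 3  # three backticks, built here to avoid closing this code fence


def render_user_prompt(template: str, multi_turn: bool, provide_explanation: bool) -> str:
    """Substitute both placeholders, mirroring load_judge_system_and_user_prompt."""
    label = _COMPLETION_LABEL_MULTI_TURN if multi_turn else _COMPLETION_LABEL_SINGLE
    suffix = _EXPLANATION_SUFFIX if provide_explanation else _SCORE_FENCE
    return template.replace("{completion_label}", label).replace("{explanation_suffix}", suffix)
```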

@@ -240,14 +252,6 @@ def annotate_battles(
         [("system", system_prompt), ("user", user_prompt_template)]
     )

-    def truncate(s: str, max_len: int | None = None):
-        if not isinstance(s, str):
-            return ""
-        if max_len is not None:
-            return s[:max_len]
-        else:
-            return s
-
     inputs = prompt_template.batch(
         [
             {
182 changes: 177 additions & 5 deletions openjury/generate.py
@@ -1,17 +1,59 @@
 import pandas as pd
 from langchain.prompts import ChatPromptTemplate
+from typing import Any

 from openjury.utils import (
     do_inference,
     make_model,
+    truncate,
 )


-def truncate(s: str, max_len: int | None = None):
-    if max_len is not None:
-        return s[:max_len]
-    else:
-        return s
+def _set_temperature_on_model(chat_model, temperature: float) -> None:
+    if hasattr(chat_model, "set_temperature"):
+        chat_model.set_temperature(temperature)
+        return
+    if hasattr(chat_model, "temperature"):
+        setattr(chat_model, "temperature", temperature)
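`_set_temperature_on_model` mutates an already-constructed local model rather than rebuilding it. A quick check of both paths with dummy objects (not real chat models):

```python
class DummyWithSetter:
    """Stand-in exposing an explicit setter, exercising the first branch."""
    def __init__(self) -> None:
        self.temperature = 0.0

    def set_temperature(self, t: float) -> None:
        self.temperature = t


class DummyWithAttr:
    """Stand-in with only a temperature attribute, exercising the setattr fallback."""
    def __init__(self) -> None:
        self.temperature = 0.0


def set_temperature_on_model(chat_model, temperature: float) -> None:
    # Mirrors _set_temperature_on_model above: prefer an explicit setter,
    # otherwise assign the attribute directly if it exists.
    if hasattr(chat_model, "set_temperature"):
        chat_model.set_temperature(temperature)
        return
    if hasattr(chat_model, "temperature"):
        setattr(chat_model, "temperature", temperature)


with_setter = DummyWithSetter()
with_attr = DummyWithAttr()
set_temperature_on_model(with_setter, 0.7)
set_temperature_on_model(with_attr, 0.1)
```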


def _infer_grouped_by_temperature(
*,
model_spec: str,
provider: str,
max_tokens: int | None,
model_kwargs: dict[str, Any],
base_model,
inputs: list,
temperatures: list[float],
use_tqdm: bool,
) -> list[str]:
outputs: list[str] = [""] * len(inputs)
groups: dict[float, list[int]] = {}
for idx, temp in enumerate(temperatures):
groups.setdefault(float(temp), []).append(idx)

for temp in sorted(groups.keys()):
idxs = groups[temp]
group_inputs = [inputs[i] for i in idxs]

if provider in {"VLLM", "LlamaCpp"}:
_set_temperature_on_model(base_model, temp)
group_model = base_model
else:
group_model = make_model(
model_spec, max_tokens=max_tokens, temperature=temp, **model_kwargs
)

group_outs = do_inference(
chat_model=group_model,
inputs=group_inputs,
use_tqdm=use_tqdm,
)
for i, out in zip(idxs, group_outs):
outputs[i] = out

return outputs
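The grouping logic above batches same-temperature rows through one inference call each, then restores the original row order. A self-contained sketch with a fake inference function standing in for `do_inference`:

```python
def infer_grouped(inputs: list[str], temperatures: list[float], run_batch) -> list[str]:
    """Group indices by temperature, run each group once, reassemble in order.
    run_batch(batch, temp) is a stand-in for do_inference with a configured model."""
    outputs: list[str] = [""] * len(inputs)
    groups: dict[float, list[int]] = {}
    for idx, temp in enumerate(temperatures):
        groups.setdefault(float(temp), []).append(idx)
    for temp in sorted(groups):
        idxs = groups[temp]
        outs = run_batch([inputs[i] for i in idxs], temp)
        for i, out in zip(idxs, outs):
            outputs[i] = out
    return outputs


# Fake inference: tag each input with the temperature it ran at.
fake = lambda batch, temp: [f"{x}@{temp}" for x in batch]
result = infer_grouped(["a", "b", "c"], [0.7, 0.0, 0.7], fake)
# result preserves input order even though "b" ran in a different group
```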


def generate_instructions(
@@ -57,6 +99,136 @@ def generate_instructions(
return df_outputs


def generate_multiturn(
questions: pd.DataFrame,
model: str,
truncate_input_chars: int | None = 8192,
max_tokens: int | None = 8192,
use_tqdm: bool = True,
temperature_config: dict[str, float] | None = None,
**model_kwargs,
) -> pd.DataFrame:
"""Generate two-turn completions for MT-Bench style questions.

Generates turn 1 answers first, then uses them as conversation context
to generate turn 2 answers.

Args:
questions: DataFrame with columns turn_1, turn_2, and index instruction_index.
model: Model specification string (e.g. "VLLM/model-name").
temperature_config: Optional category -> temperature mapping. When set,
inputs are inferred in temperature-homogeneous groups to match
MT-Bench/FastChat category defaults.
**model_kwargs: Provider-specific options forwarded to make_model
(e.g. max_model_len, chat_template for VLLM).
Returns:
DataFrame with columns: instruction_index, completion_turn_1, completion_turn_2
"""
provider = model.split("/")[0]
use_category_temperatures = temperature_config is not None
local_provider = provider in {"VLLM", "LlamaCpp"}

chat_model = None
if use_category_temperatures and local_provider:
chat_model = make_model(model, max_tokens=max_tokens, temperature=0.0, **model_kwargs)
else:
chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs)

system_prompt = "You are a helpful assistant."
> **kargibora** (Collaborator, Mar 3, 2026): Maybe we can use a better `system_prompt`. What does MT-Bench use?
>
> **Collaborator:** Good point; we also have a naive default in general (not blocking for this PR, as we can change/improve it later). Using the Arena-Hard prompt would make the most sense to me, since that benchmark is in some sense more refined than MT-Bench.
>
> **ErlisLushtaku** (Author, Mar 11, 2026): I added the mt-bench prompts (and other changes to reproduce their setup) here.

idxs = questions.index.tolist()
temperatures: list[float] = []
if use_category_temperatures:
temperatures = [
temperature_config.get(str(questions.loc[idx].get("category") or ""), 0.7)
for idx in idxs
]

turn1_template = ChatPromptTemplate.from_messages(
[("system", system_prompt), ("user", "{user_prompt}")]
)

turn1_inputs = turn1_template.batch(
[
{"user_prompt": truncate(row["turn_1"], max_len=truncate_input_chars)}
for _, row in questions.iterrows()
]
)

print(f"Generating turn 1 completions ({len(turn1_inputs)} questions).")
if use_category_temperatures:
completions_turn_1 = _infer_grouped_by_temperature(
model_spec=model,
provider=provider,
max_tokens=max_tokens,
model_kwargs=model_kwargs,
base_model=chat_model,
inputs=turn1_inputs,
temperatures=temperatures,
use_tqdm=use_tqdm,
)
else:
completions_turn_1 = do_inference(
chat_model=chat_model,
inputs=turn1_inputs,
use_tqdm=use_tqdm,
)

turn2_inputs = []
for (_, row), t1_answer in zip(questions.iterrows(), completions_turn_1):
if row["turn_2"] is None:
turn2_inputs.append(
turn1_template.invoke(
{"user_prompt": "No follow-up question."}
)
)
else:
multi_turn_template = ChatPromptTemplate.from_messages(
[
("system", system_prompt),
("user", "{turn_1}"),
("assistant", "{turn_1_answer}"),
("user", "{turn_2}"),
]
)
turn2_inputs.append(
multi_turn_template.invoke(
{
"turn_1": truncate(row["turn_1"], max_len=truncate_input_chars),
"turn_1_answer": truncate(str(t1_answer), max_len=truncate_input_chars),
"turn_2": truncate(row["turn_2"], max_len=truncate_input_chars),
}
)
)

print(f"Generating turn 2 completions ({len(turn2_inputs)} questions).")
if use_category_temperatures:
completions_turn_2 = _infer_grouped_by_temperature(
model_spec=model,
provider=provider,
max_tokens=max_tokens,
model_kwargs=model_kwargs,
base_model=chat_model,
inputs=turn2_inputs,
temperatures=temperatures,
use_tqdm=use_tqdm,
)
else:
completions_turn_2 = do_inference(
chat_model=chat_model,
inputs=turn2_inputs,
use_tqdm=use_tqdm,
)

df_outputs = pd.DataFrame(
data={
"instruction_index": idxs,
"completion_turn_1": completions_turn_1,
"completion_turn_2": completions_turn_2,
},
)
return df_outputs


def generate_base(
instructions: pd.Series,
model: str,