src/lighteval/tasks/tasks/multi_challenge.py (new file, +168 lines)

"""
name:
MultiChallenge

dataset:
nmayorga7/multichallenge

abstract:
MultiChallenge evaluates large language models (LLMs) on their ability to
conduct multi-turn conversations with human users. The model is shown a
multi-turn conversation and must produce a free-form continuation. Each
instance is associated with a target question belonging to one or more
axes (categories), and a secondary judge model determines whether the
response satisfies that question's pass criteria.

languages:
english

tags:
conversational, generation

paper:
https://arxiv.org/abs/2501.17399
"""

import re

from inspect_ai.dataset import Sample
from inspect_ai.model import get_model
from inspect_ai.scorer import Score, Target, accuracy, scorer, stderr
from inspect_ai.solver import TaskState, generate

from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


JUDGE_PROMPT = """You are tasked with evaluating a model response to see if it meets a specific criteria.
The criteria will always be a YES/NO evaluation.

The model response is as follows:
<MODEL_RESPONSE>
{}
</MODEL_RESPONSE>

The criteria that the model response must meet is as follows. Be VERY STRICT!:
<CRITERIA>
{}
</CRITERIA>

Print your reasoning followed by your verdict, either "YES" or "NO"."""
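
# Note: the scorer fills the two positional placeholders above via
# JUDGE_PROMPT.format(response, target_question), so the model response lands
# in <MODEL_RESPONSE> and the target question in <CRITERIA>.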


def format_conversation(conversation: list[dict]) -> str:
    """Format conversation messages into a single string for model input."""
    formatted_messages = []
    for msg in conversation:
        role = msg["role"].upper()
        content = msg["content"]
        formatted_messages.append(f"{role}:\n{content}")

    return "\n\n".join(formatted_messages)


def multi_challenge_prompt(line, task_name: str = None):
    """Convert a dataset row into a Doc object."""

    conversation = line["CONVERSATION"]
    formatted_conv = format_conversation(conversation)
    return Doc(
        task_name=task_name,
        query=formatted_conv,
        instruction=None,
        specific={
            "question_id": line["QUESTION_ID"],
            "axis": line["AXIS"],
            "target_question": line["TARGET_QUESTION"],
            "pass_criteria": line["PASS_CRITERIA"],
            "conversation": conversation,
        },
    )


@scorer(metrics=[accuracy(), stderr()])
def multi_challenge_scorer():
    async def score(state: TaskState, target: Target):
        response = state.output.completion

        target_question = target.text
        pass_criteria = state.metadata.get("pass_criteria", "YES")

        if not target_question:
            return Score(
                value="I",
                answer=response,
                explanation="Target question not found.",
            )

        try:
            # The judge model is pinned here; inspect-ai resolves "openai/..."
            # model names through its OpenAI provider, so OPENAI_API_KEY must
            # be set for scoring to work.
            judge_model = get_model("openai/gpt-4o-2024-08-06")
            judge_prompt = JUDGE_PROMPT.format(response, target_question)

            judge_result = await judge_model.generate(judge_prompt)
            judge_output = judge_result.completion

            # The judge is instructed to print its reasoning before the
            # verdict, so take the last YES/NO in the output; the first match
            # may occur inside the reasoning itself.
            verdict_matches = re.findall(r"\b(YES|NO)\b", judge_output, re.IGNORECASE)

            if not verdict_matches:
                return Score(
                    value="I",
                    answer=response,
                    explanation=f"Could not extract verdict from judge output: {judge_output}.",
                )

            judge_verdict = verdict_matches[-1].upper()
            passed = judge_verdict == pass_criteria

            # "C"/"I" are inspect-ai's standard correct/incorrect score values,
            # which accuracy() aggregates into a pass rate.
            return Score(
                value="C" if passed else "I",
                answer=response,
                explanation=f"Judge verdict: {judge_verdict}, Expected: {pass_criteria}, Response: {response}.",
            )

        except Exception as e:
            return Score(
                value="I",
                answer=response,
                explanation=f"Error during judge evaluation: {str(e)}.",
            )

    return score


def record_to_sample(record: dict) -> Sample:
    """Convert dataset record to inspect-ai Sample object."""
    conversation = record["CONVERSATION"]
    formatted_conv = format_conversation(conversation)

    return Sample(
        input=formatted_conv,
        target=record["TARGET_QUESTION"],
        metadata={
            "question_id": record["QUESTION_ID"],
            "axis": record["AXIS"],
            "pass_criteria": record["PASS_CRITERIA"],
            "conversation": conversation,
        },
    )
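
# Note: record_to_sample mirrors multi_challenge_prompt, exposing the same
# fields as an inspect-ai Sample; presumably this lets the task run on both
# the native lighteval pipeline (via prompt_function) and the inspect-ai
# backend (via sample_fields/solver/scorer).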


multi_challenge = LightevalTaskConfig(
    name="multi_challenge",
    prompt_function=multi_challenge_prompt,
    hf_repo="nmayorga7/multichallenge",
    hf_subset="default",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=2048,
    stop_sequence=[],
    version=0,
    sample_fields=record_to_sample,
    metrics=[],  # Metrics are defined in the scorer decorator for inspect-ai tasks
    solver=[generate(cache=True)],
    scorer=multi_challenge_scorer(),
)

TASKS_TABLE = [multi_challenge]
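
# A minimal sketch of running this task (hypothetical invocation; the exact
# CLI entry point and flags depend on your lighteval version and backend):
#
#   export OPENAI_API_KEY=...   # required by the GPT-4o judge in the scorer
#   lighteval eval <model> multi_challenge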