Commit 3138c7c

addressing comments and fixing impl
1 parent 7485c14 commit 3138c7c

File tree: 4 files changed (+20, -50 lines)


src/lighteval/tasks/tasks/long_horizon_execution/constants.py

Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@

 PROMPT_TEMPLATE_MULTI_START = """You are an AI assistant. I will provide you with a dictionary and then give you keys in groups of {k}.
 Your task is to keep a running total (starting from 0) by adding the values associated with the keys I provide.
-In each turn, I'll provide {k} keys (comma-separated).
+In each turn, I'll provide {k} key(s) (comma-separated).
 Respond with the current running sum, enclosed in <answer> tags.

 Dictionary to maintain:
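
The reworded sentence stays grammatical for every group size, including k=1. A minimal sketch of the effect, formatting only the line this hunk touches (the full template has more placeholders, e.g. {dict_str}, not exercised here):

line = "In each turn, I'll provide {k} key(s) (comma-separated)."
print(line.format(k=1))  # In each turn, I'll provide 1 key(s) (comma-separated).
print(line.format(k=5))  # In each turn, I'll provide 5 key(s) (comma-separated).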

src/lighteval/tasks/tasks/long_horizon_execution/main.py

Lines changed: 1 addition & 27 deletions

@@ -44,37 +44,11 @@
 from lighteval.metrics.metrics import Metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks.long_horizon_execution.constants import CONTEXT_SIZES
+from lighteval.tasks.tasks.long_horizon_execution.constants import CONTEXT_SIZES, PROMPT_TEMPLATE_SINGLE
 from lighteval.tasks.tasks.long_horizon_execution.multi_turn import create_multi_turn_tasks
 from lighteval.tasks.tasks.long_horizon_execution.utils import _build_prompt_and_target


-# Single-turn prompt template
-PROMPT_TEMPLATE_SINGLE = """You are an AI assistant. I will provide you with a dictionary and then give you a list of keys.
-Your task is to calculate the final cumulative sum after processing all keys in order.
-
-For each key in the list, you need to:
-1. Look up the value in the dictionary
-2. Add it to the running sum
-3. After processing all keys, output the final cumulative sum
-
-Dictionary to use:
-{dict_str}
-
-Keys to process in order:
-{keys_str}
-
-Your task: Process all keys in order and calculate the final cumulative sum after processing all {num_keys} keys.
-
-IMPORTANT:
-- Output your answer as a single integer value inside <answer></answer> tags
-- Do not include any other text outside the answer tags
-- Format: <answer>final_sum</answer>
-- Example: If the final cumulative sum is 42, output: <answer>42</answer>
-
-Your answer:"""
-
-
 def single_turn_prompt_function(line, prompt_length=32768, task_name: str = None):
     """
     Prompt function for single-turn evaluation (non-inspect-ai backend).
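
The single-turn template now has one home in constants.py instead of a second copy here. A minimal usage sketch, assuming the moved definition kept the {dict_str}, {keys_str}, and {num_keys} placeholders visible in the removed block (the dictionary and keys below are made up):

from lighteval.tasks.tasks.long_horizon_execution.constants import PROMPT_TEMPLATE_SINGLE

# Toy inputs standing in for a dataset record.
dictionary = {"alpha": 3, "beta": -1, "gamma": 4}
keys = ["alpha", "gamma", "alpha"]

prompt = PROMPT_TEMPLATE_SINGLE.format(
    dict_str=str(dictionary),
    keys_str=", ".join(keys),
    num_keys=len(keys),
)
# Expected final sum for this toy example: 3 + 4 + 3 = 10.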

src/lighteval/tasks/tasks/long_horizon_execution/multi_turn.py

Lines changed: 13 additions & 14 deletions

@@ -67,14 +67,14 @@ def _extract_response_content(response):
     return str(response)


-async def _process_single_turn(state, turn_chunk, generate):
+async def _process_single_turn(state, turn_chunk, generate_fn):
     """Process a single turn: add user message, get model response, add assistant message."""
     keys_str = ", ".join(turn_chunk)
     followup_prompt = PROMPT_TEMPLATE_MULTI_FOLLOWUP.format(keys_str=keys_str)
     state.messages.append(ChatMessageUser(content=followup_prompt))

-    # generate() takes the state and returns updated state with assistant message added
-    updated_state = await generate(state)
+    # generate_fn() takes the state and returns updated state with assistant message added
+    updated_state = await generate_fn(state)
     turn_response = _extract_response_content(updated_state.output.completion if updated_state.output else "")

     return updated_state, turn_response
@@ -91,7 +91,7 @@ def multi_turn_solver():
     async def solve(state: TaskState, generate: Generate):
         turn_chunks = state.metadata.get("turn_chunks", [])

-        if not turn_chunks or len(turn_chunks) == 0:
+        if not turn_chunks:
             return state

         # Initialize messages
@@ -129,7 +129,7 @@ async def solve(state: TaskState, generate: Generate):
     return solve


-@scorer(metrics={"turn_accuracy": [accuracy(), stderr()], "fractional_accuracy": [accuracy(), stderr()]})
+@scorer(metrics={"fractional_accuracy": [accuracy(), stderr()]})
 def multi_turn_scorer():
     """
     Scorer for multi-turn Long Horizon Execution task.
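
Dropping turn_accuracy removes a duplicate: the old scorer reported the same fractional value under both keys, and the correct_turns/total_turns entries had no metrics registered for them. With a dict-valued Score, each key in the decorator's metrics mapping is aggregated independently; here is that aggregation in plain Python (illustrative per-sample numbers, not the inspect_ai implementation):

from statistics import mean, stdev

# fractional_accuracy from three hypothetical samples
per_sample = [1.0, 0.8, 0.6]
print(mean(per_sample))                            # accuracy() -> 0.8
print(stdev(per_sample) / len(per_sample) ** 0.5)  # stderr()   -> ~0.1155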
@@ -143,19 +143,23 @@ async def score(state: TaskState, target: Target):
         expected_per_turn = state.metadata.get("expected_per_turn", [])

         if not all_turn_outputs:
-            return Score(value=0.0, answer="", explanation="No turn outputs found in state.metadata")
+            return Score(
+                value={"fractional_accuracy": 0.0},
+                answer="",
+                explanation="No turn outputs found in state.metadata",
+            )

         if len(all_turn_outputs) != len(expected_per_turn):
             return Score(
-                value=0.0,
+                value={"fractional_accuracy": 0.0},
                 answer="",
                 explanation=f"Mismatch: {len(all_turn_outputs)} outputs vs {len(expected_per_turn)} expected turns",
             )

         parsed_outputs = []
         answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)

-        for turn_idx, turn_output in enumerate(all_turn_outputs):
+        for turn_output in all_turn_outputs:
             match = answer_pattern.search(turn_output)
             if match:
                 try:
@@ -177,12 +181,7 @@ async def score(state: TaskState, target: Target):
         fractional_accuracy = correct_turns / len(expected_per_turn) if expected_per_turn else 0.0

         return Score(
-            value={
-                "turn_accuracy": fractional_accuracy,
-                "fractional_accuracy": fractional_accuracy,
-                "correct_turns": correct_turns,
-                "total_turns": len(expected_per_turn),
-            },
+            value={"fractional_accuracy": fractional_accuracy},
             answer=str(parsed_outputs),
             explanation=f"Correct {correct_turns}/{len(expected_per_turn)} turns. Details: {turn_results}",
         )
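
For reference, the scorer's per-turn logic distilled into a standalone function. A sketch: the parse-and-compare code between the two hunks above is elided from this diff, so the int() handling is an assumption; the regex and the correct_turns / len(expected_per_turn) ratio are taken verbatim.

import re

ANSWER_PATTERN = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)

def fractional_accuracy(outputs: list[str], expected: list[int]) -> float:
    """Share of turns whose <answer> tag holds the expected running sum."""
    if not outputs or len(outputs) != len(expected):
        return 0.0
    correct = 0
    for out, exp in zip(outputs, expected):
        match = ANSWER_PATTERN.search(out)
        if match:
            try:
                if int(match.group(1).strip()) == exp:
                    correct += 1
            except ValueError:
                pass  # malformed answer counts as an incorrect turn
    return correct / len(expected)

# Two of three turns correct -> 2/3.
print(fractional_accuracy(["<answer>3</answer>", "<answer>10</answer>", "oops"], [3, 10, 12]))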

src/lighteval/tasks/tasks/long_horizon_execution/utils.py

Lines changed: 5 additions & 8 deletions

@@ -126,7 +126,9 @@ def build_initial_prompt_for_n(n):
         keys_str = ", ".join(first_turn_keys)

         return PROMPT_TEMPLATE_MULTI_START.format(
-            dict_str=dict_str, keys_str=keys_str, k=k, num_keys=len(first_turn_keys)
+            dict_str=dict_str,
+            keys_str=keys_str,
+            k=k,
         )

     return _binary_search_max_items(input_keys, build_initial_prompt_for_n, prompt_length, min_items=k)
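
Only the call site of _binary_search_max_items appears in this diff, but the idea is recoverable: grow n until build_initial_prompt_for_n(n) no longer fits in prompt_length. A hypothetical sketch of that search (the body, and the character-based length check, are assumptions):

def _binary_search_max_items(items, build_prompt, prompt_length, min_items=1):
    """Largest n in [min_items, len(items)] whose rendered prompt still fits."""
    lo, hi, best = min_items, len(items), min_items
    while lo <= hi:
        mid = (lo + hi) // 2
        if len(build_prompt(mid)) <= prompt_length:  # fits: try more items
            best, lo = mid, mid + 1
        else:                                        # too long: back off
            hi = mid - 1
    return best

Binary search is valid here because prompt length is monotonic in n: adding keys can only make the rendered prompt longer.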
@@ -169,7 +171,6 @@ def _build_multi_turn_prompts(record, prompt_length=32768, k=1):
     """
     input_keys = record["input"]
     input_values = record["values"]
-    expected_output = record["output"]

     # Handle empty input case
     if len(input_keys) == 0:
@@ -181,21 +182,17 @@ def _build_multi_turn_prompts(record, prompt_length=32768, k=1):
     # Use the maximum n that fits
     input_keys = input_keys[:max_n]
     input_values = input_values[:max_n]
-    expected_output = expected_output[:max_n]

-    turn_chunks, value_chunks, expected_per_turn = _chunk_and_calculate_expected(input_keys, input_values, k)
+    turn_chunks, _, expected_per_turn = _chunk_and_calculate_expected(input_keys, input_values, k)

     dictionary = dict(zip(input_keys, input_values))
     dict_str = str(dictionary)

     first_turn_keys_str = ", ".join(turn_chunks[0])
-    initial_prompt = PROMPT_TEMPLATE_MULTI_START.format(
-        dict_str=dict_str, keys_str=first_turn_keys_str, k=k, num_keys=len(turn_chunks[0])
-    )
+    initial_prompt = PROMPT_TEMPLATE_MULTI_START.format(dict_str=dict_str, keys_str=first_turn_keys_str, k=k)

     metadata = {
         "turn_chunks": turn_chunks,
-        "value_chunks": value_chunks,
         "expected_per_turn": expected_per_turn,
         "dictionary": dictionary,
         "k": k,
