openai
diff --git a/‎evals/elsuite/hr_ml_agent_bench/.gitignore‎
Lines changed: 7 additions & 0 deletions b/‎evals/elsuite/hr_ml_agent_bench/.gitignore‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎evals/elsuite/hr_ml_agent_bench/README.md‎
Lines changed: 226 additions & 0 deletions b/‎evals/elsuite/hr_ml_agent_bench/README.md‎
Lines changed: 226 additions & 0 deletions
diff --git a/‎evals/elsuite/hr_ml_agent_bench/__init__.py‎ b/‎evals/elsuite/hr_ml_agent_bench/__init__.py‎
diff --git a/‎evals/elsuite/hr_ml_agent_bench/actions.py‎
Lines changed: 60 additions & 0 deletions b/‎evals/elsuite/hr_ml_agent_bench/actions.py‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎evals/elsuite/hr_ml_agent_bench/auto_marking.py‎
Lines changed: 79 additions & 0 deletions b/‎evals/elsuite/hr_ml_agent_bench/auto_marking.py‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎evals/elsuite/hr_ml_agent_bench/autoeval.py‎
Lines changed: 214 additions & 0 deletions b/‎evals/elsuite/hr_ml_agent_bench/autoeval.py‎
Lines changed: 214 additions & 0 deletions
diff --git a/‎evals/elsuite/hr_ml_agent_bench/benchmarks/__init__.py‎ b/‎evals/elsuite/hr_ml_agent_bench/benchmarks/__init__.py‎
@@ -0,0 +1,7 @@
+benchmarks/babylm/env/babylm_data
+benchmarks/**/prepared
+benchmarks/**/submission.txt
+benchmarks/**/*.checkpoint
+benchmarks/**/*.log
+scripts/**/*.log
+data
@@ -0,0 +1,60 @@
+import json
+import re
+from typing import Optional
+
+from evals.elsuite.hr_ml_agent_bench.high_level_actions import HIGH_LEVEL_ACTIONS
+from evals.elsuite.hr_ml_agent_bench.low_level_actions import LOW_LEVEL_ACTIONS
+from evals.elsuite.hr_ml_agent_bench.schema import Action
+
+# Every action the agent may invoke: the low-level actions followed by the
+# high-level ones. `is_valid_action` looks actions up in this list by name.
+ACTION_SPACE = LOW_LEVEL_ACTIONS + HIGH_LEVEL_ACTIONS
+
+
+def make_action_string(name: str, args: dict) -> str:
+    """Render an action as text: an `Action:` line followed by `Action Input:` with the args as indented JSON."""
+    header = f"Action: {name}"
+    payload = json.dumps(args, indent=4)
+    return "\n".join((header, f"Action Input: {payload}"))
+
+
+def get_action(s: str) -> Optional[Action]:
+    """Return an `Action` object from a string representation of an action, if it exists.
+
+    The string is expected to contain an `Action: <name>` line, optionally
+    followed by `Action Input: <args>`.
+
+    Returns:
+        `None` if no `Action:` marker is found. Otherwise an `Action` whose
+        `args` is:
+          * the decoded JSON value when the input parses as JSON,
+          * the raw (stripped) input string when it does not, or
+          * `None` when there is no `Action Input:` section at all.
+    """
+
+    action_pattern = r"Action:\s*(.+)"
+    args_pattern = r"Action Input:\s*(\{.*?\}|\S.*)"
+
+    action_match = re.search(action_pattern, s)
+
+    if not action_match:
+        return None
+
+    action_name = action_match.group(1).strip()
+    action_args = None
+
+    args_match = re.search(args_pattern, s, re.DOTALL)
+
+    if args_match:
+        decoded = False
+        remainder = s[args_match.start(1) :]
+
+        # The non-greedy `\{.*?\}` stops at the first `}`, which truncates
+        # nested objects and string values containing `}`. Decode the first
+        # complete JSON object instead; trailing text after it is ignored.
+        if remainder.startswith("{"):
+            try:
+                action_args, _ = json.JSONDecoder().raw_decode(remainder)
+                decoded = True
+            except json.JSONDecodeError:
+                decoded = False
+
+        if not decoded:
+            args_str = args_match.group(1).strip()
+
+            try:
+                action_args = json.loads(args_str)
+            except json.JSONDecodeError:
+                action_args = args_str  # Return raw string if JSON parsing fails
+
+    return Action(name=action_name, args=action_args)
+
+
+def is_valid_action(action: Action) -> bool:
+    """Return True if the action has a valid name and arguments, False otherwise.
+
+    An action is valid when its name matches an entry in `ACTION_SPACE` and
+    its argument keys are exactly the keys of that entry's `usage` mapping.
+    """
+
+    assert isinstance(action, Action)
+
+    # `get_action` can yield `None` (no "Action Input" section), a raw string
+    # (unparseable JSON) or any non-object JSON value; none of these carry
+    # named arguments, so they can never match an action's usage.
+    if not isinstance(action.args, dict):
+        return False
+
+    for valid_action in ACTION_SPACE:
+        if action.name == valid_action.name:
+            # Only the first action with a matching name is considered.
+            return action.args.keys() == valid_action.usage.keys()
+
+    return False
@@ -0,0 +1,79 @@
+import importlib
+import json
+from dataclasses import dataclass
+from json import JSONDecodeError
+from pathlib import Path
+
+
+@dataclass(frozen=True)
+class EvaluationResult:
+    """Immutable bundle of scores returned by `grade_submission` for one task."""
+
+    # Raw scores in the original unit of the task.
+    model_score: float
+    naive_baseline_score: float
+    human_baseline_score: float
+    # Normalized scores, as returned by the task's `normalize_score`
+    # (intended range [0, 1], where higher is better).
+    model_score_normalized: float
+    naive_baseline_score_normalized: float
+    human_baseline_score_normalized: float
+    # Human-relative score: (model - naive) / (human - naive) on the
+    # normalized scores, so 0 is the naive baseline and 1 is the human
+    # baseline. It exceeds 1 if the model beats the human baseline.
+    model_score_humanrelative: float
+
+
+def grade_submission(log_dir: Path, task_name: str) -> EvaluationResult:
+    """
+    Grade the submission found under `log_dir` with the grading script of `task_name`.
+
+    Every per-step snapshot directory referenced by the single `trace.json`
+    under `log_dir` is scored, and the best normalized score is kept. The
+    naive baseline is the starting point, so the reported model score is
+    never below it.
+    """
+
+    # TODO: refactor this to not dynamically import the grade module
+    module_path = f"evals.elsuite.hr_ml_agent_bench.benchmarks.{task_name}.scripts.grade"
+    grader = importlib.import_module(module_path)
+
+    # Baselines, both raw and normalized.
+    naive_raw = grader.get_naive_baseline_score()
+    naive_norm = grader.normalize_score(naive_raw)
+    human_raw = grader.get_human_baseline_score()
+    human_norm = grader.normalize_score(human_raw)
+
+    trace_paths = list(log_dir.glob("**/trace.json"))
+
+    assert (
+        len(trace_paths) == 1
+    ), f"Expected to find exactly one submission. Found {len(trace_paths)}."
+
+    top_raw, top_norm = naive_raw, naive_norm
+
+    for trace_path in trace_paths:
+        raw_text = trace_path.read_text()
+
+        try:
+            payload = json.loads(raw_text)
+        except JSONDecodeError:
+            # An unreadable trace contributes no candidate scores.
+            continue
+
+        snapshots_root = trace_path.parent / "traces"
+
+        for step_idx in range(len(payload["steps"])):
+            candidate_raw = grader.get_score(snapshots_root / f"step_{step_idx}_files")
+            candidate_norm = grader.normalize_score(candidate_raw)
+
+            if candidate_norm > top_norm:
+                top_raw, top_norm = candidate_raw, candidate_norm
+
+    # Position of the best score between the naive (0) and human (1) baselines.
+    gap_to_naive = top_norm - naive_norm
+    human_gap = human_norm - naive_norm
+    humanrelative = gap_to_naive / human_gap
+
+    return EvaluationResult(
+        model_score=top_raw,
+        naive_baseline_score=naive_raw,
+        human_baseline_score=human_raw,
+        model_score_normalized=top_norm,
+        naive_baseline_score_normalized=naive_norm,
+        human_baseline_score_normalized=human_norm,
+        model_score_humanrelative=humanrelative,
+    )
@@ -0,0 +1,214 @@
+import json
+import time
+from dataclasses import dataclass, replace
+from logging import getLogger
+from pathlib import Path
+
+from evals.elsuite.hr_ml_agent_bench.actions import get_action, is_valid_action
+from evals.elsuite.hr_ml_agent_bench.auto_marking import EvaluationResult, grade_submission
+from evals.elsuite.hr_ml_agent_bench.environment import Environment
+from evals.elsuite.hr_ml_agent_bench.prompts import get_task_description
+from evals.elsuite.hr_ml_agent_bench.schema import ActionInfo
+from evals.solvers.solver import Solver
+from evals.task_state import Message, TaskState
+
+# Module-level logger, named after this module.
+logger = getLogger(__name__)
+
+
+# One executed step of the agent: its index, the action taken and the
+# resulting observation.
+# NOTE(review): `run` currently appends plain dicts with these same three keys
+# to `history_steps` rather than `Step` instances — confirm which form the
+# solvers expect.
+@dataclass(frozen=True)
+class Step:
+    step_idx: int
+    action: dict[str, str]
+    observation: str
+
+
+# Immutable per-task state handed to the solver as `TaskState.current_state`.
+@dataclass(frozen=True)
+class TaskStateMetadata:
+    # Executed steps so far; `run` starts with an empty tuple and extends it
+    # by one entry per executed action.
+    history_steps: tuple[Step, ...]
+    # Action descriptions, taken from `env.action_infos` in `run`.
+    actions: dict[str, ActionInfo]
+    # Passed through unchanged from the `run` arguments of the same names.
+    max_steps_in_context: int
+    max_retries: int
+    max_steps: int
+    # Root log directory given to `run`.
+    log_dir: Path
+    # The environment that actions are executed against.
+    env: Environment
+
+
+# Name and arguments of a function call.
+# NOTE(review): not referenced anywhere in the visible part of this module.
+@dataclass(frozen=True)
+class FunctionCall:
+    name: str
+    args: dict[str, str]
+
+
+def run(
+    solver: Solver,
+    task_name: str,
+    research_problem: str,
+    log_dir: Path,
+    work_dir: Path,
+    max_steps: int,
+    max_time: int,
+    max_seconds_per_step: int,
+    device: int = 0,
+    python_command: str = "python",
+    resume: bool = False,
+    resume_step: int = 0,
+    max_steps_in_context: int = 3,
+    max_retries: int = 5,
+) -> EvaluationResult:
+    """Evaluates the solver on a given task.
+
+    On each step the solver is shown the conversation so far plus the
+    remaining time and steps. Its output is parsed into an action, the action
+    is executed in the environment, and the observation is appended to the
+    conversation. Once the loop ends, the submission is graded.
+
+    Args:
+        solver: Called with the current `TaskState`; its `.output` is parsed
+            for an action.
+        task_name: Task identifier; selects the working sub-directory and the
+            grading script.
+        research_problem: Text used to build the task description.
+        log_dir: Root directory for logs; the environment logs to
+            `log_dir / "env_log"` and grading reads from `log_dir`.
+        work_dir: Parent directory of the task's working directory.
+        max_steps: Maximum number of solver calls.
+        max_time: Time budget in seconds for the entire attempt.
+        max_seconds_per_step: Time budget in seconds for a single action.
+        device, python_command, resume, resume_step: Forwarded to `Environment`.
+        max_steps_in_context, max_retries: Stored in the task state for the
+            solver; not otherwise used here.
+
+    Returns:
+        The `EvaluationResult` produced by `grade_submission`.
+    """
+
+    env = Environment(
+        log_dir=log_dir / "env_log",
+        work_dir=work_dir / task_name,
+        task=task_name,
+        python_command=python_command,
+        resume=resume,
+        resume_step=resume_step,
+        device=device,
+        max_steps=max_steps,
+        max_time=max_time,
+        solver=solver,
+    )
+
+    task_description = get_task_description(research_problem)
+
+    logger.info(task_description)
+
+    messages = [
+        Message(
+            role="system",
+            content=f"You have a maximum of {max_steps} steps to solve the task. "
+            f"Each step is subject to a maximum time limit of {max_seconds_per_step} "
+            f"seconds. Additionally, your entire attempt is subject to a maximum "
+            f"time limit of {max_time} seconds.",
+        ),
+    ]
+
+    task_state = TaskState(
+        task_description=task_description,
+        messages=messages,
+        current_state=TaskStateMetadata(
+            history_steps=(),
+            actions=env.action_infos,
+            max_steps_in_context=max_steps_in_context,
+            max_retries=max_retries,
+            max_steps=max_steps,
+            log_dir=log_dir,
+            env=env,
+        ),
+    )
+
+    start_time = time.time()
+
+    for step in range(max_steps):
+        time_elapsed = time.time() - start_time
+        time_remaining = max_time - time_elapsed
+
+        # Tell the solver how much budget is left before asking for an action.
+        task_state = replace(
+            task_state,
+            messages=task_state.messages
+            + [
+                Message(
+                    role="system",
+                    content=f"You have {time_remaining:.2f} seconds and {max_steps - step} steps remaining.",
+                ),
+            ],
+        )
+
+        result = solver(task_state)
+        action = get_action(result.output)
+
+        if action is None or not is_valid_action(action):
+            logger.info(
+                f"Step {step}: [invalid response; no action executed]\n\nAssistant:\n{result.output}"
+            )
+
+            new_messages = task_state.messages + [
+                Message(
+                    role="assistant",
+                    content=result.output,
+                ),
+                Message(
+                    role="system",
+                    content="No valid action found! The list of valid actions was specified at the start; please pick an action from that list.",
+                ),
+            ]
+
+            task_state = TaskState(
+                task_description=task_state.task_description,
+                messages=new_messages,
+                current_state=task_state.current_state,
+            )
+
+            # An invalid response still consumes a step, but nothing is
+            # executed or saved.
+            continue
+
+        logger.info(f"\n\nAction: {action.name}\nAction Input: {json.dumps(action.args)}\n")
+
+        # A step may never run longer than what is left of the overall budget.
+        new_max_seconds_per_step = min(max_seconds_per_step, time_remaining)
+
+        # The message fragments below must be parenthesized: without the
+        # parentheses only the first fragment is assigned and the rest are
+        # discarded as stand-alone expression statements.
+        if attempted_to_use_stable_baselines(result.output):
+            observation = (
+                "The action has been terminated because it used the `stable_baselines` "
+                "library, which is not allowed. Please write your implementation from scratch."
+            )
+        elif time_remaining > 0:
+            observation = env.execute(action, max_seconds_per_step=new_max_seconds_per_step)
+        else:
+            observation = (
+                "Time's up! Your attempt has now exceeded the maximum time limit "
+                f"of {max_time} seconds. The last action attempted was not executed, "
+                "and your current solution will be graded."
+            )
+
+        new_messages = task_state.messages + [
+            Message(
+                role="assistant",
+                content=result.output,
+            ),
+            Message(
+                role="system",
+                content=f"Observation:\n\n```\n{observation}\n```",
+            ),
+        ]
+
+        # History entries are plain dicts, kept in that form on purpose so
+        # existing consumers of `history_steps` keep working.
+        new_history_steps = task_state.current_state.history_steps + (
+            {
+                "step_idx": step,
+                "action": {
+                    "Action": action.name,
+                    "Action Input": json.dumps(action.args, indent=4),
+                },
+                "observation": observation,
+            },
+        )
+
+        new_task_state_metadata = replace(
+            task_state.current_state,
+            history_steps=new_history_steps,
+        )
+
+        task_state = TaskState(
+            task_description=task_state.task_description,
+            messages=new_messages,
+            current_state=new_task_state_metadata,
+        )
+
+        logger.info(f"\n\nObservation:\n```\n{observation}\n```\n")
+
+        env.save(step)
+
+        if env.is_done():
+            break
+
+    env.save("final")
+
+    return grade_submission(log_dir=log_dir, task_name=task_name)
+
+
+def attempted_to_use_stable_baselines(s: str) -> bool:
+    """Return True if `s` contains both "stable" and "baseline", ignoring case.
+
+    This is a deliberately broad heuristic: the two words need not be
+    adjacent, so unrelated text mentioning both also matches.
+    """
+    lowered = s.lower()
+    return all(keyword in lowered for keyword in ("stable", "baseline"))