Skip to content

Commit 72e51b3

Browse files
authored
Platform eval (#5)
1 parent 63acd9a commit 72e51b3

File tree

8 files changed

+624
-213
lines changed

8 files changed

+624
-213
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ dependencies = [
2222
"google-auth==2.38.0",
2323
"google-cloud-storage==3.0.0",
2424
"google-cloud-secret-manager==2.23.0",
25-
"crow-client==0.3.6",
25+
"crow-client>=0.3.13",
2626
"jupyter==1.1.1",
2727
"nbconvert==7.16.6",
2828
"notebook==7.3.2",
@@ -52,4 +52,4 @@ run_expt = 'scripts.configurable:_run_expt'
5252
package-dir = {"" = "src"}
5353

5454
[tool.setuptools.packages.find]
55-
where = ["src"]
55+
where = ["src"]

src/fhda/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,5 @@
2121
DATA_STORAGE_PATH = Path("/storage")
2222

2323
EVAL = bool(os.getenv("EVAL", "false").lower() == "true")
24+
25+
VALID_FROM_TASK_KWARGS = ["run_notebook_on_edit"]

src/fhda/data_analysis_env.py

Lines changed: 48 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import hashlib
22
import logging
33
import shutil
4+
import json
45
from typing import Any, cast
56
import time
67
from aviary.core import (
@@ -80,8 +81,29 @@ async def submit_answer(self, answer: str) -> str: # type: ignore[override]
8081
logger.info("Answer: %s", answer)
8182
return answer
8283

84+
def export_frame(self) -> Frame:
85+
return Frame(
86+
state={
87+
"last_action": self.state.actions[-1],
88+
"answer": self.state.answer,
89+
"done": self.state.done,
90+
"total_reward": self.state.total_reward,
91+
"nb_state": self.state.nb,
92+
"nb_state_html": nb_to_html(self.state.nb),
93+
"nb_runtime_errors": self.state.notebook_runtime_errors,
94+
},
95+
info={
96+
"eval_mode": self.eval_mode,
97+
"language": self.state.language,
98+
"problem": self.problem,
99+
"problem_id": self.problem_id,
100+
},
101+
)
102+
83103
@classmethod
84-
def eval_from_task(cls, task: str, gcs_artifact_path: str) -> "DataAnalysisEnv":
104+
def eval_from_task(
105+
cls, task: str, gcs_artifact_path: str, environment_config: str | None = None
106+
) -> "DataAnalysisEnv":
85107
"""
86108
Used for evaluations via crow jobs.
87109
@@ -90,7 +112,6 @@ def eval_from_task(cls, task: str, gcs_artifact_path: str) -> "DataAnalysisEnv":
90112
gcs_artifact_path: The path to the GCS artifact – required for evaluation on crow jobs
91113
"""
92114
logger.info("Using the eval_from_task method")
93-
94115
# Create temporary directory in GCP mounted storage volume
95116
task_hash = hashlib.sha256(task.encode()).hexdigest()
96117
trajectory_path = cfg.DATA_STORAGE_PATH / f"{task_hash}-{time.time()}"
@@ -124,45 +145,44 @@ def eval_from_task(cls, task: str, gcs_artifact_path: str) -> "DataAnalysisEnv":
124145

125146
@classmethod
126147
def from_task(
127-
cls, task: str, gcs_artifact_path: str | None = None
148+
cls,
149+
task: str,
150+
gcs_artifact_path: str | None = None,
151+
environment_config: str | None = None,
128152
) -> "DataAnalysisEnv":
129153
"""
130154
Perform data analysis on a user query.
131155
132156
Args:
133-
task: The user query structured as <data_path> | <query>
134-
135-
eg "CaspuleFolder-a7812fg | How many genes are differentially expressed between the two conditions?"
157+
task: The user query
158+
gcs_artifact_path: The path to the GCS artifact – required for evaluation on crow jobs
159+
environment_config: A JSON string of environment configuration
136160
"""
137161
logger.info("User task: %s", task)
138162
logger.info("GCS artifact path: %s", gcs_artifact_path)
163+
logger.info("environment_config: %s", environment_config)
139164
if cfg.EVAL:
140165
return cls.eval_from_task(task, gcs_artifact_path) # type: ignore
141166

142167
if (
143-
gcs_artifact_path
168+
not gcs_artifact_path
144169
): # A gcs_artifact_path is required: the files live in a job-specific directory of the GCS bucket
145-
trajectory_path = cfg.DATA_STORAGE_PATH / gcs_artifact_path
146-
nb_path = trajectory_path / NBEnvironment.NOTEBOOK_NAME
147-
query = task
148-
task_hash = gcs_artifact_path
170+
raise NotImplementedError(
171+
"Running crow jobs without gcs_artifact_path is not supported"
172+
)
173+
trajectory_path = cfg.DATA_STORAGE_PATH / gcs_artifact_path
174+
nb_path = trajectory_path / NBEnvironment.NOTEBOOK_NAME
175+
query = task
176+
task_hash = gcs_artifact_path
177+
if environment_config:
178+
kwargs = {
179+
k: v
180+
for k, v in json.loads(environment_config).items()
181+
if k in cfg.VALID_FROM_TASK_KWARGS
182+
}
149183
else:
150-
# Extract data path and query from task
151-
data_path, query = task.split("|")
152-
# Hash the task to get a unique identifier
153-
task_hash = hashlib.sha256(task.encode()).hexdigest()
154-
# Create temporary directory in GCP mounted storage volume
155-
trajectory_path = cfg.DATA_STORAGE_PATH / f"{task_hash}-{time.time()}"
156-
trajectory_path.mkdir(parents=True, exist_ok=True)
157-
nb_path = trajectory_path / NBEnvironment.NOTEBOOK_NAME
158-
# Copy task data to trajectory path
159-
for item in (cfg.DATA_STORAGE_PATH / data_path).iterdir():
160-
if item.is_file():
161-
shutil.copy2(item, trajectory_path)
162-
elif item.is_dir():
163-
shutil.copytree(
164-
item, trajectory_path / item.name, dirs_exist_ok=True
165-
)
184+
kwargs = {}
185+
logger.info("Filtered kwargs: %s", kwargs)
166186

167187
# Augment incoming task with CoT instructions
168188
augmented_task = f"""\
@@ -215,23 +235,5 @@ def from_task(
215235
language=language,
216236
system_prompt=prompts.CAPSULE_SYSTEM_PROMPT_QUERY,
217237
use_tmp_work_dir=False,
218-
)
219-
220-
def export_frame(self) -> Frame:
221-
return Frame(
222-
state={
223-
"last_action": self.state.actions[-1],
224-
"answer": self.state.answer,
225-
"done": self.state.done,
226-
"total_reward": self.state.total_reward,
227-
"nb_state": self.state.nb,
228-
"nb_state_html": nb_to_html(self.state.nb),
229-
"nb_runtime_errors": self.state.notebook_runtime_errors,
230-
},
231-
info={
232-
"eval_mode": self.eval_mode,
233-
"language": self.state.language,
234-
"problem": self.problem,
235-
"problem_id": self.problem_id,
236-
},
238+
**kwargs,
237239
)

src/fhda/notebook_env.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ def __init__(
125125
use_tmp_work_dir: bool = True,
126126
language: utils.NBLanguage = utils.NBLanguage.PYTHON,
127127
allow_download_from_gcs: bool = False,
128+
run_notebook_on_edit: bool = False,
128129
):
129130
"""Initialize a notebook environment.
130131
@@ -139,6 +140,8 @@ def __init__(
139140
allow_download_from_gcs: If True, the environment will expose a tool to download
140141
directories from the aviary-storage GCS bucket. Should only be enabled if the
141142
task requires data on GCS. Disabled by default.
143+
run_notebook_on_edit: If True, the whole notebook will be rerun
144+
after each edit. If False (the default), only the cell that was edited will be rerun.
142145
"""
143146
self.work_dir = Path(work_dir)
144147
self.nb_path = Path(nb_path) if nb_path else self.work_dir / self.NOTEBOOK_NAME
@@ -147,6 +150,7 @@ def __init__(
147150
self.language = language
148151
self.allow_download_from_gcs = allow_download_from_gcs
149152
self.use_docker = cfg.USE_DOCKER
153+
self.run_notebook_on_edit = run_notebook_on_edit
150154

151155
async def reset(self) -> tuple[Messages, list[Tool]]:
152156
nb_path, work_dir = self._set_work_dir()
@@ -218,7 +222,7 @@ async def edit_cell(self, contents: str, idx: int | None = None) -> str:
218222
219223
ONLY CODE CELLS ARE SUPPORTED. Do not attempt to write Markdown or raw text,
220224
though you are permitted (and encouraged) to write comments in the code cells.
221-
The notebook will be automatically rerun if a successful edit is made.
225+
The cell will be automatically rerun if a successful edit is made.
222226
223227
Args:
224228
contents: Cell contents to insert. We assume the cell is a code block.
@@ -242,7 +246,12 @@ async def edit_cell(self, contents: str, idx: int | None = None) -> str:
242246
return f"Edited cell #{idx}."
243247
finally:
244248
self.state.save_nb()
245-
await self.run_notebook()
249+
if self.run_notebook_on_edit:
250+
args = {}
251+
else:
252+
idx = len(self.state.cells) - 1 if idx is None else idx
253+
args = {"cell_idx": idx}
254+
await self.run_notebook(**args)
246255

247256
def list_workdir(self) -> str:
248257
"""Recursively lists the contents of the working directory.
@@ -283,12 +292,14 @@ def _list_dir(self, path: Path) -> TListDir:
283292
cast(list, index["files"]).append(item.name)
284293
return index
285294

286-
async def run_notebook(self) -> str:
295+
async def run_notebook(self, cell_idx: int | None = None) -> str:
287296
"""Run the entire notebook sequentially."""
288297
logger.debug("Starting notebook execution")
289298
if self.use_docker:
299+
if cell_idx is not None:
300+
raise ValueError("Cell index not supported for Docker")
290301
return await self._run_notebook_docker()
291-
return await self._run_notebook_local()
302+
return await self._run_notebook_local(cell_idx=cell_idx)
292303

293304
async def _run_notebook_docker(self) -> str:
294305
"""Run notebook using Docker container."""
@@ -325,12 +336,12 @@ async def _run_notebook_docker(self) -> str:
325336
self.state.reload_nb()
326337
return "Executed all cells."
327338

328-
async def _run_notebook_local(self) -> str:
339+
async def _run_notebook_local(self, cell_idx: int | None = None) -> str:
329340
"""Run notebook using local kernel."""
330341
client = self.state.kernel_manager.client()
331342
client.start_channels()
332343
error_messages = await utils.nbformat_run_notebook(
333-
cells=self.state.cells, client=client
344+
cells=self.state.cells, client=client, cell_idx=cell_idx
334345
)
335346
if error_messages:
336347
self.state.notebook_runtime_errors.extend(error_messages)

src/fhda/utils.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,9 @@ def encode_image_to_base64(image: str) -> str:
148148

149149

150150
async def nbformat_run_notebook(
151-
cells: Iterable[nbformat.NotebookNode], client: "AsyncKernelClient"
151+
cells: Iterable[nbformat.NotebookNode],
152+
client: "AsyncKernelClient",
153+
cell_idx: int | None = None,
152154
) -> list[str]:
153155
"""Execute notebook cells using a kernel client and collect outputs.
154156
@@ -163,9 +165,13 @@ async def nbformat_run_notebook(
163165
List of error messages from cells that raised an error
164166
"""
165167
error_messages = []
168+
logger.debug(f"Running notebook with cell_idx: {cell_idx}")
166169
try:
167170
logger.debug("Beginning cell execution")
168171
for idx, cell in enumerate(cells):
172+
if cell_idx is not None and idx != cell_idx:
173+
logger.debug(f"Skipping cell {idx} because cell_idx is {cell_idx}")
174+
continue
169175
if cell.cell_type == "code":
170176
logger.debug(f"Executing code cell {idx}")
171177
cell.outputs = [] # Initialize empty outputs list

src/scripts/deploy.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
)
1212
from crow_client.models.app import TaskQueuesConfig
1313

14-
EVAL = True
14+
EVAL = False
1515

1616
ENV_VARS = {
1717
"OPENAI_API_KEY": os.environ["OPENAI_API_KEY"],
@@ -33,27 +33,28 @@
3333
CrowDeploymentConfig(
3434
requirements_path=Path("pyproject.toml"),
3535
path=Path("src"),
36-
name="bixbench-crow" if EVAL else "data-analysis-crow",
36+
name="bixbench-crow2" if EVAL else "data-analysis-crow",
3737
environment="src.fhda.data_analysis_env.DataAnalysisEnv",
3838
environment_variables=ENV_VARS,
3939
agent="ldp.agent.ReActAgent",
4040
container_config=CONTAINER_CONFIG,
4141
force=True,
4242
frame_paths=frame_paths,
43-
timeout=1200,
43+
timeout=3600,
4444
task_queues_config=TaskQueuesConfig(
45-
name="bixbench-crow" if EVAL else "data-analysis-crow",
45+
name="bixbench-crow2" if EVAL else "data-analysis-crow",
4646
max_running_jobs=300,
4747
),
4848
),
4949
]
5050

5151
if __name__ == "__main__":
5252
client = CrowClient(
53-
stage=Stage.from_string(os.environ.get("CROW_ENV", "DEV")),
53+
# stage=Stage.from_string(os.environ.get("CROW_ENV", ENV_VARS["STAGE"])),
54+
stage=Stage.from_string(os.environ.get("CROW_ENV", "LOCAL")),
5455
organization="FutureHouse",
5556
auth_type=AuthType.API_KEY,
56-
api_key=os.environ["CROW_API_KEY"],
57+
api_key=os.environ[f"CROW_API_KEY_{ENV_VARS['STAGE']}"],
5758
)
5859
for crow in CROWS_TO_DEPLOY:
5960
try:

0 commit comments

Comments
 (0)