Update README with arXiv URL + coerce string cell index to int in edit_cell (agent tool-call typing issue)

ludomitch · ludomitch · commit ace7ca90c882 · 2025-03-04T11:33:26.000-08:00
diff --git a/README.md b/README.md
@@ -79,7 +79,7 @@ BixBench tests AI agents' ability to:
 - Perform long, multi-step computational analyses
 - Interpret nuanced results in the context of a research question
 
-You can find the BixBench dataset in [Hugging Face](https://huggingface.co/datasets/futurehouse/BixBench), the paper [here](https://storage.googleapis.com/bixbench-results/BixBench.pdf), and the blog post [here](https://futurehouse.org/blog/bixbench/).
+You can find the BixBench dataset on [Hugging Face](https://huggingface.co/datasets/futurehouse/BixBench), the paper [here](https://arxiv.org/abs/2503.00096), and the blog post [here](https://www.futurehouse.org/research-announcements/bixbench).
 
 ### Running BixBench Evaluations
 
diff --git a/src/fhda/notebook_env.py b/src/fhda/notebook_env.py
@@ -225,6 +225,12 @@ async def edit_cell(self, contents: str, idx: int | None = None) -> str:
                 then appends a new cell.
         """
         try:
+            # Sometimes the agent will try to enter a string instead of an int
+            if idx is not None:
+                try:
+                    idx = int(idx)
+                except (ValueError, TypeError):
+                    idx = None
             if idx is None or idx >= len(self.state.cells):
                 new_cell = nbformat.v4.new_code_cell(source=contents)
                 self.state.cells.append(new_cell)
diff --git a/tutorial/example.ipynb b/tutorial/example.ipynb
@@ -6,7 +6,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "import os\n",
     "import hashlib\n",
     "import shutil\n",
@@ -20,7 +19,7 @@
     "from fhda.data_analysis_env import DataAnalysisEnv\n",
     "from fhda.notebook_env import NBEnvironment\n",
     "from fhda.utils import NBLanguage\n",
-    "from fhda import prompts\n"
+    "from fhda import prompts"
    ]
   },
   {
@@ -39,7 +38,8 @@
     "# This image includes many bioinformatics and data science packages\n",
     "os.environ[\"use_docker\"] = \"False\"\n",
     "\n",
-    "def setup_data_analysis_env(query: str, dataset_folder: Path):   \n",
+    "\n",
+    "def setup_data_analysis_env(query: str, dataset_folder: Path):\n",
     "    # Hash the task to get a unique identifier\n",
     "    task_hash = hashlib.sha256(query.encode()).hexdigest()\n",
     "    trajectory_path = Path(\"tmp_results_dir\") / f\"{task_hash}-{time.time()}\"\n",
@@ -50,9 +50,7 @@
     "        if item.is_file():\n",
     "            shutil.copy2(item, trajectory_path)\n",
     "        elif item.is_dir():\n",
-    "            shutil.copytree(\n",
-    "                item, trajectory_path / item.name, dirs_exist_ok=True\n",
-    "            )\n",
+    "            shutil.copytree(item, trajectory_path / item.name, dirs_exist_ok=True)\n",
     "\n",
     "    # Augment incoming task with CoT instructions\n",
     "    augmented_task = f\"\"\"\\\n",
@@ -66,7 +64,7 @@
     "    {prompts.GENERAL_NOTEBOOK_GUIDELINES}\"\"\"\n",
     "\n",
     "    # This can be R or PYTHON in Docker or with a local kernel if you have R installed\n",
-    "    language = NBLanguage.PYTHON  \n",
+    "    language = NBLanguage.PYTHON\n",
     "    if language == NBLanguage.R:\n",
     "        augmented_task += f\"\\n{prompts.R_OUTPUT_RECOMMENDATION_PROMPT}\"\n",
     "\n",
@@ -144,7 +142,7 @@
     "    agent_type=\"ReActAgent\",\n",
     "    agent_kwargs={\n",
     "        \"llm_model\": {\n",
-    "            \"model\": \"claude-3-7-sonnet-20250219\", # any litellm supported model will work\n",
+    "            \"model\": \"claude-3-7-sonnet-20250219\",  # any litellm supported model will work\n",
     "            \"parallel_tool_calls\": False,\n",
     "            \"num_retries\": 5,\n",
     "            \"temperature\": 1.0,\n",
@@ -983,7 +981,7 @@
     "trajectory = result[0]\n",
     "# You can inspect each step in the trajectory and see what the agent's reasoning was,\n",
     "# what tool it called, and what the observation was\n",
-    "for c,step in enumerate(trajectory.steps):\n",
+    "for c, step in enumerate(trajectory.steps):\n",
     "    print(f\"Timestep {c}\")\n",
     "    print(f\"Done: {step.done}\")\n",
     "    print(\"Agent Reasoning:\")\n",
@@ -1040,8 +1038,7 @@
     "        break\n",
     "\n",
     "    agent_state = next_agent_state\n",
-    "    obs = next_obs\n",
-    "\n"
+    "    obs = next_obs"
    ]
   }
  ],