Skip to content

Commit ace7ca9

Browse files
committed
Update readme with arxiv url + catch agentic typing issue with tool use
1 parent e13b772 commit ace7ca9

File tree

3 files changed

+15
-12
lines changed

3 files changed

+15
-12
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ BixBench tests AI agents' ability to:
7979
- Perform long, multi-step computational analyses
8080
- Interpret nuanced results in the context of a research question
8181

82-
You can find the BixBench dataset in [Hugging Face](https://huggingface.co/datasets/futurehouse/BixBench), the paper [here](https://storage.googleapis.com/bixbench-results/BixBench.pdf), and the blog post [here](https://futurehouse.org/blog/bixbench/).
82+
You can find the BixBench dataset in [Hugging Face](https://huggingface.co/datasets/futurehouse/BixBench), the paper [here](https://arxiv.org/abs/2503.00096), and the blog post [here](https://www.futurehouse.org/research-announcements/bixbench).
8383

8484
### Running BixBench Evaluations
8585

src/fhda/notebook_env.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,12 @@ async def edit_cell(self, contents: str, idx: int | None = None) -> str:
225225
then appends a new cell.
226226
"""
227227
try:
228+
# The agent sometimes passes idx as a string instead of an int: coerce it, and treat an unparseable value as None so a new cell is appended
229+
if idx is not None:
230+
try:
231+
idx = int(idx)
232+
except (ValueError, TypeError):
233+
idx = None
228234
if idx is None or idx >= len(self.state.cells):
229235
new_cell = nbformat.v4.new_code_cell(source=contents)
230236
self.state.cells.append(new_cell)

tutorial/example.ipynb

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
"metadata": {},
77
"outputs": [],
88
"source": [
9-
"\n",
109
"import os\n",
1110
"import hashlib\n",
1211
"import shutil\n",
@@ -20,7 +19,7 @@
2019
"from fhda.data_analysis_env import DataAnalysisEnv\n",
2120
"from fhda.notebook_env import NBEnvironment\n",
2221
"from fhda.utils import NBLanguage\n",
23-
"from fhda import prompts\n"
22+
"from fhda import prompts"
2423
]
2524
},
2625
{
@@ -39,7 +38,8 @@
3938
"# This image includes many bioinformatics and data science packages\n",
4039
"os.environ[\"use_docker\"] = \"False\"\n",
4140
"\n",
42-
"def setup_data_analysis_env(query: str, dataset_folder: Path): \n",
41+
"\n",
42+
"def setup_data_analysis_env(query: str, dataset_folder: Path):\n",
4343
" # Hash the task to get a unique identifier\n",
4444
" task_hash = hashlib.sha256(query.encode()).hexdigest()\n",
4545
" trajectory_path = Path(\"tmp_results_dir\") / f\"{task_hash}-{time.time()}\"\n",
@@ -50,9 +50,7 @@
5050
" if item.is_file():\n",
5151
" shutil.copy2(item, trajectory_path)\n",
5252
" elif item.is_dir():\n",
53-
" shutil.copytree(\n",
54-
" item, trajectory_path / item.name, dirs_exist_ok=True\n",
55-
" )\n",
53+
" shutil.copytree(item, trajectory_path / item.name, dirs_exist_ok=True)\n",
5654
"\n",
5755
" # Augment incoming task with CoT instructions\n",
5856
" augmented_task = f\"\"\"\\\n",
@@ -66,7 +64,7 @@
6664
" {prompts.GENERAL_NOTEBOOK_GUIDELINES}\"\"\"\n",
6765
"\n",
6866
" # This can be R or PYTHON in Docker or with a local kernel if you have R installed\n",
69-
" language = NBLanguage.PYTHON \n",
67+
" language = NBLanguage.PYTHON\n",
7068
" if language == NBLanguage.R:\n",
7169
" augmented_task += f\"\\n{prompts.R_OUTPUT_RECOMMENDATION_PROMPT}\"\n",
7270
"\n",
@@ -144,7 +142,7 @@
144142
" agent_type=\"ReActAgent\",\n",
145143
" agent_kwargs={\n",
146144
" \"llm_model\": {\n",
147-
" \"model\": \"claude-3-7-sonnet-20250219\", # any litellm supported model will work\n",
145+
" \"model\": \"claude-3-7-sonnet-20250219\", # any litellm supported model will work\n",
148146
" \"parallel_tool_calls\": False,\n",
149147
" \"num_retries\": 5,\n",
150148
" \"temperature\": 1.0,\n",
@@ -983,7 +981,7 @@
983981
"trajectory = result[0]\n",
984982
"# You can inspect each step in the trajectory and see what the agent's reasoning was,\n",
985983
"# what tool it called, and what the observation was\n",
986-
"for c,step in enumerate(trajectory.steps):\n",
984+
"for c, step in enumerate(trajectory.steps):\n",
987985
" print(f\"Timestep {c}\")\n",
988986
" print(f\"Done: {step.done}\")\n",
989987
" print(\"Agent Reasoning:\")\n",
@@ -1040,8 +1038,7 @@
10401038
" break\n",
10411039
"\n",
10421040
" agent_state = next_agent_state\n",
1043-
" obs = next_obs\n",
1044-
"\n"
1041+
" obs = next_obs"
10451042
]
10461043
}
10471044
],

0 commit comments

Comments
 (0)