@@ -34,15 +34,15 @@ limitations under the License.
โจ **Simplicity**: the logic for agents fits in ~1,000 lines of code (see [agents.py](https://github.com/huggingface/smolagents/blob/main/src/smolagents/agents.py)). We kept abstractions to their minimal shape above raw code!
-๐งโ๐ป **First-class support for Code Agents**. Our [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) writes its actions in code (as opposed to "agents being used to write code"). To make it secure, we support executing in sandboxed environments via [E2B](https://e2b.dev/).
+๐งโ๐ป **First-class support for Code Agents**. Our [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) writes its actions in code (as opposed to "agents being used to write code"). To make it secure, we support executing in sandboxed environments via [E2B](https://e2b.dev/) or via Docker.
-๐ค **Hub integrations**: you can [share/pull tools to/from the Hub](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_hub), and more is to come!
+๐ค **Hub integrations**: you can [share/pull tools or agents to/from the Hub](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_hub) for instant sharing of the most efficient agents!
๐ **Model-agnostic**: smolagents supports any LLM. It can be a local `transformers` or `ollama` model, one of [many providers on the Hub](https://huggingface.co/blog/inference-providers), or any model from OpenAI, Anthropic and many others via our [LiteLLM](https://www.litellm.ai/) integration.
๐๏ธ **Modality-agnostic**: Agents support text, vision, video, even audio inputs! Cf [this tutorial](https://huggingface.co/docs/smolagents/examples/web_browser) for vision.
-๐ ๏ธ **Tool-agnostic**: you can use tools from [LangChain](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_langchain), [Anthropic's MCP](https://huggingface.co/docs/smolagents/reference/tools#smolagents.ToolCollection.from_mcp), you can even use a [Hub Space](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_space) as a tool.
+๐ ๏ธ **Tool-agnostic**: you can use tools from [LangChain](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_langchain), [MCP](https://huggingface.co/docs/smolagents/reference/tools#smolagents.ToolCollection.from_mcp), you can even use a [Hub Space](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_space) as a tool.
Full documentation can be found [here](https://huggingface.co/docs/smolagents/index).
@@ -57,9 +57,9 @@ pip install smolagents
```
Then define your agent, give it the tools it needs and run it!
```py
-from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel
+from smolagents import CodeAgent, DuckDuckGoSearchTool, InferenceClientModel
-model = HfApiModel()
+model = InferenceClientModel()
agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model)
agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?")
@@ -67,7 +67,7 @@ agent.run("How many seconds would it take for a leopard at full speed to run thr
https://github.com/user-attachments/assets/cd0226e2-7479-4102-aea0-57c22ca47884
-You can even share your agent to hub:
+You can even share your agent to the Hub, as a Space repository:
```py
agent.push_to_hub("m-ric/my_agent")
@@ -77,12 +77,12 @@ agent.push_to_hub("m-ric/my_agent")
Our library is LLM-agnostic: you could switch the example above to any inference provider.
-HfApiModel, gateway for 4 inference providers
+InferenceClientModel, gateway for all inference providers supported on HF
```py
-from smolagents import HfApiModel
+from smolagents import InferenceClientModel
-model = HfApiModel(
+model = InferenceClientModel(
model_id="deepseek-ai/DeepSeek-R1",
provider="together",
)
@@ -95,7 +95,7 @@ model = HfApiModel(
from smolagents import LiteLLMModel
model = LiteLLMModel(
- "anthropic/claude-3-5-sonnet-latest",
+ model_id="anthropic/claude-3-5-sonnet-latest",
temperature=0.2,
api_key=os.environ["ANTHROPIC_API_KEY"]
)
@@ -143,6 +143,18 @@ model = AzureOpenAIServerModel(
)
```
+
+Amazon Bedrock models
+
+```py
+import os
+from smolagents import AmazonBedrockServerModel
+
+model = AmazonBedrockServerModel(
+ model_id = os.environ.get("AMAZON_BEDROCK_MODEL_ID")
+)
+```
+
## CLI
@@ -151,7 +163,7 @@ You can run agents from CLI using two commands: `smolagent` and `webagent`.
`smolagent` is a generalist command to run a multi-step `CodeAgent` that can be equipped with various tools.
```bash
-smolagent "Plan a trip to Tokyo, Kyoto and Osaka between Mar 28 and Apr 7." --model-type "HfApiModel" --model-id "Qwen/Qwen2.5-Coder-32B-Instruct" --imports "pandas numpy" --tools "web_search"
+smolagent "Plan a trip to Tokyo, Kyoto and Osaka between Mar 28 and Apr 7." --model-type "InferenceClientModel" --model-id "Qwen/Qwen2.5-Coder-32B-Instruct" --imports "pandas numpy" --tools "web_search"
```
Meanwhile `webagent`ย is a specific web-browsing agent using [helium](https://github.com/mherrmann/helium) (read more [here](https://github.com/huggingface/smolagents/blob/main/src/smolagents/vision_web_browser.py)).
@@ -201,7 +213,7 @@ Writing actions as code snippets is demonstrated to work better than the current
Especially, since code execution can be a security concern (arbitrary code execution!), we provide options at runtime:
- a secure python interpreter to run code more safely in your environment (more secure than raw code execution but still risky)
- - a sandboxed environment using [E2B](https://e2b.dev/) (removes the risk to your own system).
+ - a sandboxed environment using [E2B](https://e2b.dev/) or Docker (removes the risk to your own system).
On top of this [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) class, we still support the standard [`ToolCallingAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.ToolCallingAgent) that writes actions as JSON/text blobs. But we recommend always using `CodeAgent`.
@@ -216,7 +228,7 @@ By the way, why use a framework at all? Well, because a big part of this stuff i
We've created [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) instances with some leading models, and compared them on [this benchmark](https://huggingface.co/datasets/m-ric/agents_medium_benchmark_2) that gathers questions from a few different benchmarks to propose a varied blend of challenges.
-[Find the benchmarking code here](https://github.com/huggingface/smolagents/blob/main/examples/benchmark.ipynb) for more detail on the agentic setup used, and see a comparison of using LLMs code agents compared to vanilla (spoilers: code agents works better).
+[Find the benchmarking code here](https://github.com/huggingface/smolagents/blob/main/examples/smolagents_benchmark/run.py) for more detail on the agentic setup used, and see a comparison of using LLMs as code agents versus vanilla tool calling (spoiler: code agents work better).
@@ -224,6 +236,14 @@ We've created [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/age
This comparison shows that open-source models can now take on the best closed models!
+## Security
+
+Security is a critical consideration when working with code-executing agents. Our library provides:
+- Sandboxed execution options using [E2B](https://e2b.dev/) or Docker
+- Best practices for running agent code securely
+
+For security policies, vulnerability reporting, and more information on secure agent execution, please see our [Security Policy](SECURITY.md).
+
## Contribute
Everyone is welcome to contribute, get started with our [contribution guide](https://github.com/huggingface/smolagents/blob/main/CONTRIBUTING.md).
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 000000000..0a55a5631
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,9 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+To report a security vulnerability, please contact: security@huggingface.co
+
+## Learning More About Security
+
+To learn more about running agents more securely, please see the [Secure Code Execution tutorial](docs/source/en/tutorials/secure_code_execution.mdx) which covers sandboxing with E2B and Docker.
diff --git a/docs/README.md b/docs/README.md
index be716450b..af4b61c6c 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -121,10 +121,6 @@ Adding a new tutorial or section is done in two steps:
Make sure to put your new file under the proper section. If you have a doubt, feel free to ask in a Github Issue or PR.
-### Translating
-
-When translating, refer to the guide at [./TRANSLATING.md](https://github.com/huggingface/smolagents/blob/main/docs/TRANSLATING.md).
-
### Writing source documentation
Values that should be put in `code` should either be surrounded by backticks: \`like so\`. Note that argument names
@@ -271,4 +267,5 @@ is to be used in inference and also include the expected (ideally sensible)
output.
Often, readers will try out the example before even going through the function
or class definitions. Therefore, it is of utmost importance that the example
-works as expected.
\ No newline at end of file
+works as expected.
+
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index c1efd31dc..c5c2a9a93 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -13,7 +13,7 @@
- local: tutorials/tools
title: ๐ ๏ธ Tools - in-depth guide
- local: tutorials/secure_code_execution
- title: ๐ก๏ธ Secure your code execution with E2B
+ title: ๐ก๏ธ Secure code execution
- local: tutorials/memory
title: ๐ Manage your agent's memory
- title: Conceptual guides
@@ -27,7 +27,7 @@
- local: examples/text_to_sql
title: Self-correcting Text-to-SQL
- local: examples/rag
- title: Master you knowledge base with agentic RAG
+ title: Master your knowledge base with agentic RAG
- local: examples/multiagents
title: Orchestrate a multi-agent system
- local: examples/web_browser
diff --git a/docs/source/en/conceptual_guides/intro_agents.mdx b/docs/source/en/conceptual_guides/intro_agents.mdx
index ca5ad31c5..ef76b103e 100644
--- a/docs/source/en/conceptual_guides/intro_agents.mdx
+++ b/docs/source/en/conceptual_guides/intro_agents.mdx
@@ -1,18 +1,3 @@
-
# Introduction to Agents
## ๐คย What are agents?
@@ -28,13 +13,14 @@ Note that with this definition, "agent" is not a discrete, 0 or 1 definition: in
See in the table below how agency can vary across systems:
-| Agency Level | Description | How that's called | Example Pattern |
-| ------------ | ------------------------------------------------------- | ----------------- | -------------------------------------------------- |
-| โโโ | LLM output has no impact on program flow | Simple Processor | `process_llm_output(llm_response)` |
-| โ โโ | LLM output determines an if/else switch | Router | `if llm_decision(): path_a() else: path_b()` |
-| โ โ โ | LLM output determines function execution | Tool Caller | `run_function(llm_chosen_tool, llm_chosen_args)` |
-| โ โ โ | LLM output controls iteration and program continuation | Multi-step Agent | `while llm_should_continue(): execute_next_step()` |
-| โ โ โ | One agentic workflow can start another agentic workflow | Multi-Agent | `if llm_trigger(): execute_agent()` |
+| Agency Level | Description | Short name | Example Code |
+| ------------ | ------------------------------------------------------ | ---------------- | -------------------------------------------------- |
+| โโโ | LLM output has no impact on program flow | Simple processor | `process_llm_output(llm_response)` |
+| โ โโ | LLM output controls an if/else switch | Router | `if llm_decision(): path_a() else: path_b()` |
+| โ โ โ | LLM output controls function execution | Tool call | `run_function(llm_chosen_tool, llm_chosen_args)` |
+| โ โ โ | LLM output controls iteration and program continuation | Multi-step Agent | `while llm_should_continue(): execute_next_step()` |
+| โ โ โ | One agentic workflow can start another agentic workflow | Multi-Agent | `if llm_trigger(): execute_agent()` |
+| โ โ โ | LLM acts in code, can define its own tools / start other agents | Code Agents | `def custom_tool(args): ...` |
The multi-step agent has this code structure:
diff --git a/docs/source/en/conceptual_guides/react.mdx b/docs/source/en/conceptual_guides/react.mdx
index b86c438e2..6358c78fd 100644
--- a/docs/source/en/conceptual_guides/react.mdx
+++ b/docs/source/en/conceptual_guides/react.mdx
@@ -1,18 +1,3 @@
-
# How do multi-step agents work?
The ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) is currently the main approach to building agents.
diff --git a/docs/source/en/examples/multiagents.mdx b/docs/source/en/examples/multiagents.mdx
index 4f41fe8e6..4e43f99f5 100644
--- a/docs/source/en/examples/multiagents.mdx
+++ b/docs/source/en/examples/multiagents.mdx
@@ -1,18 +1,3 @@
-
# Orchestrate a multi-agent system ๐ค๐ค๐ค
[[open-in-colab]]
@@ -39,19 +24,19 @@ Let's set up this system.
Run the line below to install the required dependencies:
-```
-!pip install markdownify duckduckgo-search smolagents --upgrade -q
+```py
+! pip install markdownify duckduckgo-search smolagents --upgrade -q
```
-Let's login in order to call the HF Inference API:
+Let's log in to HF in order to call Inference Providers:
-```
+```py
from huggingface_hub import login
login()
```
-โก๏ธ Our agent will be powered by [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) using `HfApiModel` class that uses HF's Inference API: the Inference API allows to quickly and easily run any OS model.
+โก๏ธ Our agent will be powered by [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) using `InferenceClientModel` class that uses HF's Inference API: the Inference API allows to quickly and easily run any OS model.
_Note:_ The Inference API hosts models based on various criteria, and deployed models may be updated or replaced without prior notice. Learn more about it [here](https://huggingface.co/docs/api-inference/supported-models).
@@ -123,19 +108,19 @@ Which configuration to choose for this agent?
from smolagents import (
CodeAgent,
ToolCallingAgent,
- HfApiModel,
+ InferenceClientModel,
DuckDuckGoSearchTool,
LiteLLMModel,
)
-model = HfApiModel(model_id)
+model = InferenceClientModel(model_id=model_id)
web_agent = ToolCallingAgent(
tools=[DuckDuckGoSearchTool(), visit_webpage],
model=model,
max_steps=10,
- name="search",
- description="Runs web searches for you. Give it your query as an argument.",
+ name="web_search_agent",
+ description="Runs web searches for you.",
)
```
diff --git a/docs/source/en/examples/rag.mdx b/docs/source/en/examples/rag.mdx
index eb1c4c27f..212d38cb7 100644
--- a/docs/source/en/examples/rag.mdx
+++ b/docs/source/en/examples/rag.mdx
@@ -1,18 +1,3 @@
-
# Agentic RAG
[[open-in-colab]]
@@ -37,7 +22,7 @@ Run the line below to install required dependencies:
```bash
!pip install smolagents pandas langchain langchain-community sentence-transformers datasets python-dotenv rank_bm25 --upgrade -q
```
-To call the HF Inference API, you will need a valid token as your environment variable `HF_TOKEN`.
+To call Inference Providers, you will need a valid token as your environment variable `HF_TOKEN`.
We use python-dotenv to load it.
```py
from dotenv import load_dotenv
@@ -127,13 +112,13 @@ The agent will need these arguments upon initialization:
- `model`: the LLM that powers the agent.
Our `model` must be a callable that takes as input a list of messages and returns text. It also needs to accept a stop_sequences argument that indicates when to stop its generation. For convenience, we directly use the HfEngine class provided in the package to get a LLM engine that calls Hugging Face's Inference API.
->[!NOTE] To use a specific model, pass it like this: `HfApiModel("meta-llama/Llama-3.3-70B-Instruct")`. The Inference API hosts models based on various criteria, and deployed models may be updated or replaced without prior notice. Learn more about it [here](https://huggingface.co/docs/api-inference/supported-models).
+>[!NOTE] To use a specific model, pass it like this: `InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct")`. The Inference API hosts models based on various criteria, and deployed models may be updated or replaced without prior notice. Learn more about it [here](https://huggingface.co/docs/api-inference/supported-models).
```py
-from smolagents import HfApiModel, CodeAgent
+from smolagents import InferenceClientModel, CodeAgent
agent = CodeAgent(
- tools=[retriever_tool], model=HfApiModel(), max_steps=4, verbosity_level=2
+ tools=[retriever_tool], model=InferenceClientModel(), max_steps=4, verbosity_level=2
)
```
Upon initializing the CodeAgent, it has been automatically given a default system prompt that tells the LLM engine to process step-by-step and generate tool calls as code snippets, but you could replace this prompt template with your own as needed.
diff --git a/docs/source/en/examples/text_to_sql.mdx b/docs/source/en/examples/text_to_sql.mdx
index 600d8d95c..5cd93479c 100644
--- a/docs/source/en/examples/text_to_sql.mdx
+++ b/docs/source/en/examples/text_to_sql.mdx
@@ -1,18 +1,3 @@
-
# Text-to-SQL
[[open-in-colab]]
@@ -31,7 +16,7 @@ Run the line below to install required dependencies:
```bash
!pip install smolagents python-dotenv sqlalchemy --upgrade -q
```
-To call the HF Inference API, you will need a valid token as your environment variable `HF_TOKEN`.
+To call Inference Providers, you will need a valid token as your environment variable `HF_TOKEN`.
We use python-dotenv to load it.
```py
from dotenv import load_dotenv
@@ -137,14 +122,14 @@ Now let us create an agent that leverages this tool.
We use the `CodeAgent`, which is smolagentsโ main agent class: an agent that writes actions in code and can iterate on previous output according to the ReAct framework.
-The model is the LLM that powers the agent system. `HfApiModel` allows you to call LLMs using HFโs Inference API, either via Serverless or Dedicated endpoint, but you could also use any proprietary API.
+The model is the LLM that powers the agent system. `InferenceClientModel` allows you to call LLMs using HFโs Inference API, either via Serverless or Dedicated endpoint, but you could also use any proprietary API.
```py
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
agent = CodeAgent(
tools=[sql_engine],
- model=HfApiModel("meta-llama/Meta-Llama-3.1-8B-Instruct"),
+ model=InferenceClientModel(model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"),
)
agent.run("Can you give me the name of the client who got the most expensive receipt?")
```
@@ -197,7 +182,7 @@ sql_engine.description = updated_description
agent = CodeAgent(
tools=[sql_engine],
- model=HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct"),
+ model=InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct"),
)
agent.run("Which waiter got more total money from tips?")
diff --git a/docs/source/en/examples/web_browser.mdx b/docs/source/en/examples/web_browser.mdx
index fe2fc67de..1f464be9a 100644
--- a/docs/source/en/examples/web_browser.mdx
+++ b/docs/source/en/examples/web_browser.mdx
@@ -111,11 +111,11 @@ def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
Now let's create our web automation agent:
```python
-from smolagents import HfApiModel
+from smolagents import InferenceClientModel
# Initialize the model
model_id = "meta-llama/Llama-3.3-70B-Instruct" # You can change this to your preferred model
-model = HfApiModel(model_id)
+model = InferenceClientModel(model_id=model_id)
# Create the agent
agent = CodeAgent(
diff --git a/docs/source/en/guided_tour.mdx b/docs/source/en/guided_tour.mdx
index 5eca7fc21..01e247357 100644
--- a/docs/source/en/guided_tour.mdx
+++ b/docs/source/en/guided_tour.mdx
@@ -1,18 +1,3 @@
-
# Agents - Guided tour
[[open-in-colab]]
@@ -25,28 +10,29 @@ To initialize a minimal agent, you need at least these two arguments:
- `model`, a text-generation model to power your agent - because the agent is different from a simple LLM, it is a system that uses a LLM as its engine. You can use any of these options:
- [`TransformersModel`] takes a pre-initialized `transformers` pipeline to run inference on your local machine using `transformers`.
- - [`HfApiModel`] leverages a `huggingface_hub.InferenceClient` under the hood and supports all Inference Providers on the Hub.
+ - [`InferenceClientModel`] leverages a `huggingface_hub.InferenceClient` under the hood and supports all Inference Providers on the Hub: Cerebras, Cohere, Fal, Fireworks, HF-Inference, Hyperbolic, Nebius, Novita, Replicate, SambaNova, Together, and more.
- [`LiteLLMModel`] similarly lets you call 100+ different models and providers through [LiteLLM](https://docs.litellm.ai/)!
- [`AzureOpenAIServerModel`] allows you to use OpenAI models deployed in [Azure](https://azure.microsoft.com/en-us/products/ai-services/openai-service).
+ - [`AmazonBedrockServerModel`] allows you to use Amazon Bedrock in [AWS](https://aws.amazon.com/bedrock/?nc1=h_ls).
- [`MLXModel`] creates a [mlx-lm](https://pypi.org/project/mlx-lm/) pipeline to run inference on your local machine.
- `tools`, a list of `Tools` that the agent can use to solve the task. It can be an empty list. You can also add the default toolbox on top of your `tools` list by defining the optional argument `add_base_tools=True`.
-Once you have these two arguments, `tools` and `model`, you can create an agent and run it. You can use any LLM you'd like, either through [Inference Providers](https://huggingface.co/blog/inference-providers), [transformers](https://github.com/huggingface/transformers/), [ollama](https://ollama.com/), [LiteLLM](https://www.litellm.ai/), [Azure OpenAI](https://azure.microsoft.com/en-us/products/ai-services/openai-service), or [mlx-lm](https://pypi.org/project/mlx-lm/).
+Once you have these two arguments, `tools` and `model`, you can create an agent and run it. You can use any LLM you'd like, either through [Inference Providers](https://huggingface.co/blog/inference-providers), [transformers](https://github.com/huggingface/transformers/), [ollama](https://ollama.com/), [LiteLLM](https://www.litellm.ai/), [Azure OpenAI](https://azure.microsoft.com/en-us/products/ai-services/openai-service), [Amazon Bedrock](https://aws.amazon.com/bedrock/?nc1=h_ls), or [mlx-lm](https://pypi.org/project/mlx-lm/).
-
+
-HF Inference API is free to use without a token, but then it will have a rate limit.
+Inference Providers need a `HF_TOKEN` to authenticate, but a free HF account already comes with included credits. Upgrade to PRO to raise your included credits.
-To access gated models or rise your rate limits with a PRO account, you need to set the environment variable `HF_TOKEN` or pass `token` variable upon initialization of `HfApiModel`. You can get your token from your [settings page](https://huggingface.co/settings/tokens)
+To access gated models or raise your rate limits with a PRO account, you need to set the environment variable `HF_TOKEN` or pass the `token` argument upon initialization of `InferenceClientModel`. You can get your token from your [settings page](https://huggingface.co/settings/tokens).
```python
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
model_id = "meta-llama/Llama-3.3-70B-Instruct"
-model = HfApiModel(model_id=model_id, token="") # You can choose to not pass any model_id to HfApiModel to use a default free model
+model = InferenceClientModel(model_id=model_id, token="") # You can choose to not pass any model_id to InferenceClientModel to use a default model
# you can also specify a particular provider e.g. provider="together" or provider="sambanova"
agent = CodeAgent(tools=[], model=model, add_base_tools=True)
@@ -149,6 +135,76 @@ agent.run(
)
```
+
+
+
+The `AmazonBedrockServerModel` class provides native integration with Amazon Bedrock, allowing for direct API calls and comprehensive configuration.
+
+#### Basic Usage
+
+```python
+# !pip install smolagents[aws_sdk]
+from smolagents import CodeAgent, AmazonBedrockServerModel
+
+model = AmazonBedrockServerModel(model_id="anthropic.claude-3-sonnet-20240229-v1:0")
+agent = CodeAgent(tools=[], model=model, add_base_tools=True)
+
+agent.run(
+ "Could you give me the 118th number in the Fibonacci sequence?",
+)
+```
+
+#### Advanced Configuration
+
+```python
+import boto3
+from smolagents import AmazonBedrockServerModel
+
+# Create a custom Bedrock client
+bedrock_client = boto3.client(
+ 'bedrock-runtime',
+ region_name='us-east-1',
+ aws_access_key_id='YOUR_ACCESS_KEY',
+ aws_secret_access_key='YOUR_SECRET_KEY'
+)
+
+additional_api_config = {
+ "inferenceConfig": {
+ "maxTokens": 3000
+ },
+ "guardrailConfig": {
+ "guardrailIdentifier": "identify1",
+ "guardrailVersion": 'v1'
+ },
+}
+
+# Initialize with comprehensive configuration
+model = AmazonBedrockServerModel(
+ model_id="us.amazon.nova-pro-v1:0",
+ client=bedrock_client, # Use custom client
+ **additional_api_config
+)
+
+agent = CodeAgent(tools=[], model=model, add_base_tools=True)
+
+agent.run(
+ "Could you give me the 118th number in the Fibonacci sequence?",
+)
+```
+
+#### Using LiteLLMModel
+
+Alternatively, you can use `LiteLLMModel` with Bedrock models:
+
+```python
+from smolagents import LiteLLMModel, CodeAgent
+
+model = LiteLLMModel(model_id="bedrock/anthropic.claude-3-sonnet-20240229-v1:0")
+agent = CodeAgent(tools=[], model=model)
+
+agent.run("Explain the concept of quantum computing")
+```
+
@@ -176,17 +232,22 @@ The Python interpreter also doesn't allow imports by default outside of a safe l
You can authorize additional imports by passing the authorized modules as a list of strings in argument `additional_authorized_imports` upon initialization of your [`CodeAgent`]:
```py
-model = HfApiModel()
+model = InferenceClientModel()
agent = CodeAgent(tools=[], model=model, additional_authorized_imports=['requests', 'bs4'])
agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
```
+Additionally, as an extra security layer, access to submodules is forbidden by default, unless explicitly authorized within the import list.
+For instance, to access the `numpy.random` submodule, you need to add `'numpy.random'` to the `additional_authorized_imports` list.
+This could also be authorized by using `numpy.*`, which will allow `numpy` as well as any subpackage like `numpy.random` and its own subpackages.
+
> [!WARNING]
> The LLM can generate arbitrary code that will then be executed: do not add any unsafe imports!
The execution will stop at any code trying to perform an illegal operation or if there is a regular Python error with the code generated by the agent.
-You can also use [E2B code executor](https://e2b.dev/docs#what-is-e2-b) instead of a local Python interpreter by first [setting the `E2B_API_KEY` environment variable](https://e2b.dev/dashboard?tab=keys) and then passing `use_e2b_executor=True` upon agent initialization.
+You can also use [E2B code executor](https://e2b.dev/docs#what-is-e2-b) or Docker instead of a local Python interpreter. For E2B, first [set the `E2B_API_KEY` environment variable](https://e2b.dev/dashboard?tab=keys) and then pass `executor_type="e2b"` upon agent initialization. For Docker, pass `executor_type="docker"` during initialization.
+
> [!TIP]
> Learn more about code execution [in this tutorial](tutorials/secure_code_execution).
@@ -220,7 +281,7 @@ When the agent is initialized, the tool attributes are used to generate a tool d
### Default toolbox
-`smolagents` comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools = True`:
+`smolagents` comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools=True`:
- **DuckDuckGo web search***: performs a web search using DuckDuckGo browser.
- **Python code interpreter**: runs your LLM generated Python code in a secure environment. This tool will only be added to [`ToolCallingAgent`] if you initialize it with `add_base_tools=True`, since code-based agent can already natively execute Python code
@@ -279,6 +340,7 @@ The function needs:
- A clear name. The name should be descriptive enough of what this tool does to help the LLM brain powering the agent. Since this tool returns the model with the most downloads for a task, let's name it `model_download_tool`.
- Type hints on both inputs and output
- A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint). Same as for the tool name, this description is an instruction manual for the LLM powering you agent, so do not neglect it.
+
All these elements will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible!
> [!TIP]
@@ -312,8 +374,8 @@ All these attributes will be automatically baked into the agent's system prompt
Then you can directly initialize your agent:
```py
-from smolagents import CodeAgent, HfApiModel
-agent = CodeAgent(tools=[model_download_tool], model=HfApiModel())
+from smolagents import CodeAgent, InferenceClientModel
+agent = CodeAgent(tools=[model_download_tool], model=InferenceClientModel())
agent.run(
"Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
)
@@ -326,7 +388,7 @@ You get the following logs:
โ Can you give me the name of the model that has the most downloads in the 'text-to-video' โ
โ task on the Hugging Face Hub? โ
โ โ
-โฐโ HfApiModel - Qwen/Qwen2.5-Coder-32B-Instruct โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ
+โฐโ InferenceClientModel - Qwen/Qwen2.5-Coder-32B-Instruct โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ Step 0 โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โญโ Executing this code: โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ
โ 1 model_name = model_download_tool(task="text-to-video") โ
@@ -364,9 +426,9 @@ Then you can pass this managed agent in the parameter managed_agents upon initia
Here's an example of making an agent that managed a specific web search agent using our [`DuckDuckGoSearchTool`]:
```py
-from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool
+from smolagents import CodeAgent, InferenceClientModel, DuckDuckGoSearchTool
-model = HfApiModel()
+model = InferenceClientModel()
web_agent = CodeAgent(
tools=[DuckDuckGoSearchTool()],
@@ -394,14 +456,14 @@ You can use `GradioUI` to interactively submit tasks to your agent and observe i
from smolagents import (
load_tool,
CodeAgent,
- HfApiModel,
+ InferenceClientModel,
GradioUI
)
# Import tool from Hub
image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True)
-model = HfApiModel(model_id)
+model = InferenceClientModel(model_id=model_id)
# Initialize the agent with the image generation tool
agent = CodeAgent(tools=[image_generation_tool], model=model)
@@ -414,6 +476,9 @@ The `reset=False` flag means the agent's memory is not flushed before launching
You can also use this `reset=False` argument to keep the conversation going in any other agentic application.
+In Gradio UIs, if you want to allow users to interrupt a running agent, you could do this with a button that calls the `agent.interrupt()` method.
+This will stop the agent at the end of its current step, then raise an error.
+
## Next steps
Finally, when you've configured your agent to your needs, you can share it to the Hub!
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index 14f80ff5b..97cc905fc 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -1,18 +1,3 @@
-
-
# `smolagents`
@@ -25,7 +10,7 @@ This library offers:
✨ **Simplicity**: the logic for agents fits in ~thousand lines of code. We kept abstractions to their minimal shape above raw code!
-🌐 **Support for any LLM**: it supports models hosted on the Hub loaded in their `transformers` version or through our inference API and Inference providers, but also models from OpenAI, Anthropic... it's really easy to power an agent with any LLM.
+🌐 **Support for any LLM**: it supports models hosted on the Hub loaded in their `transformers` version or through [Inference providers](https://huggingface.co/docs/inference-providers/index): Cerebras, Cohere, Fal, Fireworks, Hyperbolic, Nebius, Novita, Replicate, SambaNova, Together, etc. It also supports models from OpenAI, Anthropic... it's really easy to power an agent with any LLM.
🧑‍💻 **First-class support for Code Agents**, i.e. agents that write their actions in code (as opposed to "agents being used to write code"), [read more here](tutorials/secure_code_execution).
diff --git a/docs/source/en/reference/agents.mdx b/docs/source/en/reference/agents.mdx
index a6f57183e..d8f975e34 100644
--- a/docs/source/en/reference/agents.mdx
+++ b/docs/source/en/reference/agents.mdx
@@ -1,18 +1,3 @@
-
# Agents
diff --git a/docs/source/en/reference/models.mdx b/docs/source/en/reference/models.mdx
index 2a7f8f45d..59816c60e 100644
--- a/docs/source/en/reference/models.mdx
+++ b/docs/source/en/reference/models.mdx
@@ -1,18 +1,3 @@
-
# Models
@@ -27,13 +12,17 @@ contains the API docs for the underlying classes.
## Models
+### Your custom Model
+
You're free to create and use your own models to power your agent.
-You could use any `model` callable for your agent, as long as:
-1. It follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`.
-2. It stops generating outputs *before* the sequences passed in the argument `stop_sequences`
+You could subclass the base `Model` class to create a model for your agent.
+The main requirement is to override the `generate` method, subject to these two criteria:
+1. It follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns an object with a `.content` attribute.
+2. It stops generating outputs at the sequences passed in the argument `stop_sequences`.
-For defining your LLM, you can make a `custom_model` method which accepts a list of [messages](./chat_templating) and returns an object with a .content attribute containing the text. This callable also needs to accept a `stop_sequences` argument that indicates when to stop generating.
+For defining your LLM, you can make a `CustomModel` class that inherits from the base `Model` class.
+It should have a `generate` method that takes a list of [messages](./chat_templating) and returns an object with a `.content` attribute containing the text. The `generate` method also needs to accept a `stop_sequences` argument that indicates when to stop generating.
```python
from huggingface_hub import login, InferenceClient
@@ -44,13 +33,16 @@ model_id = "meta-llama/Llama-3.3-70B-Instruct"
client = InferenceClient(model=model_id)
-def custom_model(messages, stop_sequences=["Task"]):
- response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000)
- answer = response.choices[0].message
- return answer
+class CustomModel(Model):
+    def generate(self, messages, stop_sequences=["Task"]):
+        response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1024)
+        answer = response.choices[0].message
+        return answer
+
+custom_model = CustomModel()
```
-Additionally, `custom_model` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to model, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs.
+Additionally, `generate` can take a `grammar` argument. If you specify a `grammar` upon agent initialization, it will be passed along in every call to the model, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs.
### TransformersModel
@@ -72,24 +64,24 @@ print(model([{"role": "user", "content": [{"type": "text", "text": "Ok!"}]}], st
[[autodoc]] TransformersModel
-### HfApiModel
+### InferenceClientModel
-The `HfApiModel` wraps huggingface_hub's [InferenceClient](https://huggingface.co/docs/huggingface_hub/main/en/guides/inference) for the execution of the LLM. It supports both HF's own [Inference API](https://huggingface.co/docs/api-inference/index) as well as all [Inference Providers](https://huggingface.co/blog/inference-providers) available on the Hub.
+The `InferenceClientModel` wraps huggingface_hub's [InferenceClient](https://huggingface.co/docs/huggingface_hub/main/en/guides/inference) for the execution of the LLM. It supports all [Inference Providers](https://huggingface.co/docs/inference-providers/index) available on the Hub: Cerebras, Cohere, Fal, Fireworks, HF-Inference, Hyperbolic, Nebius, Novita, Replicate, SambaNova, Together, and more.
```python
-from smolagents import HfApiModel
+from smolagents import InferenceClientModel
messages = [
{"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]}
]
-model = HfApiModel()
+model = InferenceClientModel(provider="novita")
print(model(messages))
```
```text
>>> Of course! If you change your mind, feel free to reach out. Take care!
```
-[[autodoc]] HfApiModel
+[[autodoc]] InferenceClientModel
### LiteLLMModel
@@ -103,12 +95,46 @@ messages = [
{"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]}
]
-model = LiteLLMModel("anthropic/claude-3-5-sonnet-latest", temperature=0.2, max_tokens=10)
+model = LiteLLMModel(model_id="anthropic/claude-3-5-sonnet-latest", temperature=0.2, max_tokens=10)
print(model(messages))
```
[[autodoc]] LiteLLMModel
+### LiteLLMRouterModel
+
+The `LiteLLMRouterModel` is a wrapper around the [LiteLLM Router](https://docs.litellm.ai/docs/routing) that leverages
+advanced routing strategies: load-balancing across multiple deployments, prioritizing critical requests via queueing,
+and implementing basic reliability measures such as cooldowns, fallbacks, and exponential backoff retries.
+
+```python
+import os
+
+from smolagents import LiteLLMRouterModel
+
+messages = [
+ {"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]}
+]
+
+model = LiteLLMRouterModel(
+ model_id="llama-3.3-70b",
+ model_list=[
+ {
+ "model_name": "llama-3.3-70b",
+ "litellm_params": {"model": "groq/llama-3.3-70b", "api_key": os.getenv("GROQ_API_KEY")},
+ },
+ {
+ "model_name": "llama-3.3-70b",
+ "litellm_params": {"model": "cerebras/llama-3.3-70b", "api_key": os.getenv("CEREBRAS_API_KEY")},
+ },
+ ],
+ client_kwargs={
+ "routing_strategy": "simple-shuffle",
+ },
+)
+print(model(messages))
+```
+
+[[autodoc]] LiteLLMRouterModel
+
### OpenAIServerModel
This class lets you call any OpenAIServer compatible model.
@@ -149,6 +175,24 @@ model = AzureOpenAIServerModel(
[[autodoc]] AzureOpenAIServerModel
+### AmazonBedrockServerModel
+
+`AmazonBedrockServerModel` helps you connect to Amazon Bedrock and run your agent with any available models.
+
+Below is an example setup. This class also offers additional options for customization.
+
+```py
+import os
+
+from smolagents import AmazonBedrockServerModel
+
+model = AmazonBedrockServerModel(
+ model_id = os.environ.get("AMAZON_BEDROCK_MODEL_ID"),
+)
+```
+
+[[autodoc]] AmazonBedrockServerModel
+
### MLXModel
@@ -167,3 +211,20 @@ print(model([{"role": "user", "content": "Ok!"}], stop_sequences=["great"]))
> You must have `mlx-lm` installed on your machine. Please run `pip install smolagents[mlx-lm]` if it's not the case.
[[autodoc]] MLXModel
+
+### VLLMModel
+
+Model to use [vLLM](https://docs.vllm.ai/) for fast LLM inference and serving.
+
+```python
+from smolagents import VLLMModel
+
+model = VLLMModel(model_id="HuggingFaceTB/SmolLM-135M-Instruct")
+
+print(model([{"role": "user", "content": "Ok!"}], stop_sequences=["great"]))
+```
+
+> [!TIP]
+> You must have `vllm` installed on your machine. Please run `pip install smolagents[vllm]` if it's not the case.
+
+[[autodoc]] VLLMModel
diff --git a/docs/source/en/reference/tools.mdx b/docs/source/en/reference/tools.mdx
index 68c70b897..a5d217bb8 100644
--- a/docs/source/en/reference/tools.mdx
+++ b/docs/source/en/reference/tools.mdx
@@ -1,18 +1,3 @@
-
# Tools
@@ -77,6 +62,10 @@ contains the API docs for the underlying classes.
[[autodoc]] ToolCollection
+## MCP Client
+
+[[autodoc]] smolagents.mcp_client.MCPClient
+
## Agent Types
Agents can handle any type of object in-between tools; tools, being completely multimodal, can accept and return
diff --git a/docs/source/en/tutorials/building_good_agents.mdx b/docs/source/en/tutorials/building_good_agents.mdx
index 8c17de1af..53bda8f92 100644
--- a/docs/source/en/tutorials/building_good_agents.mdx
+++ b/docs/source/en/tutorials/building_good_agents.mdx
@@ -1,18 +1,3 @@
-
# Building good agents
[[open-in-colab]]
@@ -43,7 +28,7 @@ This leads to a few takeaways:
### Improve the information flow to the LLM engine
-Remember that your LLM engine is like an *intelligent* robot, tapped into a room with the only communication with the outside world being notes passed under a door.
+Remember that your LLM engine is like an *intelligent* robot, trapped in a room with the only communication with the outside world being notes passed under a door.
It won't know of anything that happened if you don't explicitly put that into its prompt.
@@ -120,11 +105,11 @@ In general, to ease the load on your LLM, the good question to ask yourself is:
To pass some additional objects to your agent beyond the simple string describing the task, you can use the `additional_args` argument to pass any type of object:
```py
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
model_id = "meta-llama/Llama-3.3-70B-Instruct"
-agent = CodeAgent(tools=[], model=HfApiModel(model_id=model_id), add_base_tools=True)
+agent = CodeAgent(tools=[], model=InferenceClientModel(model_id=model_id), add_base_tools=True)
agent.run(
"Why does Mike not know many people in New York?",
@@ -210,13 +195,153 @@ In the end you have to return a final answer using the `final_answer` tool.
Here are a few examples using notional tools:
---
-{examples}
+Task: "Generate an image of the oldest person in this document."
-Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
+Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
+Code:
+```py
+answer = document_qa(document=document, question="Who is the oldest person mentioned?")
+print(answer)
+```
+Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
+
+Thought: I will now generate an image showcasing the oldest person.
+Code:
+```py
+image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.")
+final_answer(image)
+```
+
+---
+Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
+
+Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
+Code:
+```py
+result = 5 + 3 + 1294.678
+final_answer(result)
+```
+
+---
+Task:
+"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
+You have been provided with these additional arguments, that you can access using the keys as variables in your python code:
+{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}"
+
+Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
+Code:
+```py
+translated_question = translator(question=question, src_lang="French", tgt_lang="English")
+print(f"The translated question is {translated_question}.")
+answer = image_qa(image=image, question=translated_question)
+final_answer(f"The answer is {answer}")
+```
+
+---
+Task:
+In a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer.
+What does he say was the consequence of Einstein learning too much math on his creativity, in one word?
-{{tool_descriptions}}
+Thought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin.
+Code:
+```py
+pages = search(query="1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein")
+print(pages)
+```
+Observation:
+No result found for query "1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein".
+
+Thought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query.
+Code:
+```py
+pages = search(query="1979 interview Stanislaus Ulam")
+print(pages)
+```
+Observation:
+Found 6 pages:
+[Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/)
+
+[Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/)
+
+(truncated)
+
+Thought: I will read the first 2 pages to know more.
+Code:
+```py
+for url in ["https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/", "https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/"]:
+ whole_page = visit_webpage(url)
+ print(whole_page)
+ print("\n" + "="*80 + "\n") # Print separator between pages
+```
+Observation:
+Manhattan Project Locations:
+Los Alamos, NM
+Stanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at
+(truncated)
+
+Thought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: "He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity." Let's answer in one word.
+Code:
+```py
+final_answer("diminished")
+```
-{{managed_agents_descriptions}}
+---
+Task: "Which city has the highest population: Guangzhou or Shanghai?"
+
+Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
+Code:
+```py
+for city in ["Guangzhou", "Shanghai"]:
+    print(f"Population {city}:", search(f"{city} population"))
+```
+Observation:
+Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
+Population Shanghai: '26 million (2019)'
+
+Thought: Now I know that Shanghai has the highest population.
+Code:
+```py
+final_answer("Shanghai")
+```
+
+---
+Task: "What is the current age of the pope, raised to the power 0.36?"
+
+Thought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.
+Code:
+```py
+pope_age_wiki = wiki(query="current pope age")
+print("Pope age as per wikipedia:", pope_age_wiki)
+pope_age_search = web_search(query="current pope age")
+print("Pope age as per google search:", pope_age_search)
+```
+Observation:
+Pope age: "The pope Francis is currently 88 years old."
+
+Thought: I know that the pope is 88 years old. Let's compute the result using python code.
+Code:
+```py
+pope_current_age = 88 ** 0.36
+final_answer(pope_current_age)
+```
+
+Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
+{%- for tool in tools.values() %}
+- {{ tool.name }}: {{ tool.description }}
+ Takes inputs: {{tool.inputs}}
+ Returns an output of type: {{tool.output_type}}
+{%- endfor %}
+
+{%- if managed_agents and managed_agents.values() | list %}
+You can also give tasks to team members.
+Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
+Given that this team member is a real human, you should be very verbose in your task.
+Here is a list of the team members that you can call:
+{%- for agent in managed_agents.values() %}
+- {{ agent.name }}: {{ agent.description }}
+{%- endfor %}
+{%- else %}
+{%- endif %}
Here are the rules you should always follow to solve your task:
1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```' sequence, else you will fail.
@@ -225,7 +350,7 @@ Here are the rules you should always follow to solve your task:
4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
-7. Never create any notional variables in our code, as having these in your logs might derail you from the true variables.
+7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
@@ -233,12 +358,30 @@ Here are the rules you should always follow to solve your task:
Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
```
-As you can see, there are placeholders like `"{{tool_descriptions}}"`: these will be used upon agent initialization to insert certain automatically generated descriptions of tools or managed agents.
-
-So while you can overwrite this system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter, your new system prompt must contain the following placeholders:
-- `"{{tool_descriptions}}"` to insert tool descriptions.
-- `"{{managed_agents_description}}"` to insert the description for managed agents if there are any.
-- For `CodeAgent` only: `"{{authorized_imports}}"` to insert the list of authorized imports.
+As you can see, there are placeholders like `"{{ tool.description }}"`: these will be used upon agent initialization to insert certain automatically generated descriptions of tools or managed agents.
+
+So while you can overwrite this system prompt template by passing your custom prompt as an argument to the `system_prompt` parameter, your new system prompt can contain the following placeholders:
+- To insert tool descriptions:
+ ```
+ {%- for tool in tools.values() %}
+ - {{ tool.name }}: {{ tool.description }}
+ Takes inputs: {{tool.inputs}}
+ Returns an output of type: {{tool.output_type}}
+ {%- endfor %}
+ ```
+- To insert the descriptions for managed agents if there are any:
+ ```
+ {%- if managed_agents and managed_agents.values() | list %}
+ You can also give tasks to team members.
+ Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
+ Given that this team member is a real human, you should be very verbose in your task.
+ Here is a list of the team members that you can call:
+ {%- for agent in managed_agents.values() %}
+ - {{ agent.name }}: {{ agent.description }}
+ {%- endfor %}
+ {%- endif %}
+ ```
+- For `CodeAgent` only, to insert the list of authorized imports: `"{{authorized_imports}}"`
Then you can change the system prompt as follows:
@@ -254,7 +397,7 @@ This also works with the [`ToolCallingAgent`].
We provide a model for a supplementary planning step, that an agent can run regularly in-between normal action steps. In this step, there is no tool call, the LLM is simply asked to update a list of facts it knows and to reflect on what steps it should take next based on those facts.
```py
-from smolagents import load_tool, CodeAgent, HfApiModel, DuckDuckGoSearchTool
+from smolagents import load_tool, CodeAgent, InferenceClientModel, DuckDuckGoSearchTool
from dotenv import load_dotenv
load_dotenv()
@@ -266,7 +409,7 @@ search_tool = DuckDuckGoSearchTool()
agent = CodeAgent(
tools=[search_tool, image_generation_tool],
- model=HfApiModel("Qwen/Qwen2.5-72B-Instruct"),
+ model=InferenceClientModel(model_id="Qwen/Qwen2.5-72B-Instruct"),
planning_interval=3 # This is where you activate planning!
)
diff --git a/docs/source/en/tutorials/inspect_runs.mdx b/docs/source/en/tutorials/inspect_runs.mdx
index 4ade8427b..333db728b 100644
--- a/docs/source/en/tutorials/inspect_runs.mdx
+++ b/docs/source/en/tutorials/inspect_runs.mdx
@@ -1,18 +1,3 @@
-
# Inspecting runs with OpenTelemetry
[[open-in-colab]]
@@ -71,10 +56,10 @@ from smolagents import (
ToolCallingAgent,
DuckDuckGoSearchTool,
VisitWebpageTool,
- HfApiModel,
+ InferenceClientModel,
)
-model = HfApiModel()
+model = InferenceClientModel()
search_agent = ToolCallingAgent(
tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
@@ -160,10 +145,10 @@ from smolagents import (
ToolCallingAgent,
DuckDuckGoSearchTool,
VisitWebpageTool,
- HfApiModel,
+ InferenceClientModel,
)
-model = HfApiModel(
+model = InferenceClientModel(
model_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
)
diff --git a/docs/source/en/tutorials/memory.mdx b/docs/source/en/tutorials/memory.mdx
index 0732d9596..df982da82 100644
--- a/docs/source/en/tutorials/memory.mdx
+++ b/docs/source/en/tutorials/memory.mdx
@@ -1,18 +1,3 @@
-
# ๐ Manage your agent's memory
[[open-in-colab]]
@@ -30,9 +15,9 @@ You can also use `agent.replay()`, as follows:
After the agent has run:
```py
-from smolagents import HfApiModel, CodeAgent
+from smolagents import InferenceClientModel, CodeAgent
-agent = CodeAgent(tools=[], model=HfApiModel(), verbosity_level=0)
+agent = CodeAgent(tools=[], model=InferenceClientModel(), verbosity_level=0)
result = agent.run("What's the 20th Fibonacci number?")
```
@@ -73,7 +58,7 @@ You can also use step callbacks to dynamically change the agent's memory.
Step callbacks can access the `agent` itself in their arguments, so they can access any memory step as highlighted above, and change it if needed. For instance, let's say you are observing screenshots of each step performed by a web browser agent. You want to log the newest screenshot, and remove the images from ancient steps to save on token costs.
-You culd run something like the following.
+You could run something like the following.
_Note: this code is incomplete, some imports and object definitions have been removed for the sake of concision, visit [the original script](https://github.com/huggingface/smolagents/blob/main/src/smolagents/vision_web_browser.py) to get the full working code._
```py
@@ -115,9 +100,10 @@ This can be useful in case you have tool calls that take days: you can just run
This will also let you update the memory on each step.
```py
-from smolagents import HfApiModel, CodeAgent, ActionStep, TaskStep
+from smolagents import InferenceClientModel, CodeAgent, ActionStep, TaskStep
-agent = CodeAgent(tools=[], model=HfApiModel(), verbosity_level=1)
+agent = CodeAgent(tools=[], model=InferenceClientModel(), verbosity_level=1)
+agent.python_executor.send_tools({**agent.tools})
print(agent.memory.system_prompt)
task = "What is the 20th Fibonacci number?"
@@ -145,4 +131,4 @@ while final_answer is None and step_number <= 10:
# agent.memory.steps[-1] = ...
print("The final answer is:", final_answer)
-```
\ No newline at end of file
+```
diff --git a/docs/source/en/tutorials/secure_code_execution.mdx b/docs/source/en/tutorials/secure_code_execution.mdx
index daa8ee900..8716f63c6 100644
--- a/docs/source/en/tutorials/secure_code_execution.mdx
+++ b/docs/source/en/tutorials/secure_code_execution.mdx
@@ -1,18 +1,3 @@
-
# Secure code execution
[[open-in-colab]]
@@ -24,12 +9,12 @@ rendered properly in your Markdown viewer.
[Multiple](https://huggingface.co/papers/2402.01030) [research](https://huggingface.co/papers/2411.01747) [papers](https://huggingface.co/papers/2401.00812) have shown that having the LLM write its actions (the tool calls) in code is much better than the current standard format for tool calling, which is across the industry different shades of "writing actions as a JSON of tools names and arguments to use".
-Why is code better? Well, because we crafted our code languages specifically to be great at expressing actions performed by a computer. If JSON snippets was a better way, this package would have been written in JSON snippets and the devil would be laughing at us.
+Why is code better? Well, because we crafted our code languages specifically to be great at expressing actions performed by a computer. If JSON snippets were a better way, this package would have been written in JSON snippets and the devil would be laughing at us.
Code is just a better way to express actions on a computer. It has better:
- **Composability:** could you nest JSON actions within each other, or define a set of JSON actions to re-use later, the same way you could just define a python function?
- **Object management:** how do you store the output of an action like `generate_image` in JSON?
-- **Generality:** code is built to express simply anything you can do have a computer do.
+- **Generality:** code is built to express simply anything you can have a computer do.
- **Representation in LLM training corpus:** why not leverage this benediction of the sky that plenty of quality actions have already been included in LLM training corpus?
This is illustrated on the figure below, taken from [Executable Code Actions Elicit Better LLM Agents](https://huggingface.co/papers/2402.01030).
@@ -38,45 +23,392 @@ This is illustrated on the figure below, taken from [Executable Code Actions Eli
This is why we put emphasis on proposing code agents, in this case python agents, which meant putting higher effort on building secure python interpreters.
-### Local python interpreter
+### Local code execution
By default, the `CodeAgent` runs LLM-generated code in your environment.
-This execution is not done by the vanilla Python interpreter: we've re-built a more secure `LocalPythonInterpreter` from the ground up.
-This interpreter is designed for security by:
- - Restricting the imports to a list explicitly passed by the user
- - Capping the number of operations to prevent infinite loops and resource bloating.
- - Will not perform any operation that's not pre-defined.
-We've used this on many use cases, without ever observing any damage to the environment.
+This is inherently risky: LLM-generated code could be harmful to your environment.
+
+Malicious code execution can occur in several ways:
+- **Plain LLM error:** LLMs are still far from perfect and may unintentionally generate harmful commands while attempting to be helpful. While this risk is low, instances have been observed where an LLM attempted to execute potentially dangerous code.
+- **Supply chain attack:** Running an untrusted or compromised LLM could expose a system to harmful code generation. While this risk is extremely low when using well-known models on secure inference infrastructure, it remains a theoretical possibility.
+- **Prompt injection:** An agent browsing the web could arrive on a malicious website that contains harmful instructions, thus injecting an attack into the agent's memory.
+- **Exploitation of publicly accessible agents:** Agents exposed to the public can be misused by malicious actors to execute harmful code. Attackers may craft adversarial inputs to exploit the agent's execution capabilities, leading to unintended consequences.
+Once malicious code is executed, whether accidentally or intentionally, it can damage the file system, exploit local or cloud-based resources, abuse API services, and even compromise network security.
+
+One could argue that on the [spectrum of agency](../conceptual_guides/intro_agents), code agents give much higher agency to the LLM on your system than other less agentic setups: this goes hand-in-hand with higher risk.
+
+So you need to be very mindful of security.
+
+To improve safety, we propose a range of measures that offer increasing levels of security, at a higher setup cost.
+
+We advise you to keep in mind that no solution will be 100% safe.
+
+
+
+### Our local Python executor
+
+To add a first layer of security, code execution in `smolagents` is not performed by the vanilla Python interpreter.
+We have re-built a more secure `LocalPythonExecutor` from the ground up.
+
+To be precise, this interpreter works by loading the Abstract Syntax Tree (AST) from your code and executing it operation by operation, making sure to always follow certain rules:
+- By default, imports are disallowed unless they have been explicitly added to an authorization list by the user.
+- Furthermore, access to submodules is disabled by default, and each must be explicitly authorized in the import list as well, or you can pass for instance `numpy.*` to allow both `numpy` and all its subpackages, like `numpy.random` or `numpy.a.b`.
+ - Note that some seemingly innocuous packages like `random` can give access to potentially harmful submodules, as in `random._os`.
+- The total count of elementary operations processed is capped to prevent infinite loops and resource bloating.
+- Any operation that has not been explicitly defined in our custom interpreter will raise an error.
+
+You could try these safeguards as follows:
+
+```py
+from smolagents.local_python_executor import LocalPythonExecutor
+
+# Set up custom executor, authorize package "numpy"
+custom_executor = LocalPythonExecutor(["numpy"])
+
+# Utility for pretty printing errors
+def run_capture_exception(command: str):
+ try:
+ custom_executor(command)
+ except Exception as e:
+ print("ERROR:\n", e)
+
+# Undefined commands just do not work
+harmful_command="!echo Bad command"
+run_capture_exception(harmful_command)
+# >>> ERROR: invalid syntax (, line 1)
+
+
+# Imports like os will not be performed unless explicitly added to `additional_authorized_imports`
+harmful_command="import os; exit_code = os.system('echo Bad command')"
+run_capture_exception(harmful_command)
+# >>> ERROR: Code execution failed at line 'import os' due to: InterpreterError: Import of os is not allowed. Authorized imports are: ['statistics', 'numpy', 'itertools', 'time', 'queue', 'collections', 'math', 'random', 're', 'datetime', 'stat', 'unicodedata']
+
+# Even in authorized imports, potentially harmful packages will not be imported
+harmful_command="import random; random._os.system('echo Bad command')"
+run_capture_exception(harmful_command)
+# >>> ERROR: Code execution failed at line 'random._os.system('echo Bad command')' due to: InterpreterError: Forbidden access to module: os
+
+# Infinite loops are interrupted after N operations
+harmful_command="""
+while True:
+ pass
+"""
+run_capture_exception(harmful_command)
+# >>> ERROR: Code execution failed at line 'while True: pass' due to: InterpreterError: Maximum number of 1000000 iterations in While loop exceeded
+```
+
+These safeguards make our interpreter safer.
+We have used it on a diversity of use cases, without ever observing any damage to the environment.
+
+> [!WARNING]
+> It's important to understand that no local python sandbox can ever be completely secure. While our interpreter provides significant safety improvements over the standard Python interpreter, it is still possible for a determined attacker or a fine-tuned malicious LLM to find vulnerabilities and potentially harm your environment.
+>
+> For example, if you've allowed packages like `Pillow` to process images, the LLM could generate code that creates thousands of large image files to fill your hard drive. Other advanced escape techniques might exploit deeper vulnerabilities in authorized packages.
+>
+> Running LLM-generated code in your local environment always carries some inherent risk. The only way to run LLM-generated code with truly robust security isolation is to use remote execution options like E2B or Docker, as detailed below.
+
+The risk of a malicious attack is low when using well-known LLMs from trusted inference providers, but it is not zero.
+For high-security applications or when using less trusted models, you should consider using a remote execution sandbox.
+
+## Sandbox approaches for secure code execution
+
+When working with AI agents that execute code, security is paramount. There are two main approaches to sandboxing code execution in smolagents, each with different security properties and capabilities:
+
+
+
+
+1. **Running individual code snippets in a sandbox**: This approach (left side of diagram) only executes the agent-generated Python code snippets in a sandbox while keeping the rest of the agentic system in your local environment. It's simpler to set up using `executor_type="e2b"` or `executor_type="docker"`, but it doesn't support multi-agents and still requires passing state data between your environment and the sandbox.
+
+2. **Running the entire agentic system in a sandbox**: This approach (right side of diagram) runs the entire agentic system, including the agent, model, and tools, within a sandbox environment. This provides better isolation but requires more manual setup and may require passing sensitive credentials (like API keys) to the sandbox environment.
+
+This guide describes how to set up and use both types of sandbox approaches for your agent applications.
+
+### E2B setup
+
+#### Installation
+
+1. Create an E2B account at [e2b.dev](https://e2b.dev)
+2. Install the required packages:
+```bash
+pip install 'smolagents[e2b]'
+```
+
+#### Running your agent in E2B: quick start
+
+We provide a simple way to use an E2B Sandbox: simply add `executor_type="e2b"` to the agent initialization, as follows:
+
+```py
+from smolagents import InferenceClientModel, CodeAgent
+
+agent = CodeAgent(model=InferenceClientModel(), tools=[], executor_type="e2b")
+
+agent.run("Can you give me the 100th Fibonacci number?")
+```
+
+This solution sends the agent state to the server at the start of each `agent.run()`.
+Then the models are called from the local environment, but the generated code will be sent to the sandbox for execution, and only the output will be returned.
+
+This is illustrated in the figure below.
+
+
+
+
+
-However this solution is not watertight: one could imagine occasions where LLMs fine-tuned for malignant actions could still hurt your environment. For instance if you've allowed an innocuous package like `Pillow` to process images, the LLM could generate thousands of saves of images to bloat your hard drive.
-It's certainly not likely if you've chosen the LLM engine yourself, but it could happen.
+However, any call to a [managed agent](../examples/multiagents) would require model calls, and since we do not transfer secrets to the remote sandbox, those model calls would lack credentials.
+Hence this solution does not work (yet) with more complicated multi-agent setups.
-So if you want to be extra cautious, you can use the remote code execution option described below.
+#### Running your agent in E2B: multi-agents
-### E2B code executor
+To use multi-agents in an E2B sandbox, you need to run your agents completely from within E2B.
-For maximum security, you can use our integration with E2B to run code in a sandboxed environment. This is a remote execution service that runs your code in an isolated container, making it impossible for the code to affect your local environment.
+Here is how to do it:
-For this, you will need to setup your E2B account and set your `E2B_API_KEY` in your environment variables. Head to [E2B's quickstart documentation](https://e2b.dev/docs/quickstart) for more information.
+```python
+from e2b_code_interpreter import Sandbox
+import os
-Then you can install it with `pip install "smolagents[e2b]"`.
+# Create the sandbox
+sandbox = Sandbox()
-Now you're set!
+# Install required packages
+sandbox.commands.run("pip install smolagents")
-To set the code executor to E2B, simply pass the flag `use_e2b_executor=True` when initializing your `CodeAgent`.
-Note that you should add all the tool's dependencies in `additional_authorized_imports`, so that the executor installs them.
+def run_code_raise_errors(sandbox, code: str, verbose: bool = False) -> str:
+ execution = sandbox.run_code(
+ code,
+ envs={'HF_TOKEN': os.getenv('HF_TOKEN')}
+ )
+ if execution.error:
+ execution_logs = "\n".join([str(log) for log in execution.logs.stdout])
+ logs = execution_logs
+ logs += execution.error.traceback
+ raise ValueError(logs)
+ return "\n".join([str(log) for log in execution.logs.stdout])
+# Define your agent application
+agent_code = """
+import os
+from smolagents import CodeAgent, InferenceClientModel
+
+# Initialize the agents
+agent = CodeAgent(
+ model=InferenceClientModel(token=os.getenv("HF_TOKEN"), provider="together"),
+ tools=[],
+ name="coder_agent",
+ description="This agent takes care of your difficult algorithmic problems using code."
+)
+
+manager_agent = CodeAgent(
+ model=InferenceClientModel(token=os.getenv("HF_TOKEN"), provider="together"),
+ tools=[],
+ managed_agents=[agent],
+)
+
+# Run the agent
+response = manager_agent.run("What's the 20th Fibonacci number?")
+print(response)
+"""
+
+# Run the agent code in the sandbox
+execution_logs = run_code_raise_errors(sandbox, agent_code)
+print(execution_logs)
+```
+
+### Docker setup
+
+#### Installation
+
+1. [Install Docker on your system](https://docs.docker.com/get-started/get-docker/)
+2. Install the required packages:
+```bash
+pip install 'smolagents[docker]'
+```
+
+#### Running your agent in Docker: quick start
+
+Similar to the E2B Sandbox above, to quickly get started with Docker, simply add `executor_type="docker"` to the agent initialization, like:
```py
-from smolagents import CodeAgent, VisitWebpageTool, HfApiModel
+from smolagents import InferenceClientModel, CodeAgent
+
+agent = CodeAgent(model=InferenceClientModel(), tools=[], executor_type="docker")
+
+agent.run("Can you give me the 100th Fibonacci number?")
+```
+
+#### Advanced docker usage
+
+If you want to run multi-agent systems in Docker, you'll need to set up a custom interpreter in a sandbox.
+
+Here is how to set up the Dockerfile:
+
+```dockerfile
+FROM python:3.10-bullseye
+
+# Install build dependencies
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ build-essential \
+ python3-dev && \
+ pip install --no-cache-dir --upgrade pip && \
+ pip install --no-cache-dir smolagents && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Run with limited privileges
+USER nobody
+
+# Default command
+CMD ["python", "-c", "print('Container ready')"]
+```
+
+Create a sandbox manager to run code:
+
+```python
+import docker
+import os
+from typing import Optional
+
+class DockerSandbox:
+ def __init__(self):
+ self.client = docker.from_env()
+ self.container = None
+
+ def create_container(self):
+ try:
+ image, build_logs = self.client.images.build(
+ path=".",
+ tag="agent-sandbox",
+ rm=True,
+ forcerm=True,
+ buildargs={},
+ # decode=True
+ )
+ except docker.errors.BuildError as e:
+ print("Build error logs:")
+ for log in e.build_log:
+ if 'stream' in log:
+ print(log['stream'].strip())
+ raise
+
+ # Create container with security constraints and proper logging
+ self.container = self.client.containers.run(
+ "agent-sandbox",
+ command="tail -f /dev/null", # Keep container running
+ detach=True,
+ tty=True,
+ mem_limit="512m",
+ cpu_quota=50000,
+ pids_limit=100,
+ security_opt=["no-new-privileges"],
+ cap_drop=["ALL"],
+ environment={
+ "HF_TOKEN": os.getenv("HF_TOKEN")
+ },
+ )
+
+ def run_code(self, code: str) -> Optional[str]:
+ if not self.container:
+ self.create_container()
+
+ # Execute code in container
+ exec_result = self.container.exec_run(
+ cmd=["python", "-c", code],
+ user="nobody"
+ )
+
+ # Collect all output
+ return exec_result.output.decode() if exec_result.output else None
+
+
+ def cleanup(self):
+ if self.container:
+ try:
+ self.container.stop()
+ except docker.errors.NotFound:
+ # Container already removed, this is expected
+ pass
+ except Exception as e:
+ print(f"Error during cleanup: {e}")
+ finally:
+ self.container = None # Clear the reference
+
+# Example usage:
+sandbox = DockerSandbox()
+
+try:
+ # Define your agent code
+ agent_code = """
+import os
+from smolagents import CodeAgent, InferenceClientModel
+
+# Initialize the agent
agent = CodeAgent(
- tools = [VisitWebpageTool()],
- model=HfApiModel(),
- additional_authorized_imports=["requests", "markdownify"],
- use_e2b_executor=True
+ model=InferenceClientModel(token=os.getenv("HF_TOKEN"), provider="together"),
+ tools=[]
)
-agent.run("What was Abraham Lincoln's preferred pet?")
+# Run the agent
+response = agent.run("What's the 20th Fibonacci number?")
+print(response)
+"""
+
+ # Run the code in the sandbox
+ output = sandbox.run_code(agent_code)
+ print(output)
+
+finally:
+ sandbox.cleanup()
```
-E2B code execution is not compatible with multi-agents at the moment - because having an agent call in a code blob that should be executed remotely is a mess. But we're working on adding it!
+### Best practices for sandboxes
+
+These key practices apply to both E2B and Docker sandboxes:
+
+- Resource management
+ - Set memory and CPU limits
+ - Implement execution timeouts
+ - Monitor resource usage
+- Security
+ - Run with minimal privileges
+ - Disable unnecessary network access
+ - Use environment variables for secrets
+- Environment
+ - Keep dependencies minimal
+ - Use fixed package versions
+ - If you use base images, update them regularly
+
+- Cleanup
+ - Always ensure proper cleanup of resources, especially for Docker containers, to avoid having dangling containers eating up resources.
+
+✨ By following these practices and implementing proper cleanup procedures, you can ensure your agent runs safely and efficiently in a sandboxed environment.
+
+## Comparing security approaches
+
+As illustrated in the diagram earlier, both sandboxing approaches have different security implications:
+
+### Approach 1: Running just the code snippets in a sandbox
+- **Pros**:
+ - Easier to set up with a simple parameter (`executor_type="e2b"` or `executor_type="docker"`)
+ - No need to transfer API keys to the sandbox
+ - Better protection for your local environment
+- **Cons**:
+ - Doesn't support multi-agents (managed agents)
+ - Still requires transferring state between your environment and the sandbox
+ - Limited to specific code execution
+
+### Approach 2: Running the entire agentic system in a sandbox
+- **Pros**:
+ - Supports multi-agents
+ - Complete isolation of the entire agent system
+ - More flexible for complex agent architectures
+- **Cons**:
+ - Requires more manual setup
+ - May require transferring sensitive API keys to the sandbox
+ - Potentially higher latency due to more complex operations
+
+Choose the approach that best balances your security needs with your application's requirements. For most applications with simpler agent architectures, Approach 1 provides a good balance of security and ease of use. For more complex multi-agent systems where you need full isolation, Approach 2, while more involved to set up, offers better security guarantees.
\ No newline at end of file
diff --git a/docs/source/en/tutorials/tools.mdx b/docs/source/en/tutorials/tools.mdx
index d9da1e94f..a6b24d280 100644
--- a/docs/source/en/tutorials/tools.mdx
+++ b/docs/source/en/tutorials/tools.mdx
@@ -1,18 +1,3 @@
-
# Tools
[[open-in-colab]]
@@ -82,7 +67,7 @@ In this case, you can build your tool by subclassing [`Tool`] as described above
### Share your tool to the Hub
-You can share your custom tool to the Hub by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with read access.
+You can share your custom tool to the Hub as a Space repository by calling [`~Tool.push_to_hub`] on the tool. Make sure you've created a repository for it on the Hub and are using a token with write access.
```python
model_downloads_tool.push_to_hub("{your_username}/hf-model-downloads", token="")
@@ -112,7 +97,7 @@ model_download_tool = load_tool(
### Import a Space as a tool
-You can directly import a Space from the Hub as a tool using the [`Tool.from_space`] method!
+You can directly import a Gradio Space from the Hub as a tool using the [`Tool.from_space`] method!
You only need to provide the id of the Space on the Hub, its name, and a description that will help you agent understand what the tool does. Under the hood, this will use [`gradio-client`](https://pypi.org/project/gradio-client/) library to call the Space.
@@ -131,12 +116,12 @@ And voilร , here's your image! ๐๏ธ
-Then you can use this tool just like any other tool. For example, let's improve the prompt `a rabbit wearing a space suit` and generate an image of it. This example also shows how you can pass additional arguments to the agent.
+Then you can use this tool just like any other tool. For example, let's improve the prompt `a rabbit wearing a space suit` and generate an image of it. This example also shows how you can pass additional arguments to the agent.
```python
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
-model = HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct")
+model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")
agent = CodeAgent(tools=[image_generation_tool], model=model)
agent.run(
@@ -182,9 +167,9 @@ You can manage an agent's toolbox by adding or replacing a tool in attribute `ag
Let's add the `model_download_tool` to an existing agent initialized with only the default toolbox.
```python
-from smolagents import HfApiModel
+from smolagents import InferenceClientModel
-model = HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct")
+model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")
agent = CodeAgent(tools=[], model=model, add_base_tools=True)
agent.tools[model_download_tool.name] = model_download_tool
@@ -204,7 +189,7 @@ agent.run(
### Use a collection of tools
-You can leverage tool collections by using the `ToolCollection` object. It supports loading either a collection from the Hub or an MCP server tools.
+You can leverage tool collections by using [`ToolCollection`]. It supports loading either a collection from the Hub or the tools of an MCP server.
#### Tool Collection from a collection in the Hub
@@ -229,19 +214,119 @@ To speed up the start, tools are loaded only if called by the agent.
Leverage tools from the hundreds of MCP servers available on [glama.ai](https://glama.ai/mcp/servers) or [smithery.ai](https://smithery.ai/).
-The MCP servers tools can be loaded in a `ToolCollection` object as follow:
+> [!WARNING]
+> **Security Warning:** Using MCP servers comes with security risks:
+> - **Trust is essential:** Only use MCP servers from trusted sources. Malicious servers can execute harmful code on your machine.
+> - **Stdio-based MCP servers** will always execute code on your machine (that's their intended functionality).
+> - **SSE-based MCP servers:** while remote MCP servers will not be able to execute code on your machine, you should still proceed with caution.
+>
+> Always verify the source and integrity of any MCP server before connecting to it, especially for production environments.
+
+MCP server tools can be loaded with [`ToolCollection.from_mcp`].
+For stdio-based MCP servers, pass the server parameters as an instance of `mcp.StdioServerParameters`:
```py
from smolagents import ToolCollection, CodeAgent
from mcp import StdioServerParameters
server_parameters = StdioServerParameters(
- command="uv",
+ command="uvx",
args=["--quiet", "pubmedmcp@0.1.3"],
env={"UV_PYTHON": "3.12", **os.environ},
)
-with ToolCollection.from_mcp(server_parameters) as tool_collection:
+with ToolCollection.from_mcp(server_parameters, trust_remote_code=True) as tool_collection:
+ agent = CodeAgent(tools=[*tool_collection.tools], model=model, add_base_tools=True)
+ agent.run("Please find a remedy for hangover.")
+```
+
+For SSE-based MCP servers, simply pass a dict with the parameters expected by `mcp.client.sse.sse_client`:
+```py
+from smolagents import ToolCollection, CodeAgent
+
+with ToolCollection.from_mcp({"url": "http://127.0.0.1:8000/sse"}, trust_remote_code=True) as tool_collection:
agent = CodeAgent(tools=[*tool_collection.tools], add_base_tools=True)
agent.run("Please find a remedy for hangover.")
-```
\ No newline at end of file
+```
+
+### Use MCP tools with MCPClient directly
+
+You can also work with MCP tools by using the `MCPClient` directly, which gives you more control over the connection and tool management:
+
+For stdio-based MCP servers:
+```python
+from smolagents import MCPClient, CodeAgent
+from mcp import StdioServerParameters
+import os
+
+server_parameters = StdioServerParameters(
+ command="uvx", # Using uvx ensures dependencies are available
+ args=["--quiet", "pubmedmcp@0.1.3"],
+ env={"UV_PYTHON": "3.12", **os.environ},
+)
+
+with MCPClient(server_parameters) as tools:
+ agent = CodeAgent(tools=tools, model=model, add_base_tools=True)
+ agent.run("Please find the latest research on COVID-19 treatment.")
+```
+
+For SSE-based MCP servers:
+```python
+from smolagents import MCPClient, CodeAgent
+
+with MCPClient({"url": "http://127.0.0.1:8000/sse"}) as tools:
+ agent = CodeAgent(tools=tools, model=model, add_base_tools=True)
+ agent.run("Please find a remedy for hangover.")
+```
+
+You can also manually manage the connection lifecycle with the try...finally pattern:
+
+```python
+from smolagents import MCPClient, CodeAgent
+from mcp import StdioServerParameters
+import os
+
+# Initialize server parameters
+server_parameters = StdioServerParameters(
+ command="uvx",
+ args=["--quiet", "pubmedmcp@0.1.3"],
+ env={"UV_PYTHON": "3.12", **os.environ},
+)
+
+# Manually manage the connection
+try:
+ mcp_client = MCPClient(server_parameters)
+ tools = mcp_client.get_tools()
+
+ # Use the tools with your agent
+ agent = CodeAgent(tools=tools, model=model, add_base_tools=True)
+ result = agent.run("What are the recent therapeutic approaches for Alzheimer's disease?")
+
+ # Process the result as needed
+ print(f"Agent response: {result}")
+finally:
+ # Always ensure the connection is properly closed
+ mcp_client.disconnect()
+```
+
+You can also connect to multiple MCP servers at once by passing a list of server parameters:
+```python
+from smolagents import MCPClient, CodeAgent
+from mcp import StdioServerParameters
+import os
+
+server_params1 = StdioServerParameters(
+ command="uvx",
+ args=["--quiet", "pubmedmcp@0.1.3"],
+ env={"UV_PYTHON": "3.12", **os.environ},
+)
+
+server_params2 = {"url": "http://127.0.0.1:8000/sse"}
+
+with MCPClient([server_params1, server_params2]) as tools:
+ agent = CodeAgent(tools=tools, model=model, add_base_tools=True)
+ agent.run("Please analyze the latest research and suggest remedies for headaches.")
+```
+
+> [!WARNING]
+> **Security Warning:** The same security warnings mentioned for `ToolCollection.from_mcp` apply when using `MCPClient` directly.
diff --git a/docs/source/hi/conceptual_guides/intro_agents.mdx b/docs/source/hi/conceptual_guides/intro_agents.mdx
index 15b93798e..071df435d 100644
--- a/docs/source/hi/conceptual_guides/intro_agents.mdx
+++ b/docs/source/hi/conceptual_guides/intro_agents.mdx
@@ -1,18 +1,3 @@
-
# Agents เคเคพ เคชเคฐเคฟเคเคฏ
## ๐ค Agents เคเฅเคฏเคพ เคนเฅเค?
diff --git a/docs/source/hi/conceptual_guides/react.mdx b/docs/source/hi/conceptual_guides/react.mdx
index 0f17901e8..8c0ce0f27 100644
--- a/docs/source/hi/conceptual_guides/react.mdx
+++ b/docs/source/hi/conceptual_guides/react.mdx
@@ -1,18 +1,3 @@
-
# เคฎเคฒเฅเคเฅ-เคธเฅเคเฅเคช เคเคเฅเคเคเฅเคธ เคเฅเคธเฅ เคเคพเคฎ เคเคฐเคคเฅ เคนเฅเค?
ReAct เคซเฅเคฐเฅเคฎเคตเคฐเฅเค ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) เคตเคฐเฅเคคเคฎเคพเคจ เคฎเฅเค เคเคเฅเคเคเฅเคธ เคฌเคจเคพเคจเฅ เคเคพ เคฎเฅเคเฅเคฏ เคฆเฅเคทเฅเคเคฟเคเฅเคฃ เคนเฅเฅค
diff --git a/docs/source/hi/examples/multiagents.mdx b/docs/source/hi/examples/multiagents.mdx
index 1e9fcc745..7ee85f92d 100644
--- a/docs/source/hi/examples/multiagents.mdx
+++ b/docs/source/hi/examples/multiagents.mdx
@@ -1,18 +1,3 @@
-
# เคฎเคฒเฅเคเฅ-เคเคเฅเคเค เคธเคฟเคธเฅเคเคฎ เคเคพ เคเคฏเฅเคเคจ เคเคฐเฅเค ๐ค๐ค๐ค
[[open-in-colab]]
@@ -54,7 +39,7 @@ from huggingface_hub import login
login()
```
-โก๏ธ เคนเคฎเคพเคฐเคพ เคเคเฅเคเค [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) เคฆเฅเคตเคพเคฐเคพ เคธเคเคเคพเคฒเคฟเคค เคนเฅเคเคพ เคเฅ `HfApiModel` เคเฅเคฒเคพเคธ เคเคพ เคเคชเคฏเฅเค เคเคฐเคคเคพ เคนเฅ เคเฅ HF เคเฅ Inference API เคเคพ เคเคชเคฏเฅเค เคเคฐเคคเคพ เคนเฅ: Inference API เคเคฟเคธเฅ เคญเฅ OS เคฎเฅเคกเคฒ เคเฅ เคเคฒเฅเคฆเฅ เคเคฐ เคเคธเคพเคจเฅ เคธเฅ เคเคฒเคพเคจเฅ เคเฅ เค เคจเฅเคฎเคคเคฟ เคฆเฅเคคเคพ เคนเฅเฅค
+โก๏ธ เคนเคฎเคพเคฐเคพ เคเคเฅเคเค [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) เคฆเฅเคตเคพเคฐเคพ เคธเคเคเคพเคฒเคฟเคค เคนเฅเคเคพ เคเฅ `InferenceClientModel` เคเฅเคฒเคพเคธ เคเคพ เคเคชเคฏเฅเค เคเคฐเคคเคพ เคนเฅ เคเฅ HF เคเฅ Inference API เคเคพ เคเคชเคฏเฅเค เคเคฐเคคเคพ เคนเฅ: Inference API เคเคฟเคธเฅ เคญเฅ OS เคฎเฅเคกเคฒ เคเฅ เคเคฒเฅเคฆเฅ เคเคฐ เคเคธเคพเคจเฅ เคธเฅ เคเคฒเคพเคจเฅ เคเฅ เค เคจเฅเคฎเคคเคฟ เคฆเฅเคคเคพ เคนเฅเฅค
_เคจเฅเค:_ The Inference API เคตเคฟเคญเคฟเคจเฅเคจ เคฎเคพเคจเคฆเคเคกเฅเค เคเฅ เคเคงเคพเคฐ เคชเคฐ เคฎเฅเคกเคฒ เคนเฅเคธเฅเค เคเคฐเคคเคพ เคนเฅ, เคเคฐ เคกเคฟเคชเฅเคฒเฅเคฏ เคเคฟเค เคเค เคฎเฅเคกเคฒ เคฌเคฟเคจเคพ เคชเฅเคฐเฅเคต เคธเฅเคเคจเคพ เคเฅ เค เคชเคกเฅเค เคฏเคพ เคฌเคฆเคฒเฅ เคเคพ เคธเคเคคเฅ เคนเฅเคเฅค เคเคธเคเฅ เคฌเคพเคฐเฅ เคฎเฅเค เค เคงเคฟเค เคเคพเคจเฅเค [เคฏเคนเคพเค](https://huggingface.co/docs/api-inference/supported-models)เฅค
@@ -126,13 +111,13 @@ print(visit_webpage("https://en.wikipedia.org/wiki/Hugging_Face")[:500])
from smolagents import (
CodeAgent,
ToolCallingAgent,
- HfApiModel,
+ InferenceClientModel,
ManagedAgent,
DuckDuckGoSearchTool,
LiteLLMModel,
)
-model = HfApiModel(model_id)
+model = InferenceClientModel(model_id=model_id)
web_agent = ToolCallingAgent(
tools=[DuckDuckGoSearchTool(), visit_webpage],
diff --git a/docs/source/hi/examples/rag.mdx b/docs/source/hi/examples/rag.mdx
index 9e7a0e595..478080d8b 100644
--- a/docs/source/hi/examples/rag.mdx
+++ b/docs/source/hi/examples/rag.mdx
@@ -1,18 +1,3 @@
-
# เคเคเฅเคเคเคฟเค RAG
[[open-in-colab]]
@@ -135,10 +120,10 @@ retriever_tool = RetrieverTool(docs_processed)
_เคจเฅเค:_ Inference API เคตเคฟเคญเคฟเคจเฅเคจ เคฎเคพเคจเคฆเคเคกเฅเค เคเฅ เคเคงเคพเคฐ เคชเคฐ เคฎเฅเคกเคฒ เคนเฅเคธเฅเค เคเคฐเคคเคพ เคนเฅ, เคเคฐ เคกเคฟเคชเฅเคฒเฅเคฏ เคเคฟเค เคเค เคฎเฅเคกเคฒ เคฌเคฟเคจเคพ เคชเฅเคฐเฅเคต เคธเฅเคเคจเคพ เคเฅ เค เคชเคกเฅเค เคฏเคพ เคฌเคฆเคฒเฅ เคเคพ เคธเคเคคเฅ เคนเฅเคเฅค เคเคธเคเฅ เคฌเคพเคฐเฅ เคฎเฅเค เค เคงเคฟเค เคเคพเคจเฅเค [เคฏเคนเคพเค](https://huggingface.co/docs/api-inference/supported-models) เคชเคขเคผเฅเคเฅค
```py
-from smolagents import HfApiModel, CodeAgent
+from smolagents import InferenceClientModel, CodeAgent
agent = CodeAgent(
- tools=[retriever_tool], model=HfApiModel("meta-llama/Llama-3.3-70B-Instruct"), max_steps=4, verbosity_level=2
+ tools=[retriever_tool], model=InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct"), max_steps=4, verbosity_level=2
)
```
diff --git a/docs/source/hi/examples/text_to_sql.mdx b/docs/source/hi/examples/text_to_sql.mdx
index 213821ac8..69fc9820c 100644
--- a/docs/source/hi/examples/text_to_sql.mdx
+++ b/docs/source/hi/examples/text_to_sql.mdx
@@ -1,19 +1,4 @@
-
-# Text-to-SQL
+# Text-to-SQL
[[open-in-colab]]
@@ -125,14 +110,14 @@ def sql_engine(query: str) -> str:
เคนเคฎ `CodeAgent` เคเคพ เคเคชเคฏเฅเค เคเคฐเคคเฅ เคนเฅเค, เคเฅ smolagents เคเคพ เคฎเฅเคเฅเคฏ เคเคเฅเคเค เคเฅเคฒเคพเคธ เคนเฅ: เคเค เคเคเฅเคเค เคเฅ เคเฅเคก เคฎเฅเค เคเคเฅเคถเคจ เคฒเคฟเคเคคเคพ เคนเฅ เคเคฐ ReAct เคซเฅเคฐเฅเคฎเคตเคฐเฅเค เคเฅ เค เคจเฅเคธเคพเคฐ เคชเคฟเคเคฒเฅ เคเคเคเคชเฅเค เคชเคฐ เคชเฅเคจเคฐเคพเคตเฅเคคเฅเคคเคฟ เคเคฐ เคธเคเคคเคพ เคนเฅเฅค
-เคฎเฅเคกเคฒ เคตเคน LLM เคนเฅ เคเฅ เคเคเฅเคเค เคธเคฟเคธเฅเคเคฎ เคเฅ เคธเคเคเคพเคฒเคฟเคค เคเคฐเคคเคพ เคนเฅเฅค `HfApiModel` เคเคชเคเฅ HF เคเฅ Inference API เคเคพ เคเคชเคฏเฅเค เคเคฐเคเฅ LLM เคเฅ เคเฅเคฒ เคเคฐเคจเฅ เคเฅ เค เคจเฅเคฎเคคเคฟ เคฆเฅเคคเคพ เคนเฅ, เคฏเคพ เคคเฅ เคธเคฐเฅเคตเคฐเคฒเฅเคธ เคฏเคพ เคกเฅเคกเคฟเคเฅเคเฅเคก เคเคเคกเคชเฅเคเคเค เคเฅ เคฎเคพเคงเฅเคฏเคฎ เคธเฅ, เคฒเฅเคเคฟเคจ เคเคช เคเคฟเคธเฅ เคญเฅ เคชเฅเคฐเฅเคชเฅเคฐเคพเคเคเคฐเฅ API เคเคพ เคญเฅ เคเคชเคฏเฅเค เคเคฐ เคธเคเคคเฅ เคนเฅเคเฅค
+เคฎเฅเคกเคฒ เคตเคน LLM เคนเฅ เคเฅ เคเคเฅเคเค เคธเคฟเคธเฅเคเคฎ เคเฅ เคธเคเคเคพเคฒเคฟเคค เคเคฐเคคเคพ เคนเฅเฅค `InferenceClientModel` เคเคชเคเฅ HF เคเฅ Inference API เคเคพ เคเคชเคฏเฅเค เคเคฐเคเฅ LLM เคเฅ เคเฅเคฒ เคเคฐเคจเฅ เคเฅ เค เคจเฅเคฎเคคเคฟ เคฆเฅเคคเคพ เคนเฅ, เคฏเคพ เคคเฅ เคธเคฐเฅเคตเคฐเคฒเฅเคธ เคฏเคพ เคกเฅเคกเคฟเคเฅเคเฅเคก เคเคเคกเคชเฅเคเคเค เคเฅ เคฎเคพเคงเฅเคฏเคฎ เคธเฅ, เคฒเฅเคเคฟเคจ เคเคช เคเคฟเคธเฅ เคญเฅ เคชเฅเคฐเฅเคชเฅเคฐเคพเคเคเคฐเฅ API เคเคพ เคญเฅ เคเคชเคฏเฅเค เคเคฐ เคธเคเคคเฅ เคนเฅเคเฅค
```py
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
agent = CodeAgent(
tools=[sql_engine],
- model=HfApiModel("meta-llama/Meta-Llama-3.1-8B-Instruct"),
+ model=InferenceClientModel(model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"),
)
agent.run("Can you give me the name of the client who got the most expensive receipt?")
```
@@ -188,7 +173,7 @@ sql_engine.description = updated_description
agent = CodeAgent(
tools=[sql_engine],
- model=HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct"),
+ model=InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct"),
)
agent.run("Which waiter got more total money from tips?")
diff --git a/docs/source/hi/guided_tour.mdx b/docs/source/hi/guided_tour.mdx
index 745b6643a..1c7f5742e 100644
--- a/docs/source/hi/guided_tour.mdx
+++ b/docs/source/hi/guided_tour.mdx
@@ -1,18 +1,3 @@
-
# Agents - เคเคพเคเคกเฅเคก เคเฅเคฐ
[[open-in-colab]]
@@ -25,7 +10,7 @@ rendered properly in your Markdown viewer.
- `model`, เคเคชเคเฅ เคเคเฅเคเค เคเฅ เคชเคพเคตเคฐ เคฆเฅเคจเฅ เคเฅ เคฒเคฟเค เคเค เคเฅเคเฅเคธเฅเค-เคเคจเคฐเฅเคถเคจ เคฎเฅเคกเคฒ - เคเฅเคฏเฅเคเคเคฟ เคเคเฅเคเค เคเค เคธเคฟเคเคชเคฒ LLM เคธเฅ เค เคฒเค เคนเฅ, เคฏเคน เคเค เคธเคฟเคธเฅเคเคฎ เคนเฅ เคเฅ LLM เคเฅ เค เคชเคจเฅ เคเคเคเคจ เคเฅ เคฐเฅเคช เคฎเฅเค เคเคชเคฏเฅเค เคเคฐเคคเคพ เคนเฅเฅค เคเคช เคเคจเคฎเฅเค เคธเฅ เคเฅเค เคญเฅ เคตเคฟเคเคฒเฅเคช เคเคชเคฏเฅเค เคเคฐ เคธเคเคคเฅ เคนเฅเค:
- [`TransformersModel`] `transformers` เคชเคพเคเคชเคฒเคพเคเคจ เคเฅ เคชเคนเคฒเฅ เคธเฅ เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเคผ เคเคฐเคคเคพ เคนเฅ เคเฅ `transformers` เคเคพ เคเคชเคฏเฅเค เคเคฐเคเฅ เคเคชเคเฅ เคฒเฅเคเคฒ เคฎเคถเฅเคจ เคชเคฐ เคเคจเฅเคซเคฐเฅเคเคธ เคเคฒเคพเคจเฅ เคเฅ เคฒเคฟเค เคนเฅเคคเคพ เคนเฅเฅค
- - [`HfApiModel`] เค เคเคฆเคฐ เคธเฅ `huggingface_hub.InferenceClient` เคเคพ เคฒเคพเคญ เคเค เคพเคคเคพ เคนเฅเฅค
+ - [`InferenceClientModel`] เค เคเคฆเคฐ เคธเฅ `huggingface_hub.InferenceClient` เคเคพ เคฒเคพเคญ เคเค เคพเคคเคพ เคนเฅเฅค
- [`LiteLLMModel`] เคเคชเคเฅ [LiteLLM](https://docs.litellm.ai/) เคเฅ เคฎเคพเคงเฅเคฏเคฎ เคธเฅ 100+ เค เคฒเค-เค เคฒเค เคฎเฅเคกเคฒเฅเคธ เคเฅ เคเฅเคฒ เคเคฐเคจเฅ เคฆเฅเคคเคพ เคนเฅ!
- `tools`, `Tools` เคเฅ เคเค เคฒเคฟเคธเฅเค เคเคฟเคธเฅ เคเคเฅเคเค เคเคพเคธเฅเค เคเฅ เคนเคฒ เคเคฐเคจเฅ เคเฅ เคฒเคฟเค เคเคชเคฏเฅเค เคเคฐ เคธเคเคคเคพ เคนเฅเฅค เคฏเคน เคเค เคเคพเคฒเฅ เคฒเคฟเคธเฅเค เคนเฅ เคธเคเคคเฅ เคนเฅเฅค เคเคช เคเคชเฅเคถเคจเคฒ เคเคฐเฅเคเฅเคฏเฅเคฎเฅเคเค `add_base_tools=True` เคเฅ เคชเคฐเคฟเคญเคพเคทเคฟเคค เคเคฐเคเฅ เค เคชเคจเฅ `tools` เคฒเคฟเคธเฅเค เคเฅ เคเคชเคฐ เคกเคฟเคซเคผเฅเคฒเฅเค เคเฅเคฒเคฌเฅเคเฅเคธ เคญเฅ เคเฅเคกเคผ เคธเคเคคเฅ เคนเฅเคเฅค
@@ -37,14 +22,14 @@ rendered properly in your Markdown viewer.
Hugging Face API เคเฅเคเคจ เคเฅ เคฌเคฟเคจเคพ เคเคชเคฏเฅเค เคเคฐเคจเฅ เคเฅ เคฒเคฟเค เคฎเฅเคซเฅเคค เคนเฅ, เคฒเฅเคเคฟเคจ เคซเคฟเคฐ เคเคธเคฎเฅเค เคฐเฅเค เคฒเคฟเคฎเคฟเคเฅเคถเคจ เคนเฅเคเฅเฅค
-เคเฅเคเฅเคก เคฎเฅเคกเคฒเฅเคธ เคคเค เคชเคนเฅเคเคเคจเฅ เคฏเคพ PRO เค เคเคพเคเคเค เคเฅ เคธเคพเคฅ เค เคชเคจเฅ เคฐเฅเค เคฒเคฟเคฎเคฟเคเฅเคธ เคฌเคขเคผเคพเคจเฅ เคเฅ เคฒเคฟเค, เคเคชเคเฅ เคเคจเคตเคพเคฏเคฐเคจเคฎเฅเคเค เคตเฅเคฐเคฟเคเคฌเคฒ `HF_TOKEN` เคธเฅเค เคเคฐเคจเคพ เคนเฅเคเคพ เคฏเคพ `HfApiModel` เคเฅ เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคชเคฐ `token` เคตเฅเคฐเคฟเคเคฌเคฒ เคชเคพเคธ เคเคฐเคจเคพ เคนเฅเคเคพเฅค
+เคเฅเคเฅเคก เคฎเฅเคกเคฒเฅเคธ เคคเค เคชเคนเฅเคเคเคจเฅ เคฏเคพ PRO เค เคเคพเคเคเค เคเฅ เคธเคพเคฅ เค เคชเคจเฅ เคฐเฅเค เคฒเคฟเคฎเคฟเคเฅเคธ เคฌเคขเคผเคพเคจเฅ เคเฅ เคฒเคฟเค, เคเคชเคเฅ เคเคจเคตเคพเคฏเคฐเคจเคฎเฅเคเค เคตเฅเคฐเคฟเคเคฌเคฒ `HF_TOKEN` เคธเฅเค เคเคฐเคจเคพ เคนเฅเคเคพ เคฏเคพ `InferenceClientModel` เคเฅ เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคชเคฐ `token` เคตเฅเคฐเคฟเคเคฌเคฒ เคชเคพเคธ เคเคฐเคจเคพ เคนเฅเคเคพเฅค
```python
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
model_id = "meta-llama/Llama-3.3-70B-Instruct"
-model = HfApiModel(model_id=model_id, token="")
+model = InferenceClientModel(model_id=model_id, token="")
agent = CodeAgent(tools=[], model=model, add_base_tools=True)
agent.run(
@@ -114,7 +99,7 @@ agent.run(
เคเคช เค เคชเคจเฅ [`CodeAgent`] เคเฅ เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคชเคฐ เคเคฐเฅเคเฅเคฏเฅเคฎเฅเคเค `additional_authorized_imports` เคฎเฅเค เคธเฅเคเฅเคฐเคฟเคเคเฅเคธ เคเฅ เคฒเคฟเคธเฅเค เคเฅ เคฐเฅเคช เคฎเฅเค เค เคคเคฟเคฐเคฟเคเฅเคค เคฎเฅเคกเฅเคฏเฅเคฒเฅเคธ เคเฅ เค เคงเคฟเคเฅเคค เคเคฐ เคธเคเคคเฅ เคนเฅเคเฅค
```py
-model = HfApiModel()
+model = InferenceClientModel()
agent = CodeAgent(tools=[], model=model, additional_authorized_imports=['requests', 'bs4'])
agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
```
@@ -124,7 +109,7 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co
เคเคเฅเคเฅเคเฅเคฏเฅเคถเคจ เคเคฟเคธเฅ เคญเฅ เคเฅเคก เคชเคฐ เคฐเฅเค เคเคพเคเคเคพ เคเฅ เคเค เค เคตเฅเคง เคเคชเคฐเฅเคถเคจ เคเคฐเคจเฅ เคเคพ เคชเฅเคฐเคฏเคพเคธ เคเคฐเคคเคพ เคนเฅ เคฏเคพ เคฏเคฆเคฟ เคเคเฅเคเค เคฆเฅเคตเคพเคฐเคพ เคเคจเคฐเฅเค เคเคฟเค เคเค เคเฅเคก เคฎเฅเค เคเค เคฐเฅเคเฅเคฒเคฐ เคชเคพเคฏเคฅเคจ เคเคฐเคฐ เคนเฅเฅค
-เคเคช [E2B เคเฅเคก เคเคเฅเคเฅเคเฅเคฏเฅเคเคฐ](https://e2b.dev/docs#what-is-e2-b) เคเคพ เคเคชเคฏเฅเค เคฒเฅเคเคฒ เคชเคพเคฏเคฅเคจ เคเคเคเคฐเคชเฅเคฐเฅเคเคฐ เคเฅ เคฌเคเคพเคฏ เคเคฐ เคธเคเคคเฅ เคนเฅเค, เคชเคนเคฒเฅ [`E2B_API_KEY` เคเคจเคตเคพเคฏเคฐเคจเคฎเฅเคเค เคตเฅเคฐเคฟเคเคฌเคฒ เคธเฅเค เคเคฐเคเฅ](https://e2b.dev/dashboard?tab=keys) เคเคฐ เคซเคฟเคฐ เคเคเฅเคเค เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคชเคฐ `use_e2b_executor=True` เคชเคพเคธ เคเคฐเคเฅเฅค
+เคเคช [E2B เคเฅเคก เคเคเฅเคเฅเคเฅเคฏเฅเคเคฐ](https://e2b.dev/docs#what-is-e2-b) เคฏเคพ Docker เคเคพ เคเคชเคฏเฅเค เคฒเฅเคเคฒ เคชเคพเคฏเคฅเคจ เคเคเคเคฐเคชเฅเคฐเฅเคเคฐ เคเฅ เคฌเคเคพเคฏ เคเคฐ เคธเคเคคเฅ เคนเฅเคเฅค E2B เคเฅ เคฒเคฟเค, เคชเคนเคฒเฅ [`E2B_API_KEY` เคเคจเคตเคพเคฏเคฐเคจเคฎเฅเคเค เคตเฅเคฐเคฟเคเคฌเคฒ เคธเฅเค เคเคฐเฅเค](https://e2b.dev/dashboard?tab=keys) เคเคฐ เคซเคฟเคฐ เคเคเฅเคเค เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคชเคฐ `executor_type="e2b"` เคชเคพเคธ เคเคฐเฅเคเฅค Docker เคเฅ เคฒเคฟเค, เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคเฅ เคฆเฅเคฐเคพเคจ `executor_type="docker"` เคชเคพเคธ เคเคฐเฅเคเฅค
> [!TIP]
> เคเฅเคก เคเคเฅเคเฅเคเฅเคฏเฅเคถเคจ เคเฅ เคฌเคพเคฐเฅ เคฎเฅเค เคเคฐ เคเคพเคจเฅเค [เคเคธ เคเฅเคฏเฅเคเฅเคฐเคฟเคฏเคฒ เคฎเฅเค](tutorials/secure_code_execution)เฅค
@@ -158,7 +143,7 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co
### เคกเคฟเคซเคผเฅเคฒเฅเค เคเฅเคฒเคฌเฅเคเฅเคธ
-`smolagents` เคเคเฅเคเคเฅเคธ เคเฅ เคธเคถเคเฅเคค เคฌเคจเคพเคจเฅ เคเฅ เคฒเคฟเค เคเค เคกเคฟเคซเคผเฅเคฒเฅเค เคเฅเคฒเคฌเฅเคเฅเคธ เคเฅ เคธเคพเคฅ เคเคคเคพ เคนเฅ, เคเคฟเคธเฅ เคเคช เคเคฐเฅเคเฅเคฏเฅเคฎเฅเคเค `add_base_tools = True` เคเฅ เคธเคพเคฅ เค เคชเคจเฅ เคเคเฅเคเค เคฎเฅเค เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคชเคฐ เคเฅเคกเคผ เคธเคเคคเฅ เคนเฅเค:
+`smolagents` เคเคเฅเคเคเฅเคธ เคเฅ เคธเคถเคเฅเคค เคฌเคจเคพเคจเฅ เคเฅ เคฒเคฟเค เคเค เคกเคฟเคซเคผเฅเคฒเฅเค เคเฅเคฒเคฌเฅเคเฅเคธ เคเฅ เคธเคพเคฅ เคเคคเคพ เคนเฅ, เคเคฟเคธเฅ เคเคช เคเคฐเฅเคเฅเคฏเฅเคฎเฅเคเค `add_base_tools=True` เคเฅ เคธเคพเคฅ เค เคชเคจเฅ เคเคเฅเคเค เคฎเฅเค เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคชเคฐ เคเฅเคกเคผ เคธเคเคคเฅ เคนเฅเค:
- **DuckDuckGo เคตเฅเคฌ เคธเคฐเฅเค**: DuckDuckGo เคฌเฅเคฐเคพเคเคเคผเคฐ เคเคพ เคเคชเคฏเฅเค เคเคฐเคเฅ เคตเฅเคฌ เคธเคฐเฅเค เคเคฐเคคเคพ เคนเฅเฅค
- **เคชเคพเคฏเคฅเคจ เคเฅเคก เคเคเคเคฐเคชเฅเคฐเฅเคเคฐ**: เคเคชเคเคพ LLM เคเคจเคฐเฅเคเฅเคก เคชเคพเคฏเคฅเคจ เคเฅเคก เคเค เคธเฅเคฐเคเฅเคทเคฟเคค เคเคจเคตเคพเคฏเคฐเคจเคฎเฅเคเค เคฎเฅเค เคเคฒเคพเคคเคพ เคนเฅเฅค เคฏเคน เคเฅเคฒ [`ToolCallingAgent`] เคฎเฅเค เคเฅเคตเคฒ เคคเคญเฅ เคเฅเคกเคผเคพ เคเคพเคเคเคพ เคเคฌ เคเคช เคเคธเฅ `add_base_tools=True` เคเฅ เคธเคพเคฅ เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเคผ เคเคฐเคคเฅ เคนเฅเค, เคเฅเคฏเฅเคเคเคฟ เคเฅเคก-เคฌเฅเคธเฅเคก เคเคเฅเคเค เคชเคนเคฒเฅ เคธเฅ เคนเฅ เคจเฅเคเคฟเคต เคฐเฅเคช เคธเฅ เคชเคพเคฏเคฅเคจ เคเฅเคก เคเคเฅเคเฅเคเฅเคฏเฅเค เคเคฐ เคธเคเคคเคพ เคนเฅ
@@ -250,8 +235,8 @@ class ModelDownloadTool(Tool):
เคเคช เคธเฅเคงเฅ เค เคชเคจเฅ เคเคเฅเคเค เคเฅ เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเคผ เคเคฐ เคธเคเคคเฅ เคนเฅเค:
```py
-from smolagents import CodeAgent, HfApiModel
-agent = CodeAgent(tools=[model_download_tool], model=HfApiModel())
+from smolagents import CodeAgent, InferenceClientModel
+agent = CodeAgent(tools=[model_download_tool], model=InferenceClientModel())
agent.run(
"Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
)
@@ -264,7 +249,7 @@ agent.run(
โ Can you give me the name of the model that has the most downloads in the 'text-to-video' โ
โ task on the Hugging Face Hub? โ
โ โ
-โฐโ HfApiModel - Qwen/Qwen2.5-Coder-32B-Instruct โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ
+โฐโ InferenceClientModel - Qwen/Qwen2.5-Coder-32B-Instruct โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ Step 0 โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โญโ Executing this code: โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ
โ 1 model_name = model_download_tool(task="text-to-video") โ
@@ -301,9 +286,9 @@ Microsoft เคเฅ เคซเฅเคฐเฅเคฎเคตเคฐเฅเค [Autogen](https://huggingface.co/pa
เคฏเคนเคพเค เคเค เคเคเฅเคเค เคฌเคจเคพเคจเฅ เคเคพ เคเคฆเคพเคนเคฐเคฃ เคฆเคฟเคฏเคพ เคเคฏเคพ เคนเฅ เคเฅ เคนเคฎเคพเคฐเฅ [`DuckDuckGoSearchTool`] เคเคพ เคเคชเคฏเฅเค เคเคฐเคเฅ เคเค เคตเคฟเคถเคฟเคทเฅเค เคตเฅเคฌ เคเฅเค เคเคเฅเคเค เคเฅ เคชเฅเคฐเคฌเคเคงเคฟเคค เคเคฐเคคเคพ เคนเฅเฅค
```py
-from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool, ManagedAgent
+from smolagents import CodeAgent, InferenceClientModel, DuckDuckGoSearchTool, ManagedAgent
-model = HfApiModel()
+model = InferenceClientModel()
web_agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model)
@@ -332,14 +317,14 @@ manager_agent.run("Who is the CEO of Hugging Face?")
from smolagents import (
load_tool,
CodeAgent,
- HfApiModel,
+ InferenceClientModel,
GradioUI
)
# Import tool from Hub
image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True)
-model = HfApiModel(model_id)
+model = InferenceClientModel(model_id=model_id)
# Initialize the agent with the image generation tool
agent = CodeAgent(tools=[image_generation_tool], model=model)
diff --git a/docs/source/hi/index.mdx b/docs/source/hi/index.mdx
index 533b3b62d..40c938b55 100644
--- a/docs/source/hi/index.mdx
+++ b/docs/source/hi/index.mdx
@@ -1,18 +1,3 @@
-
-
# `smolagents`
diff --git a/docs/source/hi/reference/agents.mdx b/docs/source/hi/reference/agents.mdx
index 2e070cf03..95e097560 100644
--- a/docs/source/hi/reference/agents.mdx
+++ b/docs/source/hi/reference/agents.mdx
@@ -1,18 +1,3 @@
-
# Agents
@@ -98,12 +83,12 @@ print(model([{"role": "user", "content": "Ok!"}], stop_sequences=["great"]))
[[autodoc]] TransformersModel
-### HfApiModel
+### InferenceClientModel
-`HfApiModel` LLM เคเฅ เคเคเฅเคเฅเคเฅเคฏเฅเคถเคจ เคเฅ เคฒเคฟเค [HF Inference API](https://huggingface.co/docs/api-inference/index) เคเฅเคฒเคพเคเคเค เคเฅ เคฐเฅเคช เคเคฐเคคเคพ เคนเฅเฅค
+`InferenceClientModel` LLM เคเฅ เคเคเฅเคเฅเคเฅเคฏเฅเคถเคจ เคเฅ เคฒเคฟเค [HF Inference API](https://huggingface.co/docs/api-inference/index) เคเฅเคฒเคพเคเคเค เคเฅ เคฐเฅเคช เคเคฐเคคเคพ เคนเฅเฅค
```python
-from smolagents import HfApiModel
+from smolagents import InferenceClientModel
messages = [
{"role": "user", "content": "Hello, how are you?"},
@@ -111,13 +96,13 @@ messages = [
{"role": "user", "content": "No need to help, take it easy."},
]
-model = HfApiModel()
+model = InferenceClientModel()
print(model(messages))
```
```text
>>> Of course! If you change your mind, feel free to reach out. Take care!
```
-[[autodoc]] HfApiModel
+[[autodoc]] InferenceClientModel
### LiteLLMModel
@@ -133,7 +118,7 @@ messages = [
{"role": "user", "content": "No need to help, take it easy."},
]
-model = LiteLLMModel("anthropic/claude-3-5-sonnet-latest", temperature=0.2, max_tokens=10)
+model = LiteLLMModel(model_id="anthropic/claude-3-5-sonnet-latest", temperature=0.2, max_tokens=10)
print(model(messages))
```
diff --git a/docs/source/hi/reference/tools.mdx b/docs/source/hi/reference/tools.mdx
index 6c270321e..d7e0de98c 100644
--- a/docs/source/hi/reference/tools.mdx
+++ b/docs/source/hi/reference/tools.mdx
@@ -1,18 +1,3 @@
-
# Tools
diff --git a/docs/source/hi/tutorials/building_good_agents.mdx b/docs/source/hi/tutorials/building_good_agents.mdx
index 92587ef35..0baa206f6 100644
--- a/docs/source/hi/tutorials/building_good_agents.mdx
+++ b/docs/source/hi/tutorials/building_good_agents.mdx
@@ -1,18 +1,3 @@
-
# เค เคเฅเคเฅ Agents เคเคพ เคจเคฟเคฐเฅเคฎเคพเคฃ
[[open-in-colab]]
@@ -122,11 +107,11 @@ def get_weather_api(location: str, date_time: str) -> str:
```py
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
model_id = "meta-llama/Llama-3.3-70B-Instruct"
-agent = CodeAgent(tools=[], model=HfApiModel(model_id=model_id), add_base_tools=True)
+agent = CodeAgent(tools=[], model=InferenceClientModel(model_id=model_id), add_base_tools=True)
agent.run(
"Why does Mike not know many people in New York?",
@@ -211,13 +196,152 @@ In the end you have to return a final answer using the `final_answer` tool.
Here are a few examples using notional tools:
---
-{examples}
+Task: "Generate an image of the oldest person in this document."
-Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
+Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
+Code:
+```py
+answer = document_qa(document=document, question="Who is the oldest person mentioned?")
+print(answer)
+```
+Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
-{{tool_descriptions}}
+Thought: I will now generate an image showcasing the oldest person.
+Code:
+```py
+image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.")
+final_answer(image)
+```
+
+---
+Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
-{{managed_agents_descriptions}}
+Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
+Code:
+```py
+result = 5 + 3 + 1294.678
+final_answer(result)
+```
+
+---
+Task:
+"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
+You have been provided with these additional arguments, that you can access using the keys as variables in your python code:
+{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}"
+
+Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
+Code:
+```py
+translated_question = translator(question=question, src_lang="French", tgt_lang="English")
+print(f"The translated question is {translated_question}.")
+answer = image_qa(image=image, question=translated_question)
+final_answer(f"The answer is {answer}")
+```
+
+---
+Task:
+In a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer.
+What does he say was the consequence of Einstein learning too much math on his creativity, in one word?
+
+Thought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin.
+Code:
+```py
+pages = search(query="1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein")
+print(pages)
+```
+Observation:
+No result found for query "1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein".
+
+Thought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query.
+Code:
+```py
+pages = search(query="1979 interview Stanislaus Ulam")
+print(pages)
+```
+Observation:
+Found 6 pages:
+[Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/)
+
+[Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/)
+
+(truncated)
+
+Thought: I will read the first 2 pages to know more.
+Code:
+```py
+for url in ["https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/", "https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/"]:
+ whole_page = visit_webpage(url)
+ print(whole_page)
+ print("\n" + "="*80 + "\n") # Print separator between pages
+```
+Observation:
+Manhattan Project Locations:
+Los Alamos, NM
+Stanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at
+(truncated)
+
+Thought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: "He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity." Let's answer in one word.
+Code:
+```py
+final_answer("diminished")
+```
+
+---
+Task: "Which city has the highest population: Guangzhou or Shanghai?"
+
+Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
+Code:
+```py
+for city in ["Guangzhou", "Shanghai"]:
+ print(f"Population {city}:", search(f"{city} population"))
+```
+Observation:
+Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
+Population Shanghai: '26 million (2019)'
+
+Thought: Now I know that Shanghai has the highest population.
+Code:
+```py
+final_answer("Shanghai")
+```
+
+---
+Task: "What is the current age of the pope, raised to the power 0.36?"
+
+Thought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.
+Code:
+```py
+pope_age_wiki = wiki(query="current pope age")
+print("Pope age as per wikipedia:", pope_age_wiki)
+pope_age_search = web_search(query="current pope age")
+print("Pope age as per google search:", pope_age_search)
+```
+Observation:
+Pope age: "The pope Francis is currently 88 years old."
+
+Thought: I know that the pope is 88 years old. Let's compute the result using python code.
+Code:
+```py
+pope_current_age = 88 ** 0.36
+final_answer(pope_current_age)
+```
+
+Above examples were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
+{%- for tool in tools.values() %}
+- {{ tool.name }}: {{ tool.description }}
+ Takes inputs: {{tool.inputs}}
+ Returns an output of type: {{tool.output_type}}
+{%- endfor %}
+
+{%- if managed_agents and managed_agents.values() | list %}
+You can also give tasks to team members.
+Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
+Given that this team member is a real human, you should be very verbose in your task.
+Here is a list of the team members that you can call:
+{%- for agent in managed_agents.values() %}
+- {{ agent.name }}: {{ agent.description }}
+{%- endfor %}
+{%- endif %}
Here are the rules you should always follow to solve your task:
1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```' sequence, else you will fail.
@@ -226,7 +350,7 @@ Here are the rules you should always follow to solve your task:
4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
-7. Never create any notional variables in our code, as having these in your logs might derail you from the true variables.
+7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
@@ -234,11 +358,29 @@ Here are the rules you should always follow to solve your task:
Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
```
-เคเฅเคธเคพ เคเคฟ เคเคช เคฆเฅเค เคธเคเคคเฅ เคนเฅเค, `"{{tool_descriptions}}"` เคเฅเคธเฅ เคชเฅเคฒเฅเคธเคนเฅเคฒเฅเคกเคฐเฅเคธ เคนเฅเค: เคเคจเคเคพ เคเคชเคฏเฅเค เคเคเฅเคเค เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคเฅ เคธเคฎเคฏ เคเฅเคฒเฅเคธ เคฏเคพ เคฎเฅเคจเฅเคเฅเคก เคเคเฅเคเคเฅเคธ เคเฅ เคเฅเค เคธเฅเคตเคเคพเคฒเคฟเคค เคฐเฅเคช เคธเฅ เคเคจเคฐเฅเค เคเคฟเค เคเค เคตเคฟเคตเคฐเคฃเฅเค เคเฅ เคกเคพเคฒเคจเฅ เคเฅ เคฒเคฟเค เคเคฟเคฏเคพ เคเคพเคเคเคพเฅค
+เคเฅเคธเคพ เคเคฟ เคเคช เคฆเฅเค เคธเคเคคเฅ เคนเฅเค, `"{{ tool.description }}"` เคเฅเคธเฅ เคชเฅเคฒเฅเคธเคนเฅเคฒเฅเคกเคฐเฅเคธ เคนเฅเค: เคเคจเคเคพ เคเคชเคฏเฅเค เคเคเฅเคเค เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเฅเคถเคจ เคเฅ เคธเคฎเคฏ เคเฅเคฒเฅเคธ เคฏเคพ เคฎเฅเคจเฅเคเฅเคก เคเคเฅเคเคเฅเคธ เคเฅ เคเฅเค เคธเฅเคตเคเคพเคฒเคฟเคค เคฐเฅเคช เคธเฅ เคเคจเคฐเฅเค เคเคฟเค เคเค เคตเคฟเคตเคฐเคฃเฅเค เคเฅ เคกเคพเคฒเคจเฅ เคเฅ เคฒเคฟเค เคเคฟเคฏเคพ เคเคพเคเคเคพเฅค
เคเคธเคฒเคฟเค เคเคฌเคเคฟ เคเคช `system_prompt` เคชเฅเคฐเคพเคฎเฅเคเคฐ เคฎเฅเค เค เคชเคจเฅ เคเคธเฅเคเคฎ เคชเฅเคฐเฅเคฎเฅเคชเฅเค เคเฅ เคเคฐเฅเคเฅเคฎเฅเคเค เคเฅ เคฐเฅเคช เคฎเฅเค เคชเคพเคธ เคเคฐเคเฅ เคเคธ เคธเคฟเคธเฅเคเคฎ เคชเฅเคฐเฅเคฎเฅเคชเฅเค เคเฅเคฎเฅเคชเคฒเฅเค เคเฅ เคเคตเคฐเคฐเคพเคเค เคเคฐ เคธเคเคคเฅ เคนเฅเค, เคเคชเคเฅ เคจเค เคธเคฟเคธเฅเคเคฎ เคชเฅเคฐเฅเคฎเฅเคชเฅเค เคฎเฅเค เคจเคฟเคฎเฅเคจเคฒเคฟเคเคฟเคค เคชเฅเคฒเฅเคธเคนเฅเคฒเฅเคกเคฐเฅเคธ เคนเฅเคจเฅ เคเคพเคนเคฟเค:
-- เคเฅเคฒ เคตเคฟเคตเคฐเคฃ เคกเคพเคฒเคจเฅ เคเฅ เคฒเคฟเค `"{{tool_descriptions}}"`เฅค
-- เคฏเคฆเคฟ เคเฅเค เคฎเฅเคจเฅเคเฅเคก เคเคเฅเคเคเฅเคธ เคนเฅเค เคคเฅ เคเคจเคเฅ เคฒเคฟเค เคตเคฟเคตเคฐเคฃ เคกเคพเคฒเคจเฅ เคเฅ เคฒเคฟเค `"{{managed_agents_description}}"`เฅค
+- เคเฅเคฒ เคตเคฟเคตเคฐเคฃ เคกเคพเคฒเคจเฅ เคเฅ เคฒเคฟเคเฅค
+ ```
+ {%- for tool in tools.values() %}
+ - {{ tool.name }}: {{ tool.description }}
+ Takes inputs: {{tool.inputs}}
+ Returns an output of type: {{tool.output_type}}
+ {%- endfor %}
+ ```
+- เคฏเคฆเคฟ เคเฅเค เคฎเฅเคจเฅเคเฅเคก เคเคเฅเคเคเฅเคธ เคนเฅเค เคคเฅ เคเคจเคเฅ เคฒเคฟเค เคตเคฟเคตเคฐเคฃ เคกเคพเคฒเคจเฅ เคเฅ เคฒเคฟเคเฅค
+ ```
+ {%- if managed_agents and managed_agents.values() | list %}
+ You can also give tasks to team members.
+ Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
+ Given that this team member is a real human, you should be very verbose in your task.
+ Here is a list of the team members that you can call:
+ {%- for agent in managed_agents.values() %}
+ - {{ agent.name }}: {{ agent.description }}
+ {%- endfor %}
+ {%- endif %}
+ ```
- เคเฅเคตเคฒ `CodeAgent` เคเฅ เคฒเคฟเค: เค เคงเคฟเคเฅเคค เคเคฎเฅเคชเฅเคฐเฅเคเฅเคธ เคเฅ เคธเฅเคเฅ เคกเคพเคฒเคจเฅ เคเฅ เคฒเคฟเค `"{{authorized_imports}}"`เฅค
เคซเคฟเคฐ เคเคช เคธเคฟเคธเฅเคเคฎ เคชเฅเคฐเฅเคฎเฅเคชเฅเค เคเฅ เคจเคฟเคฎเฅเคจเคพเคจเฅเคธเคพเคฐ เคฌเคฆเคฒ เคธเคเคคเฅ เคนเฅเค:
@@ -255,7 +397,7 @@ This also works with the [`ToolCallingAgent`].
เคนเคฎ เคชเฅเคฐเค เคฏเฅเคเคจเคพ เคเคฐเคฃ เคเฅ เคฒเคฟเค เคเค เคฎเฅเคกเคฒ เคชเฅเคฐเคฆเคพเคจ เคเคฐเคคเฅ เคนเฅเค, เคเคฟเคธเฅ เคเคเฅเคเค เคธเคพเคฎเคพเคจเฅเคฏ เคเฅเคฐเคฟเคฏเคพเคเค เคเฅ เคเคฐเคฃเฅเค เคเฅ เคฌเฅเค เคจเคฟเคฏเคฎเคฟเคค เคฐเฅเคช เคธเฅ เคเคฒเคพ เคธเคเคคเคพ เคนเฅเฅค เคเคธ เคเคฐเคฃ เคฎเฅเค เคเฅเค เคเฅเคฒ เคเฅเคฒ เคจเคนเฅเค เคนเฅเคคเฅ เคนเฅ, LLM เคธเฅ เคเฅเคตเคฒ เคเคจ เคคเคฅเฅเคฏเฅเค เคเฅ เคธเฅเคเฅ เคเฅ เค เคชเคกเฅเค เคเคฐเคจเฅ เคเฅ เคฒเคฟเค เคเคนเคพ เคเคพเคคเคพ เคนเฅ เคเฅ เคเคธเฅ เคเฅเคเคพเคค เคนเฅเค เคเคฐ เคเคจ เคคเคฅเฅเคฏเฅเค เคเฅ เคเคงเคพเคฐ เคชเคฐ เคเคธเฅ เค เคเคฒเฅ เคเคฆเคฎเฅเค เคเฅ เคฌเคพเคฐเฅ เคฎเฅเค เคตเคฟเคเคพเคฐ เคเคฐเคจเคพ เคนเฅเคคเคพ เคนเฅเฅค
```py
-from smolagents import load_tool, CodeAgent, HfApiModel, DuckDuckGoSearchTool
+from smolagents import load_tool, CodeAgent, InferenceClientModel, DuckDuckGoSearchTool
from dotenv import load_dotenv
load_dotenv()
@@ -267,7 +409,7 @@ search_tool = DuckDuckGoSearchTool()
agent = CodeAgent(
tools=[search_tool],
- model=HfApiModel("Qwen/Qwen2.5-72B-Instruct"),
+ model=InferenceClientModel(model_id="Qwen/Qwen2.5-72B-Instruct"),
planning_interval=3 # This is where you activate planning!
)
diff --git a/docs/source/hi/tutorials/inspect_runs.mdx b/docs/source/hi/tutorials/inspect_runs.mdx
index 0669c4dcc..127bca148 100644
--- a/docs/source/hi/tutorials/inspect_runs.mdx
+++ b/docs/source/hi/tutorials/inspect_runs.mdx
@@ -1,18 +1,3 @@
-
# OpenTelemetry เคเฅ เคธเคพเคฅ runs เคเคพ เคจเคฟเคฐเฅเคเฅเคทเคฃ
[[open-in-colab]]
@@ -73,10 +58,10 @@ from smolagents import (
ToolCallingAgent,
DuckDuckGoSearchTool,
VisitWebpageTool,
- HfApiModel,
+ InferenceClientModel,
)
-model = HfApiModel()
+model = InferenceClientModel()
managed_agent = ToolCallingAgent(
tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
diff --git a/docs/source/hi/tutorials/secure_code_execution.mdx b/docs/source/hi/tutorials/secure_code_execution.mdx
index ad2cd8c34..73719e842 100644
--- a/docs/source/hi/tutorials/secure_code_execution.mdx
+++ b/docs/source/hi/tutorials/secure_code_execution.mdx
@@ -1,18 +1,3 @@
-
# เคธเฅเคฐเคเฅเคทเคฟเคค เคเฅเคก เคเคเฅเคเฅเคเฅเคฏเฅเคถเคจ
[[open-in-colab]]
@@ -41,7 +26,7 @@ rendered properly in your Markdown viewer.
### เคฒเฅเคเคฒ เคชเคพเคฏเคฅเคจ เคเคเคเคฐเคชเฅเคฐเฅเคเคฐ
เคกเคฟเคซเคผเฅเคฒเฅเค เคฐเฅเคช เคธเฅ, `CodeAgent` LLM-เคเคจเคฐเฅเคเฅเคก เคเฅเคก เคเฅ เคเคชเคเฅ เคเคจเคตเคพเคฏเคฐเคจเคฎเฅเคเค เคฎเฅเค เคเคฒเคพเคคเคพ เคนเฅเฅค
-เคฏเคน เคเคเฅเคเฅเคเฅเคฏเฅเคถเคจ เคตเฅเคจเคฟเคฒเคพ เคชเคพเคฏเคฅเคจ เคเคเคเคฐเคชเฅเคฐเฅเคเคฐ เคฆเฅเคตเคพเคฐเคพ เคจเคนเฅเค เคเคฟเคฏเคพ เคเคพเคคเคพ: เคนเคฎเคจเฅ เคเค เค เคงเคฟเค เคธเฅเคฐเคเฅเคทเคฟเคค `LocalPythonInterpreter` เคเฅ เคถเฅเคฐเฅ เคธเฅ เคซเคฟเคฐ เคธเฅ เคฌเคจเคพเคฏเคพ เคนเฅเฅค
+เคฏเคน เคเคเฅเคเฅเคเฅเคฏเฅเคถเคจ เคตเฅเคจเคฟเคฒเคพ เคชเคพเคฏเคฅเคจ เคเคเคเคฐเคชเฅเคฐเฅเคเคฐ เคฆเฅเคตเคพเคฐเคพ เคจเคนเฅเค เคเคฟเคฏเคพ เคเคพเคคเคพ: เคนเคฎเคจเฅ เคเค เค เคงเคฟเค เคธเฅเคฐเคเฅเคทเคฟเคค `LocalPythonExecutor` เคเฅ เคถเฅเคฐเฅ เคธเฅ เคซเคฟเคฐ เคธเฅ เคฌเคจเคพเคฏเคพ เคนเฅเฅค
เคฏเคน เคเคเคเคฐเคชเฅเคฐเฅเคเคฐ เคธเฅเคฐเคเฅเคทเคพ เคเฅ เคฒเคฟเค เคกเคฟเคเคผเคพเคเคจ เคเคฟเคฏเคพ เคเคฏเคพ เคนเฅ:
- เคเคฎเฅเคชเฅเคฐเฅเคเฅเคธ เคเฅ เคเคชเคฏเฅเคเคเคฐเฅเคคเคพ เคฆเฅเคตเคพเคฐเคพ เคธเฅเคชเคทเฅเค เคฐเฅเคช เคธเฅ เคชเคพเคธ เคเฅ เคเค เคธเฅเคเฅ เคคเค เคธเฅเคฎเคฟเคค เคเคฐเคจเคพ
- เคเคจเคซเคฟเคจเคฟเค เคฒเฅเคชเฅเคธ เคเคฐ เคฐเคฟเคธเฅเคฐเฅเคธ เคฌเฅเคฒเฅเคเคฟเคเค เคเฅ เคฐเฅเคเคจเฅ เคเฅ เคฒเคฟเค เคเคชเคฐเฅเคถเคเคธ เคเฅ เคธเคเคเฅเคฏเคพ เคเฅ เคเฅเคช เคเคฐเคจเคพ
@@ -64,16 +49,16 @@ rendered properly in your Markdown viewer.
เค เคฌ เคเคช เคคเฅเคฏเคพเคฐ เคนเฅเค!
-เคเฅเคก เคเคเฅเคเฅเคเฅเคฏเฅเคเคฐ เคเฅ E2B เคชเคฐ เคธเฅเค เคเคฐเคจเฅ เคเฅ เคฒเคฟเค, เคฌเคธ เค เคชเคจเฅ `CodeAgent` เคเฅ เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเคผ เคเคฐเคคเฅ เคธเคฎเคฏ `use_e2b_executor=True` เคซเฅเคฒเฅเค เคชเคพเคธ เคเคฐเฅเคเฅค
+เคเฅเคก เคเคเฅเคเฅเคเฅเคฏเฅเคเคฐ เคเฅ E2B เคชเคฐ เคธเฅเค เคเคฐเคจเฅ เคเฅ เคฒเคฟเค, เคฌเคธ เค เคชเคจเฅ `CodeAgent` เคเฅ เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเคผ เคเคฐเคคเฅ เคธเคฎเคฏ `executor_type="e2b"` เคซเฅเคฒเฅเค เคชเคพเคธ เคเคฐเฅเคเฅค
เคงเฅเคฏเคพเคจ เคฆเฅเค เคเคฟ เคเคชเคเฅ `additional_authorized_imports` เคฎเฅเค เคธเคญเฅ เคเฅเคฒ เคเฅ เคกเคฟเคชเฅเคเคกเฅเคเคธเฅเคเคผ เคเฅเคกเคผเคจเฅ เคเคพเคนเคฟเค, เคคเคพเคเคฟ เคเคเฅเคเฅเคเฅเคฏเฅเคเคฐ เคเคจเฅเคนเฅเค เคเคเคธเฅเคเฅเคฒ เคเคฐเฅเฅค
```py
-from smolagents import CodeAgent, VisitWebpageTool, HfApiModel
+from smolagents import CodeAgent, VisitWebpageTool, InferenceClientModel
agent = CodeAgent(
tools = [VisitWebpageTool()],
- model=HfApiModel(),
+ model=InferenceClientModel(),
additional_authorized_imports=["requests", "markdownify"],
- use_e2b_executor=True
+ executor_type="e2b"
)
agent.run("What was Abraham Lincoln's preferred pet?")
diff --git a/docs/source/hi/tutorials/tools.mdx b/docs/source/hi/tutorials/tools.mdx
index bb56d7bfc..2695217d2 100644
--- a/docs/source/hi/tutorials/tools.mdx
+++ b/docs/source/hi/tutorials/tools.mdx
@@ -1,18 +1,3 @@
-
# Tools
[[open-in-colab]]
@@ -134,9 +119,9 @@ image_generation_tool("A sunny beach")
เคซเคฟเคฐ เคเคช เคเคธ เคเฅเคฒ เคเคพ เคเคชเคฏเฅเค เคเคฟเคธเฅ เค เคจเฅเคฏ เคเฅเคฒ เคเฅ เคคเคฐเคน เคเคฐ เคธเคเคคเฅ เคนเฅเคเฅค เคเคฆเคพเคนเคฐเคฃ เคเฅ เคฒเคฟเค, เคเคฒเคฟเค เคชเฅเคฐเฅเคฎเฅเคชเฅเค `a rabbit wearing a space suit` เคเฅ เคธเฅเคงเคพเคฐเฅเค เคเคฐ เคเคธเคเฅ เคเค เคเคฎเฅเค เคเคจเคฐเฅเค เคเคฐเฅเคเฅค เคฏเคน เคเคฆเคพเคนเคฐเคฃ เคฏเคน เคญเฅ เคฆเคฟเคเคพเคคเคพ เคนเฅ เคเคฟ เคเคช เคเคเฅเคเค เคเฅ เค เคคเคฟเคฐเคฟเคเฅเคค เคเคฐเฅเคเฅเคฏเฅเคฎเฅเคเคเฅเคธ เคเฅเคธเฅ เคชเคพเคธ เคเคฐ เคธเคเคคเฅ เคนเฅเคเฅค
```python
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
-model = HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct")
+model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")
agent = CodeAgent(tools=[image_generation_tool], model=model)
agent.run(
@@ -182,9 +167,9 @@ agent.run("How many more blocks (also denoted as layers) are in BERT base encode
เคเคฒเคฟเค เคเฅเคตเคฒ เคกเคฟเคซเคผเฅเคฒเฅเค เคเฅเคฒเคฌเฅเคเฅเคธ เคเฅ เคธเคพเคฅ เคเคจเคฟเคถเคฟเคฏเคฒเคพเคเคเคผ เคเคฟเค เคเค เคฎเฅเคเฅเคฆเคพ เคเคเฅเคเค เคฎเฅเค `model_download_tool` เคเฅเคกเคผเฅเคเฅค
```python
-from smolagents import HfApiModel
+from smolagents import InferenceClientModel
-model = HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct")
+model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")
agent = CodeAgent(tools=[], model=model, add_base_tools=True)
agent.tools[model_download_tool.name] = model_download_tool
@@ -241,7 +226,7 @@ server_parameters = StdioServerParameters(
env={"UV_PYTHON": "3.12", **os.environ},
)
-with ToolCollection.from_mcp(server_parameters) as tool_collection:
+with ToolCollection.from_mcp(server_parameters, trust_remote_code=True) as tool_collection:
agent = CodeAgent(tools=[*tool_collection.tools], add_base_tools=True)
agent.run("Please find a remedy for hangover.")
```
\ No newline at end of file
diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml
index 4da8f4859..5ebe325c9 100644
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@@ -8,10 +8,14 @@
sections:
- local: tutorials/building_good_agents
title: โจ ๆๅปบๅฅฝ็จ็ agents
+ - local: tutorials/inspect_runs
+ title: ๐ ็ๆง Agent ็่ฟ่ก
- local: tutorials/tools
title: ๐ ๏ธ ๅทฅๅ ท - ๆทฑๅบฆๆๅ
- local: tutorials/secure_code_execution
title: ๐ก๏ธ ไฝฟ็จ E2B ไฟๆคไฝ ็ไปฃ็ ๆง่ก
+ - local: tutorials/memory
+ title: ๐ ็ฎก็ Agent ็่ฎฐๅฟ
- title: Conceptual guides
sections:
- local: conceptual_guides/intro_agents
@@ -21,14 +25,18 @@
- title: Examples
sections:
- local: examples/text_to_sql
- title: Self-correcting Text-to-SQL
+ title: ่ชๆไฟฎๆญฃ Text-to-SQL
- local: examples/rag
- title: Master you knowledge base with agentic RAG
+ title: ๅๅฉ agentic RAG ๆๆง็ฅ่ฏๅบ
- local: examples/multiagents
- title: Orchestrate a multi-agent system
+ title: ็ผๆ multi-agent ็ณป็ป
+ - local: examples/web_browser
+ title: ๅบไบ่ง่งๆจกๅๆๅปบ่ฝๅคๆต่ง็ฝ้กต็agent
- title: Reference
sections:
- local: reference/agents
title: Agent-related objects
+ - local: reference/models
+ title: Model-related objects
- local: reference/tools
title: Tool-related objects
diff --git a/docs/source/zh/conceptual_guides/intro_agents.mdx b/docs/source/zh/conceptual_guides/intro_agents.mdx
index 416aabcb5..6b09349e4 100644
--- a/docs/source/zh/conceptual_guides/intro_agents.mdx
+++ b/docs/source/zh/conceptual_guides/intro_agents.mdx
@@ -1,19 +1,3 @@
-
-
# Agent ็ฎไป
> [!TIP]
diff --git a/docs/source/zh/conceptual_guides/react.mdx b/docs/source/zh/conceptual_guides/react.mdx
index cdb970728..44760fb0c 100644
--- a/docs/source/zh/conceptual_guides/react.mdx
+++ b/docs/source/zh/conceptual_guides/react.mdx
@@ -1,18 +1,3 @@
-
# ๅคๆญฅ้ชค agent ๆฏๅฆไฝๅทฅไฝ็๏ผ
ReAct ๆกๆถ๏ผ[Yao et al., 2022](https://huggingface.co/papers/2210.03629)๏ผๆฏ็ฎๅๆๅปบ agent ็ไธป่ฆๆนๆณใ
diff --git a/docs/source/zh/examples/multiagents.mdx b/docs/source/zh/examples/multiagents.mdx
index 3b177d133..567e7573f 100644
--- a/docs/source/zh/examples/multiagents.mdx
+++ b/docs/source/zh/examples/multiagents.mdx
@@ -1,18 +1,3 @@
-
# ็ผๆ multi-agent ็ณป็ป ๐ค๐ค๐ค
[[open-in-colab]]
@@ -53,7 +38,7 @@ login()
```
โก๏ธ HF็Inference API ๅฏไปฅๅฟซ้่ฝปๆพๅฐ่ฟ่กไปปไฝๅผๆบๆจกๅ๏ผๅ ๆญคๆไปฌ็agentๅฐไฝฟ็จHF็Inference API
-ไธญ็`HfApiModel`็ฑปๆฅ่ฐ็จ
+ไธญ็`InferenceClientModel`็ฑปๆฅ่ฐ็จ
[Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct)ๆจกๅใ
_Note:_ ๅบไบๅคๅๆฐๅ้จ็ฝฒๆจกๅ็ Inference API ๅฏ่ฝๅจๆฒกๆ้ขๅ ้็ฅ็ๆ ๅตไธๆดๆฐๆๆฟๆขๆจกๅใไบ่งฃๆดๅคไฟกๆฏ๏ผ่ฏทๅ้ [่ฟ้](https://huggingface.co/docs/api-inference/supported-models)ใ
@@ -127,13 +112,13 @@ print(visit_webpage("https://en.wikipedia.org/wiki/Hugging_Face")[:500])
from smolagents import (
CodeAgent,
ToolCallingAgent,
- HfApiModel,
+ InferenceClientModel,
ManagedAgent,
DuckDuckGoSearchTool,
LiteLLMModel,
)
-model = HfApiModel(model_id)
+model = InferenceClientModel(model_id=model_id)
web_agent = ToolCallingAgent(
tools=[DuckDuckGoSearchTool(), visit_webpage],
diff --git a/docs/source/zh/examples/rag.mdx b/docs/source/zh/examples/rag.mdx
index 23efa9e0e..bed9b7fb6 100644
--- a/docs/source/zh/examples/rag.mdx
+++ b/docs/source/zh/examples/rag.mdx
@@ -1,18 +1,3 @@
-
# Agentic RAG
[[open-in-colab]]
@@ -38,7 +23,7 @@ Retrieval-Augmented-Generation (RAG) ๆฏโไฝฟ็จๅคง่ฏญ่จๆจกๅ๏ผLLM๏ผๆฅๅ
!pip install smolagents pandas langchain langchain-community sentence-transformers rank_bm25 --upgrade -q
```
-ไฝ ้่ฆไธไธชๆๆ็ token ไฝไธบ็ฏๅขๅ้ `HF_TOKEN` ๆฅ่ฐ็จ HF Inference APIใๆไปฌไฝฟ็จ python-dotenv ๆฅๅ ่ฝฝๅฎใ
+ไฝ ้่ฆไธไธชๆๆ็ token ไฝไธบ็ฏๅขๅ้ `HF_TOKEN` ๆฅ่ฐ็จ Inference Providersใๆไปฌไฝฟ็จ python-dotenv ๆฅๅ ่ฝฝๅฎใ
```py
from dotenv import load_dotenv
load_dotenv()
@@ -126,10 +111,10 @@ BM25 ๆฃ็ดขๆนๆณๆฏไธไธช็ปๅ ธ็ๆฃ็ดขๆนๆณ๏ผๅ ไธบๅฎ็่ฎพ็ฝฎ้ๅบฆ้ๅธธ
_Note:_ ๆญค Inference API ๆ็ฎกๅบไบๅ็งๆ ๅ็ๆจกๅ๏ผ้จ็ฝฒ็ๆจกๅๅฏ่ฝไผๅจๆฒกๆไบๅ ้็ฅ็ๆ ๅตไธ่ฟ่กๆดๆฐๆๆฟๆขใไบ่งฃๆดๅคไฟกๆฏ๏ผ่ฏท็นๅป[่ฟ้](https://huggingface.co/docs/api-inference/supported-models)ใ
```py
-from smolagents import HfApiModel, CodeAgent
+from smolagents import InferenceClientModel, CodeAgent
agent = CodeAgent(
- tools=[retriever_tool], model=HfApiModel("meta-llama/Llama-3.3-70B-Instruct"), max_steps=4, verbose=True
+ tools=[retriever_tool], model=InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct"), max_steps=4, verbose=True
)
```
diff --git a/docs/source/zh/examples/text_to_sql.mdx b/docs/source/zh/examples/text_to_sql.mdx
index 419c45159..349d31f6f 100644
--- a/docs/source/zh/examples/text_to_sql.mdx
+++ b/docs/source/zh/examples/text_to_sql.mdx
@@ -1,18 +1,3 @@
-
# Text-to-SQL
[[open-in-colab]]
@@ -121,14 +106,14 @@ def sql_engine(query: str) -> str:
ๆไปฌ็ฐๅจไฝฟ็จ่ฟไธชๅทฅๅ ทๆฅๅๅปบไธไธช agentใๆไปฌไฝฟ็จ `CodeAgent`๏ผ่ฟๆฏ smolagent ็ไธป่ฆ agent ็ฑป๏ผไธไธชๅจไปฃ็ ไธญ็ผๅๆไฝๅนถๆ นๆฎ ReAct ๆกๆถ่ฟญไปฃๅ ๅ่พๅบ็ agentใ
-่ฟไธชๆจกๅๆฏ้ฉฑๅจ agent ็ณป็ป็ LLMใ`HfApiModel` ๅ ่ฎธไฝ ไฝฟ็จ HF Inference API ่ฐ็จ LLM๏ผๆ ่ฎบๆฏ้่ฟ Serverless ่ฟๆฏ Dedicated endpoint๏ผไฝไฝ ไนๅฏไปฅไฝฟ็จไปปไฝไธๆ APIใ
+่ฟไธชๆจกๅๆฏ้ฉฑๅจ agent ็ณป็ป็ LLMใ`InferenceClientModel` ๅ ่ฎธไฝ ไฝฟ็จ HF Inference API ่ฐ็จ LLM๏ผๆ ่ฎบๆฏ้่ฟ Serverless ่ฟๆฏ Dedicated endpoint๏ผไฝไฝ ไนๅฏไปฅไฝฟ็จไปปไฝไธๆ APIใ
```py
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
agent = CodeAgent(
tools=[sql_engine],
- model=HfApiModel("meta-llama/Meta-Llama-3.1-8B-Instruct"),
+ model=InferenceClientModel(model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"),
)
agent.run("Can you give me the name of the client who got the most expensive receipt?")
```
@@ -184,7 +169,7 @@ sql_engine.description = updated_description
agent = CodeAgent(
tools=[sql_engine],
- model=HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct"),
+ model=InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct"),
)
agent.run("Which waiter got more total money from tips?")
diff --git a/docs/source/zh/examples/web_browser.mdx b/docs/source/zh/examples/web_browser.mdx
new file mode 100644
index 000000000..cf65225ed
--- /dev/null
+++ b/docs/source/zh/examples/web_browser.mdx
@@ -0,0 +1,214 @@
+# ไฝฟ็จAgentๅฎ็ฐ็ฝ้กตๆต่งๅจ่ชๅจๅ ๐ค๐
+
+[[open-in-colab]]
+
+ๅจๆฌnotebookไธญ๏ผๆไปฌๅฐๅๅปบไธไธช**ๅบไบAgent็็ฝ้กตๆต่งๅจ่ชๅจๅ็ณป็ป**๏ผ่ฏฅ็ณป็ปๅฏไปฅ่ชๅจๅฏผ่ช็ฝ็ซใไธ็ฝ้กตๅ ็ด ไบคไบๅนถๆๅไฟกๆฏใ
+
+่ฏฅAgentๅฐ่ฝๅค๏ผ
+
+- [x] ๅฏผ่ชๅฐ็ฝ้กต
+- [x] ็นๅปๅ ็ด
+- [x] ๅจ้กต้ขๅ ๆ็ดข
+- [x] ๅค็ๅผนๅบ็ชๅฃๅๆจกๆๆก
+- [x] ๆๅไฟกๆฏ
+
+่ฎฉๆไปฌไธๆญฅๆญฅๆญๅปบ่ฟไธช็ณป็ป๏ผ
+
+้ฆๅ ่ฟ่กไปฅไธๅฝไปคๅฎ่ฃ ๆ้ไพ่ต๏ผ
+
+```bash
+pip install smolagents selenium helium pillow -q
+```
+
+่ฎฉๆไปฌๅฏผๅ ฅๆ้็ๅบๅนถ่ฎพ็ฝฎ็ฏๅขๅ้๏ผ
+
+```python
+from io import BytesIO
+from time import sleep
+
+import helium
+from dotenv import load_dotenv
+from PIL import Image
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+
+from smolagents import CodeAgent, tool
+from smolagents.agents import ActionStep
+
+# Load environment variables
+load_dotenv()
+```
+
+็ฐๅจๆไปฌๆฅๅๅปบๆ ธๅฟ็ๆต่งๅจไบคไบๅทฅๅ ท๏ผไฝฟๆไปฌ็Agent่ฝๅคๅฏผ่ชๅนถไธ็ฝ้กตไบคไบ๏ผ
+
+```python
+@tool
+def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
+ """
+ Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
+ Args:
+ text: The text to search for
+ nth_result: Which occurrence to jump to (default: 1)
+ """
+ elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
+ if nth_result > len(elements):
+ raise Exception(f"Match nยฐ{nth_result} not found (only {len(elements)} matches found)")
+ result = f"Found {len(elements)} matches for '{text}'."
+ elem = elements[nth_result - 1]
+ driver.execute_script("arguments[0].scrollIntoView(true);", elem)
+ result += f"Focused on element {nth_result} of {len(elements)}"
+ return result
+
+@tool
+def go_back() -> None:
+ """Goes back to previous page."""
+ driver.back()
+
+@tool
+def close_popups() -> str:
+ """
+ Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
+ This does not work on cookie consent banners.
+ """
+ webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
+```
+
+่ฎฉๆไปฌ้ ็ฝฎไฝฟ็จChromeๆต่งๅจๅนถ่ฎพ็ฝฎๆชๅพๅ่ฝ๏ผ
+
+```python
+# Configure Chrome options
+chrome_options = webdriver.ChromeOptions()
+chrome_options.add_argument("--force-device-scale-factor=1")
+chrome_options.add_argument("--window-size=1000,1350")
+chrome_options.add_argument("--disable-pdf-viewer")
+chrome_options.add_argument("--window-position=0,0")
+
+# Initialize the browser
+driver = helium.start_chrome(headless=False, options=chrome_options)
+
+# Set up screenshot callback
+def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
+ sleep(1.0) # Let JavaScript animations happen before taking the screenshot
+ driver = helium.get_driver()
+ current_step = memory_step.step_number
+ if driver is not None:
+ for previous_memory_step in agent.memory.steps: # Remove previous screenshots for lean processing
+ if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
+ previous_memory_step.observations_images = None
+ png_bytes = driver.get_screenshot_as_png()
+ image = Image.open(BytesIO(png_bytes))
+ print(f"Captured a browser screenshot: {image.size} pixels")
+ memory_step.observations_images = [image.copy()] # Create a copy to ensure it persists
+
+ # Update observations with current URL
+ url_info = f"Current url: {driver.current_url}"
+ memory_step.observations = (
+ url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
+ )
+```
+
+็ฐๅจๆไปฌๆฅๅๅปบ็ฝ้กต่ชๅจๅAgent๏ผ
+
+```python
+from smolagents import InferenceClientModel
+
+# Initialize the model
+model_id = "meta-llama/Llama-3.3-70B-Instruct" # You can change this to your preferred model
+model = InferenceClientModel(model_id=model_id)
+
+# Create the agent
+agent = CodeAgent(
+ tools=[go_back, close_popups, search_item_ctrl_f],
+ model=model,
+ additional_authorized_imports=["helium"],
+ step_callbacks=[save_screenshot],
+ max_steps=20,
+ verbosity_level=2,
+)
+
+# Import helium for the agent
+agent.python_executor("from helium import *", agent.state)
+```
+
+Agent้่ฆ่ทๅพๅ ณไบๅฆไฝไฝฟ็จHelium่ฟ่ก็ฝ้กต่ชๅจๅ็ๆๅฏผใไปฅไธๆฏๆไปฌๅฐๆไพ็ๆไฝ่ฏดๆ๏ผ
+
+```python
+helium_instructions = """
+You can use helium to access websites. Don't bother about the helium driver, it's already managed.
+We've already ran "from helium import *"
+Then you can go to pages!
+Code:
+```py
+go_to('github.com/trending')
+```
+
+You can directly click clickable elements by inputting the text that appears on them.
+Code:
+```py
+click("Top products")
+```
+
+If it's a link:
+Code:
+```py
+click(Link("Top products"))
+```
+
+If you try to interact with an element and it's not found, you'll get a LookupError.
+In general stop your action after each button click to see what happens on your screenshot.
+Never try to login in a page.
+
+To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
+Code:
+```py
+scroll_down(num_pixels=1200) # This will scroll one viewport down
+```
+
+When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
+Just use your built-in tool `close_popups` to close them:
+Code:
+```py
+close_popups()
+```
+
+You can use .exists() to check for the existence of an element. For example:
+Code:
+```py
+if Text('Accept cookies?').exists():
+ click('I accept')
+```
+"""
+```
+
+็ฐๅจๆไปฌๅฏไปฅ่ฟ่กAgentๆง่กไปปๅกไบ๏ผ่ฎฉๆไปฌๅฐ่ฏๅจ็ปดๅบ็พ็งไธๆฅๆพไฟกๆฏ๏ผ
+
+```python
+search_request = """
+Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
+"""
+
+agent_output = agent.run(search_request + helium_instructions)
+print("Final output:")
+print(agent_output)
+```
+
+ๆจๅฏไปฅ้่ฟไฟฎๆน่ฏทๆฑๅๆฐๆง่กไธๅไปปๅกใไพๅฆ๏ผไปฅไธ่ฏทๆฑๅฏๅธฎๅฉๆๅคๆญๆฏๅฆ้่ฆๆดๅ ๅชๅๅทฅไฝ๏ผ
+
+```python
+github_request = """
+I'm trying to find how hard I have to work to get a repo in github.com/trending.
+Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
+"""
+
+agent_output = agent.run(github_request + helium_instructions)
+print("Final output:")
+print(agent_output)
+```
+
+่ฏฅ็ณป็ปๅจไปฅไธไปปๅกไธญๅฐคไธบๆๆ๏ผ
+
+- ไป็ฝ็ซๆๅๆฐๆฎ
+- ็ฝ้กต็ ็ฉถ่ชๅจๅ
+- ็จๆท็้ขๆต่ฏไธ้ช่ฏ
+- ๅ ๅฎน็ๆง
\ No newline at end of file
diff --git a/docs/source/zh/guided_tour.mdx b/docs/source/zh/guided_tour.mdx
index 54ae10419..e851b79b8 100644
--- a/docs/source/zh/guided_tour.mdx
+++ b/docs/source/zh/guided_tour.mdx
@@ -1,18 +1,3 @@
-
# Agents - ๅฏผ่ง
[[open-in-colab]]
@@ -31,26 +16,28 @@ rendered properly in your Markdown viewer.
- `model`๏ผไธไธชไธบๆจ็ agent ๆไพๅจๅ็ๆๆฌ็ๆๆจกๅ - ๅ ไธบ agent ไธ็ฎๅ็ LLM ไธๅ๏ผๅฎๆฏไธไธชไฝฟ็จ LLM ไฝไธบๅผๆ็็ณป็ปใๆจๅฏไปฅไฝฟ็จไปฅไธไปปไธ้้กน๏ผ
- [`TransformersModel`] ไฝฟ็จ้ขๅๅงๅ็ `transformers` ็ฎก้ๅจๆฌๅฐๆบๅจไธ่ฟ่กๆจ็
- - [`HfApiModel`] ๅจๅบๅฑไฝฟ็จ `huggingface_hub.InferenceClient`
+ - [`InferenceClientModel`] ๅจๅบๅฑไฝฟ็จ `huggingface_hub.InferenceClient`
- [`LiteLLMModel`] ่ฎฉๆจ้่ฟ [LiteLLM](https://docs.litellm.ai/) ่ฐ็จ 100+ ไธๅ็ๆจกๅ๏ผ
+ - [`AzureOpenAIServerModel`] ๅ ่ฎธๆจไฝฟ็จ้จ็ฝฒๅจ [Azure](https://azure.microsoft.com/en-us/products/ai-services/openai-service) ไธญ็ OpenAI ๆจกๅใ
+ - [`MLXModel`] ๅฏๅๅปบ [mlx-lm](https://pypi.org/project/mlx-lm/) ๆตๆฐด็บฟ๏ผไปฅไพฟๅจๆฌๅฐๆบๅจไธ่ฟ่กๆจ็ใ
- `tools`๏ผagent ๅฏไปฅ็จๆฅ่งฃๅณไปปๅก็ `Tools` ๅ่กจใๅฎๅฏไปฅๆฏไธไธช็ฉบๅ่กจใๆจ่ฟๅฏไปฅ้่ฟๅฎไนๅฏ้ๅๆฐ `add_base_tools=True` ๅจๆจ็ `tools` ๅ่กจไนไธๆทปๅ ้ป่ฎคๅทฅๅ ท็ฎฑใ
-ไธๆฆๆไบ่ฟไธคไธชๅๆฐ `tools` ๅ `model`๏ผๆจๅฐฑๅฏไปฅๅๅปบไธไธช agent ๅนถ่ฟ่กๅฎใๆจๅฏไปฅไฝฟ็จไปปไฝๆจๅๆฌข็ LLM๏ผๆ ่ฎบๆฏ้่ฟ [Hugging Face API](https://huggingface.co/docs/api-inference/en/index)ใ[transformers](https://github.com/huggingface/transformers/)ใ[ollama](https://ollama.com/)๏ผ่ฟๆฏ [LiteLLM](https://www.litellm.ai/)ใ
+一旦有了这两个参数 `tools` 和 `model`，您就可以创建一个 agent 并运行它。您可以使用任何您喜欢的 LLM，无论是通过 [Hugging Face API](https://huggingface.co/docs/api-inference/en/index)、[transformers](https://github.com/huggingface/transformers/)、[ollama](https://ollama.com/)、[LiteLLM](https://www.litellm.ai/)、[Azure OpenAI](https://azure.microsoft.com/en-us/products/ai-services/openai-service)，还是 [mlx-lm](https://pypi.org/project/mlx-lm/)。
Hugging Face API ๅฏไปฅๅ ่ดนไฝฟ็จ่ๆ ้ token๏ผไฝไผๆ้็้ๅถใ
-่ฆ่ฎฟ้ฎๅ้ๆจกๅๆไฝฟ็จ PRO ่ดฆๆทๆ้ซ้็้ๅถ๏ผๆจ้่ฆ่ฎพ็ฝฎ็ฏๅขๅ้ `HF_TOKEN` ๆๅจๅๅงๅ `HfApiModel` ๆถไผ ้ `token` ๅ้ใ
+่ฆ่ฎฟ้ฎๅ้ๆจกๅๆไฝฟ็จ PRO ่ดฆๆทๆ้ซ้็้ๅถ๏ผๆจ้่ฆ่ฎพ็ฝฎ็ฏๅขๅ้ `HF_TOKEN` ๆๅจๅๅงๅ `InferenceClientModel` ๆถไผ ้ `token` ๅ้ใ
```python
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
model_id = "meta-llama/Llama-3.3-70B-Instruct"
-model = HfApiModel(model_id=model_id, token="")
+model = InferenceClientModel(model_id=model_id, token="")
agent = CodeAgent(tools=[], model=model, add_base_tools=True)
agent.run(
@@ -109,6 +96,62 @@ agent.run(
"Could you give me the 118th number in the Fibonacci sequence?",
)
```
+
+
+
+่ฆ่ฟๆฅๅฐ Azure OpenAI๏ผๆจๅฏไปฅ็ดๆฅไฝฟ็จ `AzureOpenAIServerModel`๏ผๆไฝฟ็จ `LiteLLMModel` ๅนถ่ฟ่ก็ธๅบ้ ็ฝฎใ
+
+ๅๅงๅ `AzureOpenAIServerModel` ๅฎไพๆถ๏ผ้่ฆไผ ้ๆจกๅ้จ็ฝฒๅ็งฐ๏ผๅฏ้ๆฉไปฅไธไปปไธ็งๆนๅผ๏ผ1.ไผ ้ `azure_endpoint`ใ`api_key` ๅ `api_version` ๅๆฐ๏ผ2.่ฎพ็ฝฎ็ฏๅขๅ้ `AZURE_OPENAI_ENDPOINT`ใ`AZURE_OPENAI_API_KEY` ๅ `OPENAI_API_VERSION`
+
+```python
+# !pip install smolagents[openai]
+from smolagents import CodeAgent, AzureOpenAIServerModel
+
+model = AzureOpenAIServerModel(model_id="gpt-4o-mini")
+agent = CodeAgent(tools=[], model=model, add_base_tools=True)
+
+agent.run(
+ "Could you give me the 118th number in the Fibonacci sequence?",
+)
+```
+
+ไนๅฏๆๅฆไธๆนๅผ้ ็ฝฎ `LiteLLMModel` ่ฟๆฅ Azure OpenAI๏ผ
+
+- ๅฐๆจกๅ้จ็ฝฒๅ็งฐไฝไธบ `model_id` ๅๆฐไผ ้๏ผๅนถ็กฎไฟๅ ถๅ็ผไธบ `azure/`
+- ็กฎไฟ่ฎพ็ฝฎ็ฏๅขๅ้ `AZURE_API_VERSION`
+- ไปป้ๅ ถไธ๏ผ1.ไผ ้ `api_base` ๅ `api_key` ๅๆฐ๏ผ2.่ฎพ็ฝฎ็ฏๅขๅ้ `AZURE_API_KEY` ๅ `AZURE_API_BASE`
+
+```python
+import os
+from smolagents import CodeAgent, LiteLLMModel
+
+AZURE_OPENAI_CHAT_DEPLOYMENT_NAME="gpt-35-turbo-16k-deployment" # example of deployment name
+
+os.environ["AZURE_API_KEY"] = "" # api_key
+os.environ["AZURE_API_BASE"] = "" # "https://example-endpoint.openai.azure.com"
+os.environ["AZURE_API_VERSION"] = "" # "2024-10-01-preview"
+
+model = LiteLLMModel(model_id="azure/" + AZURE_OPENAI_CHAT_DEPLOYMENT_NAME)
+agent = CodeAgent(tools=[], model=model, add_base_tools=True)
+
+agent.run(
+ "Could you give me the 118th number in the Fibonacci sequence?",
+)
+```
+
+
+
+
+```python
+# !pip install smolagents[mlx-lm]
+from smolagents import CodeAgent, MLXModel
+
+mlx_model = MLXModel("mlx-community/Qwen2.5-Coder-32B-Instruct-4bit")
+agent = CodeAgent(model=mlx_model, tools=[], add_base_tools=True)
+
+agent.run("Could you give me the 118th number in the Fibonacci sequence?")
+```
+
@@ -125,6 +168,7 @@ Python ่งฃ้ๅจ้ป่ฎคไนไธๅ ่ฎธๅจๅฎๅ จๅ่กจไนๅคๅฏผๅ ฅ๏ผๆไปฅๆๆๆ
```py
from smolagents import CodeAgent
+model = InferenceClientModel()
agent = CodeAgent(tools=[], model=model, additional_authorized_imports=['requests', 'bs4'])
agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?")
```
@@ -134,7 +178,7 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co
ๅฆๆ็ๆ็ไปฃ็ ๅฐ่ฏๆง่ก้ๆณๆไฝๆๅบ็ฐๅธธ่ง Python ้่ฏฏ๏ผๆง่กๅฐๅๆญขใ
-ๆจไนๅฏไปฅไฝฟ็จ [E2B ไปฃ็ ๆง่กๅจ](https://e2b.dev/docs#what-is-e2-b) ่ไธๆฏๆฌๅฐ Python ่งฃ้ๅจ๏ผ้ฆๅ [่ฎพ็ฝฎ `E2B_API_KEY` ็ฏๅขๅ้](https://e2b.dev/dashboard?tab=keys)๏ผ็ถๅๅจๅๅงๅ agent ๆถไผ ้ `use_e2b_executor=True`ใ
+ๆจไนๅฏไปฅไฝฟ็จ [E2B ไปฃ็ ๆง่กๅจ](https://e2b.dev/docs#what-is-e2-b) ๆ Docker ่ไธๆฏๆฌๅฐ Python ่งฃ้ๅจใๅฏนไบ E2B๏ผ้ฆๅ [่ฎพ็ฝฎ `E2B_API_KEY` ็ฏๅขๅ้](https://e2b.dev/dashboard?tab=keys)๏ผ็ถๅๅจๅๅงๅ agent ๆถไผ ้ `executor_type="e2b"`ใๅฏนไบ Docker๏ผๅจๅๅงๅๆถไผ ้ `executor_type="docker"`ใ
> [!TIP]
> ๅจ [่ฏฅๆ็จไธญ](tutorials/secure_code_execution) ไบ่งฃๆดๅคๅ ณไบไปฃ็ ๆง่ก็ๅ ๅฎนใ
@@ -168,7 +212,7 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co
### ้ป่ฎคๅทฅๅ ท็ฎฑ
-`smolagents` ้ๅธฆไบไธไธช็จไบๅขๅผบ agent ็้ป่ฎคๅทฅๅ ท็ฎฑ๏ผๆจๅฏไปฅๅจๅๅงๅๆถ้่ฟๅๆฐ `add_base_tools = True` ๅฐๅ ถๆทปๅ ๅฐๆจ็ agent ไธญ๏ผ
+`smolagents` ้ๅธฆไบไธไธช็จไบๅขๅผบ agent ็้ป่ฎคๅทฅๅ ท็ฎฑ๏ผๆจๅฏไปฅๅจๅๅงๅๆถ้่ฟๅๆฐ `add_base_tools=True` ๅฐๅ ถๆทปๅ ๅฐๆจ็ agent ไธญ๏ผ
- **DuckDuckGo ็ฝ้กตๆ็ดข**๏ผไฝฟ็จ DuckDuckGo ๆต่งๅจๆง่ก็ฝ้กตๆ็ดขใ
- **Python ไปฃ็ ่งฃ้ๅจ**๏ผๅจๅฎๅ จ็ฏๅขไธญ่ฟ่ก LLM ็ๆ็ Python ไปฃ็ ใๅชๆๅจไฝฟ็จ `add_base_tools=True` ๅๅงๅ [`ToolCallingAgent`] ๆถๆไผๆทปๅ ๆญคๅทฅๅ ท๏ผๅ ไธบๅบไบไปฃ็ ็ agent ๅทฒ็ปๅฏไปฅๅ็ๆง่ก Python ไปฃ็
@@ -260,8 +304,8 @@ class ModelDownloadTool(Tool):
็ถๅๆจๅฏไปฅ็ดๆฅๅๅงๅๆจ็ agent๏ผ
```py
-from smolagents import CodeAgent, HfApiModel
-agent = CodeAgent(tools=[model_download_tool], model=HfApiModel())
+from smolagents import CodeAgent, InferenceClientModel
+agent = CodeAgent(tools=[model_download_tool], model=InferenceClientModel())
agent.run(
"Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?"
)
@@ -274,7 +318,7 @@ agent.run(
โ Can you give me the name of the model that has the most downloads in the 'text-to-video' โ
โ task on the Hugging Face Hub? โ
โ โ
-โฐโ HfApiModel - Qwen/Qwen2.5-Coder-32B-Instruct โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ
+โฐโ InferenceClientModel - Qwen/Qwen2.5-Coder-32B-Instruct โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ Step 0 โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โญโ Executing this code: โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ
โ 1 model_name = model_download_tool(task="text-to-video") โ
@@ -311,9 +355,9 @@ Out[20]: 'ByteDance/AnimateDiff-Lightning'
ไปฅไธๆฏไธไธชไฝฟ็จๆไปฌ็ [`DuckDuckGoSearchTool`] ๅถไฝไธไธช็ฎก็็นๅฎ็ฝ้กตๆ็ดข agent ็ agent ็็คบไพ๏ผ
```py
-from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool, ManagedAgent
+from smolagents import CodeAgent, InferenceClientModel, DuckDuckGoSearchTool, ManagedAgent
-model = HfApiModel()
+model = InferenceClientModel()
web_agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model)
@@ -342,14 +386,14 @@ manager_agent.run("Who is the CEO of Hugging Face?")
from smolagents import (
load_tool,
CodeAgent,
- HfApiModel,
+ InferenceClientModel,
GradioUI
)
# ไป Hub ๅฏผๅ ฅๅทฅๅ ท
image_generation_tool = load_tool("m-ric/text-to-image")
-model = HfApiModel(model_id)
+model = InferenceClientModel(model_id=model_id)
# ไฝฟ็จๅพๅ็ๆๅทฅๅ ทๅๅงๅ agent
agent = CodeAgent(tools=[image_generation_tool], model=model)
@@ -364,6 +408,18 @@ GradioUI(agent).launch()
## ไธไธๆญฅ
+ๆๅ๏ผๅฝๆจๆ้้ ็ฝฎๅฅฝagentๅ๏ผๅณๅฏๅฐๅ ถๅไบซ่ณ Hub๏ผ
+
+```py
+agent.push_to_hub("m-ric/my_agent")
+```
+
+็ฑปไผผๅฐ๏ผ่ฅ่ฆๅ ่ฝฝๅทฒๆจ้่ณ Hub ็agent๏ผๅจไฟกไปปๅ ถๅทฅๅ ทไปฃ็ ็ๅๆไธ๏ผๅฏไฝฟ็จ๏ผ
+
+```py
+agent.from_hub("m-ric/my_agent", trust_remote_code=True)
+```
+
่ฆๆดๆทฑๅ ฅๅฐไฝฟ็จ๏ผๆจๅฐ้่ฆๆฅ็ๆไปฌ็ๆ็จ๏ผ
- [ๆไปฌ็ไปฃ็ agent ๅฆไฝๅทฅไฝ็่งฃ้](./tutorials/secure_code_execution)
- [ๆฌๆๅๅ ณไบๅฆไฝๆๅปบๅฅฝ็ agent](./tutorials/building_good_agents)ใ
diff --git a/docs/source/zh/index.mdx b/docs/source/zh/index.mdx
index d79e8090c..08260bb91 100644
--- a/docs/source/zh/index.mdx
+++ b/docs/source/zh/index.mdx
@@ -1,18 +1,3 @@
-
-
# `smolagents`
่ฟๆฏๆๅปบๅผบๅคง agent ็ๆ็ฎๅๆกๆถ๏ผ้กบไพฟ้ฎไธไธ๏ผไปไนๆฏ "agent"๏ผๆไปฌๅจ[ๆญค้กต้ข](conceptual_guides/intro_agents)ๆไพไบๆไปฌ็ๅฎไน๏ผๆจ่ฟๅฏไปฅๆพๅฐๅ ณไบไฝๆถไฝฟ็จๆไธไฝฟ็จๅฎไปฌ็ๅปบ่ฎฎ๏ผๅง้๏ผ้ๅธธไธไฝฟ็จ agent ไผๆดๅฅฝ๏ผใ
diff --git a/docs/source/zh/reference/agents.mdx b/docs/source/zh/reference/agents.mdx
index bd7f3a779..c4fae3c5c 100644
--- a/docs/source/zh/reference/agents.mdx
+++ b/docs/source/zh/reference/agents.mdx
@@ -1,19 +1,3 @@
-
-
# Agents๏ผๆบ่ฝไฝ๏ผ
diff --git a/docs/source/zh/reference/models.mdx b/docs/source/zh/reference/models.mdx
index 79c9e72a4..036334140 100644
--- a/docs/source/zh/reference/models.mdx
+++ b/docs/source/zh/reference/models.mdx
@@ -1,19 +1,3 @@
-
-
# ๆจกๅ
@@ -71,24 +55,24 @@ print(model([{"role": "user", "content": [{"type": "text", "text": "Ok!"}]}], st
[[autodoc]] TransformersModel
-### HfApiModel
+### InferenceClientModel
-`HfApiModel` ๅฐ่ฃ ไบ huggingface_hub ็ [InferenceClient](https://huggingface.co/docs/huggingface_hub/main/en/guides/inference)๏ผ็จไบๆง่ก LLMใๅฎๆฏๆ HF ็ [Inference API](https://huggingface.co/docs/api-inference/index) ไปฅๅ Hub ไธๆๆๅฏ็จ็[Inference Providers](https://huggingface.co/blog/inference-providers)ใ
+`InferenceClientModel` ๅฐ่ฃ ไบ huggingface_hub ็ [InferenceClient](https://huggingface.co/docs/huggingface_hub/main/en/guides/inference)๏ผ็จไบๆง่ก LLMใๅฎๆฏๆ HF ็ [Inference API](https://huggingface.co/docs/api-inference/index) ไปฅๅ Hub ไธๆๆๅฏ็จ็[Inference Providers](https://huggingface.co/blog/inference-providers)ใ
```python
-from smolagents import HfApiModel
+from smolagents import InferenceClientModel
messages = [
{"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]}
]
-model = HfApiModel()
+model = InferenceClientModel()
print(model(messages))
```
```text
>>> Of course! If you change your mind, feel free to reach out. Take care!
```
-[[autodoc]] HfApiModel
+[[autodoc]] InferenceClientModel
### LiteLLMModel
@@ -101,7 +85,7 @@ messages = [
{"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]}
]
-model = LiteLLMModel("anthropic/claude-3-5-sonnet-latest", temperature=0.2, max_tokens=10)
+model = LiteLLMModel(model_id="anthropic/claude-3-5-sonnet-latest", temperature=0.2, max_tokens=10)
print(model(messages))
```
diff --git a/docs/source/zh/reference/tools.mdx b/docs/source/zh/reference/tools.mdx
index 86f19dca4..9306eb322 100644
--- a/docs/source/zh/reference/tools.mdx
+++ b/docs/source/zh/reference/tools.mdx
@@ -1,19 +1,3 @@
-
-
# ๅทฅๅ ท
diff --git a/docs/source/zh/tutorials/building_good_agents.mdx b/docs/source/zh/tutorials/building_good_agents.mdx
index fbf489fae..a70d251ce 100644
--- a/docs/source/zh/tutorials/building_good_agents.mdx
+++ b/docs/source/zh/tutorials/building_good_agents.mdx
@@ -1,18 +1,3 @@
-
# ๆๅปบๅฅฝ็จ็ agent
[[open-in-colab]]
@@ -120,11 +105,11 @@ def get_weather_api(location: str, date_time: str) -> str:
้คไบ็ฎๅ็ไปปๅกๆ่ฟฐๅญ็ฌฆไธฒๅค๏ผไฝ ่ฟๅฏไปฅไฝฟ็จ `additional_args` ๅๆฐไผ ้ไปปไฝ็ฑปๅ็ๅฏน่ฑก๏ผ
```py
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
model_id = "meta-llama/Llama-3.3-70B-Instruct"
-agent = CodeAgent(tools=[], model=HfApiModel(model_id=model_id), add_base_tools=True)
+agent = CodeAgent(tools=[], model=InferenceClientModel(model_id=model_id), add_base_tools=True)
agent.run(
"Why does Mike not know many people in New York?",
@@ -209,13 +194,152 @@ In the end you have to return a final answer using the `final_answer` tool.
Here are a few examples using notional tools:
---
-{examples}
+Task: "Generate an image of the oldest person in this document."
-Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
+Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer.
+Code:
+```py
+answer = document_qa(document=document, question="Who is the oldest person mentioned?")
+print(answer)
+```
+Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
-{{tool_descriptions}}
+Thought: I will now generate an image showcasing the oldest person.
+Code:
+```py
+image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.")
+final_answer(image)
+```
+
+---
+Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
-{{managed_agents_descriptions}}
+Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool
+Code:
+```py
+result = 5 + 3 + 1294.678
+final_answer(result)
+```
+
+---
+Task:
+"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French.
+You have been provided with these additional arguments, that you can access using the keys as variables in your python code:
+{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}"
+
+Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image.
+Code:
+```py
+translated_question = translator(question=question, src_lang="French", tgt_lang="English")
+print(f"The translated question is {translated_question}.")
+answer = image_qa(image=image, question=translated_question)
+final_answer(f"The answer is {answer}")
+```
+
+---
+Task:
+In a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer.
+What does he say was the consequence of Einstein learning too much math on his creativity, in one word?
+
+Thought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin.
+Code:
+```py
+pages = search(query="1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein")
+print(pages)
+```
+Observation:
+No result found for query "1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein".
+
+Thought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query.
+Code:
+```py
+pages = search(query="1979 interview Stanislaus Ulam")
+print(pages)
+```
+Observation:
+Found 6 pages:
+[Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/)
+
+[Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/)
+
+(truncated)
+
+Thought: I will read the first 2 pages to know more.
+Code:
+```py
+for url in ["https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/", "https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/"]:
+ whole_page = visit_webpage(url)
+ print(whole_page)
+ print("\n" + "="*80 + "\n") # Print separator between pages
+```
+Observation:
+Manhattan Project Locations:
+Los Alamos, NM
+Stanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at
+(truncated)
+
+Thought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: "He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity." Let's answer in one word.
+Code:
+```py
+final_answer("diminished")
+```
+
+---
+Task: "Which city has the highest population: Guangzhou or Shanghai?"
+
+Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities.
+Code:
+```py
+for city in ["Guangzhou", "Shanghai"]:
+ print(f"Population {city}:", search(f"{city} population"))
+```
+Observation:
+Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
+Population Shanghai: '26 million (2019)'
+
+Thought: Now I know that Shanghai has the highest population.
+Code:
+```py
+final_answer("Shanghai")
+```
+
+---
+Task: "What is the current age of the pope, raised to the power 0.36?"
+
+Thought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.
+Code:
+```py
+pope_age_wiki = wiki(query="current pope age")
+print("Pope age as per wikipedia:", pope_age_wiki)
+pope_age_search = web_search(query="current pope age")
+print("Pope age as per google search:", pope_age_search)
+```
+Observation:
+Pope age: "The pope Francis is currently 88 years old."
+
+Thought: I know that the pope is 88 years old. Let's compute the result using python code.
+Code:
+```py
+pope_current_age = 88 ** 0.36
+final_answer(pope_current_age)
+```
+
+Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
+{%- for tool in tools.values() %}
+- {{ tool.name }}: {{ tool.description }}
+ Takes inputs: {{tool.inputs}}
+ Returns an output of type: {{tool.output_type}}
+{%- endfor %}
+
+{%- if managed_agents and managed_agents.values() | list %}
+You can also give tasks to team members.
+Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
+Given that this team member is a real human, you should be very verbose in your task.
+Here is a list of the team members that you can call:
+{%- for agent in managed_agents.values() %}
+- {{ agent.name }}: {{ agent.description }}
+{%- endfor %}
+{%- endif %}
Here are the rules you should always follow to solve your task:
1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```' sequence, else you will fail.
@@ -224,7 +348,7 @@ Here are the rules you should always follow to solve your task:
4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
-7. Never create any notional variables in our code, as having these in your logs might derail you from the true variables.
+7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
@@ -232,11 +356,29 @@ Here are the rules you should always follow to solve your task:
Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
```
-ๅฆไฝ ๆ่ง๏ผๆไธไบๅ ไฝ็ฌฆ๏ผๅฆ `"{{tool_descriptions}}"`๏ผ่ฟไบๅฐๅจ agent ๅๅงๅๆถ็จไบๆๅ ฅๆไบ่ชๅจ็ๆ็ๅทฅๅ ทๆ็ฎก็ agent ็ๆ่ฟฐใ
+ๅฆไฝ ๆ่ง๏ผๆไธไบๅ ไฝ็ฌฆ๏ผๅฆ `"{{ tool.description }}"`๏ผ่ฟไบๅฐๅจ agent ๅๅงๅๆถ็จไบๆๅ ฅๆไบ่ชๅจ็ๆ็ๅทฅๅ ทๆ็ฎก็ agent ็ๆ่ฟฐใ
ๅ ๆญค๏ผ่ฝ็ถไฝ ๅฏไปฅ้่ฟๅฐ่ชๅฎไนๆ็คบไฝไธบๅๆฐไผ ้็ป `system_prompt` ๅๆฐๆฅ่ฆ็ๆญค็ณป็ปๆ็คบๆจกๆฟ๏ผไฝไฝ ็ๆฐ็ณป็ปๆ็คบๅฟ ้กปๅ ๅซไปฅไธๅ ไฝ็ฌฆ๏ผ
-- `"{{tool_descriptions}}"` ็จไบๆๅ ฅๅทฅๅ ทๆ่ฟฐใ
-- `"{{managed_agents_description}}"` ็จไบๆๅ ฅ managed agent ็ๆ่ฟฐ๏ผๅฆๆๆ๏ผใ
+- ็จไบๆๅ ฅๅทฅๅ ทๆ่ฟฐใ
+ ```
+ {%- for tool in tools.values() %}
+ - {{ tool.name }}: {{ tool.description }}
+ Takes inputs: {{tool.inputs}}
+ Returns an output of type: {{tool.output_type}}
+ {%- endfor %}
+ ```
+- ็จไบๆๅ ฅ managed agent ็ๆ่ฟฐ๏ผๅฆๆๆ๏ผใ
+ ```
+ {%- if managed_agents and managed_agents.values() | list %}
+ You can also give tasks to team members.
+ Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
+ Given that this team member is a real human, you should be very verbose in your task.
+ Here is a list of the team members that you can call:
+ {%- for agent in managed_agents.values() %}
+ - {{ agent.name }}: {{ agent.description }}
+ {%- endfor %}
+ {%- endif %}
+ ```
- ไป ้ `CodeAgent`๏ผ`"{{authorized_imports}}"` ็จไบๆๅ ฅๆๆๅฏผๅ ฅๅ่กจใ
็ถๅไฝ ๅฏไปฅๆ นๆฎๅฆไธ๏ผๆดๆน็ณป็ปๆ็คบ๏ผ
@@ -253,7 +395,7 @@ agent.prompt_templates["system_prompt"] = agent.prompt_templates["system_prompt"
ๆไปฌๆไพไบไธไธช็จไบ่กฅๅ ่งๅๆญฅ้ชค็ๆจกๅ๏ผagent ๅฏไปฅๅจๆญฃๅธธๆไฝๆญฅ้ชคไน้ดๅฎๆ่ฟ่กใๅจๆญคๆญฅ้ชคไธญ๏ผๆฒกๆๅทฅๅ ท่ฐ็จ๏ผLLM ๅชๆฏ่ขซ่ฆๆฑๆดๆฐๅฎ็ฅ้็ไบๅฎๅ่กจ๏ผๅนถๆ นๆฎ่ฟไบไบๅฎๅๆจๅฎๅบ่ฏฅ้ๅ็ไธไธๆญฅใ
```py
-from smolagents import load_tool, CodeAgent, HfApiModel, DuckDuckGoSearchTool
+from smolagents import load_tool, CodeAgent, InferenceClientModel, DuckDuckGoSearchTool
from dotenv import load_dotenv
load_dotenv()
@@ -265,7 +407,7 @@ search_tool = DuckDuckGoSearchTool()
agent = CodeAgent(
tools=[search_tool],
- model=HfApiModel("Qwen/Qwen2.5-72B-Instruct"),
+ model=InferenceClientModel(model_id="Qwen/Qwen2.5-72B-Instruct"),
planning_interval=3 # ่ฟๆฏไฝ ๆฟๆดป่งๅ็ๅฐๆน๏ผ
)
diff --git a/docs/source/zh/tutorials/inspect_runs.mdx b/docs/source/zh/tutorials/inspect_runs.mdx
new file mode 100644
index 000000000..ea3eb659b
--- /dev/null
+++ b/docs/source/zh/tutorials/inspect_runs.mdx
@@ -0,0 +1,180 @@
+# 使用 OpenTelemetry 检查运行记录
+
+[[open-in-colab]]
+
+> [!TIP]
+> ๅฆๆๆจๆฏๅๆฌกๆๅปบAgent๏ผๅปบ่ฎฎๅ ้ ่ฏป [Agent ๅ ฅ้จๆๅ](../conceptual_guides/intro_agents) ๅ [smolagents ๅฏผ่ง](../guided_tour)ใ
+
+## ไธบไปไน้่ฆ่ฎฐๅฝAgent่ฟ่ก๏ผ
+
+่ฐ่ฏAgent่ฟ่ก่ฟ็จๅ ทๆๆๆๆงใ
+
+้ช่ฏ่ฟ่กๆฏๅฆๆญฃๅธธ่ฟ่กๅพๅฐ้พ๏ผๅ ไธบAgent็ๅทฅไฝๆต็จๆฌ่บซๅ ทๆ [่ฎพ่ฎกไธ็ไธๅฏ้ขๆตๆง](../conceptual_guides/intro_agents)๏ผๅฆๆๅฏ้ขๆต๏ผ็ดๆฅไฝฟ็จไผ ็ปไปฃ็ ๅณๅฏ๏ผใ
+
+ๆฃๆฅ่ฟ่ก่ฎฐๅฝๅๆ ทๅฐ้พ๏ผๅคๆญฅ้ชค็Agentๅพๅพไผๅฟซ้ๅจๆงๅถๅฐ็ๆๅคง้ๆฅๅฟ๏ผ่ๅคงๅคๆฐ้่ฏฏๅชๆฏ"LLM ไฝ็บง้่ฏฏ"็ฑปๅ็้ฎ้ข๏ผ้ๅธธLLMไผๅจๅ็ปญๆญฅ้ชคไธญ้่ฟ็ๆๆดๅฅฝ็ไปฃ็ ๆๅทฅๅ ท่ฐ็จๆฅ่ชๆไฟฎๆญฃใ
+
+ๅ ๆญค๏ผๅจ็ไบง็ฏๅขไธญไฝฟ็จ็ๆงๅทฅๅ ท่ฎฐๅฝAgent่ฟ่ก่ฟ็จ๏ผๅฏนไบๅ็ปญๆฃๆฅๅๅๆ่ณๅ ณ้่ฆ๏ผ
+
+ๆไปฌ้็จ [OpenTelemetry](https://opentelemetry.io/) ๆ ๅๆฅๅฎ็ฐAgent่ฟ่ก็ๆงใ
+
+่ฟๆๅณ็ๆจๅช้ๆทปๅ ๅฐ้็ๆงไปฃ็ ๏ผๅณๅฏๅจๆญฃๅธธ่ฟ่กAgentๆถ่ชๅจ่ฎฐๅฝๆๆไฟกๆฏๅฐ็ๆงๅนณๅฐใไปฅไธๆฏๅจไธๅOpenTelemetryๅ็ซฏๅฎ็ฐๆญคๅ่ฝ็็คบไพ๏ผ
+
+ๅจ็ๆงๅนณๅฐไธ็ๅฑ็คบๆๆๅฆไธ๏ผ
+
+
+
+
+
+
+## 使用 Arize AI Phoenix 配置遥测
+
+้ฆๅ ๅฎ่ฃ ๅฟ ่ฆ็่ฝฏไปถๅ ใ่ฟ้ๆไปฌ้ๆฉๅฎ่ฃ [Arize AI ็ Phoenix](https://github.com/Arize-ai/phoenix) ไฝไธบๆฅๅฟๆถ้ๅๆฃๆฅๆนๆก๏ผๆจไนๅฏไปฅไฝฟ็จๅ ถไปๅ ผๅฎน OpenTelemetry ็ๅนณๅฐๆฅๅฎๆๆถ้ไธๆฃๆฅๅทฅไฝใ
+
+```shell
+pip install 'smolagents[telemetry]'
+```
+
+ๆฅ็ๅจๅๅฐ่ฟ่กๆฅๅฟๆถ้ๅจ๏ผ
+
+```shell
+python -m phoenix.server.main serve
+```
+
+ๆๅ้ ็ฝฎ `SmolagentsInstrumentor` ๆฅ่ฟฝ่ธชAgentๆดปๅจ๏ผๅนถๅฐ่ฟฝ่ธชๆฐๆฎๅ้่ณ Phoenix ้ป่ฎค็ซฏ็น๏ผ
+
+```python
+from phoenix.otel import register
+from openinference.instrumentation.smolagents import SmolagentsInstrumentor
+
+register()
+SmolagentsInstrumentor().instrument()
+```
+
+ๅฎๆไธ่ฟฐ้ ็ฝฎๅ๏ผๅณๅฏๆญฃๅธธ่ฟ่กๆจ็Agent๏ผ
+
+```py
+from smolagents import (
+ CodeAgent,
+ ToolCallingAgent,
+ DuckDuckGoSearchTool,
+ VisitWebpageTool,
+ InferenceClientModel,
+)
+
+model = InferenceClientModel()
+
+search_agent = ToolCallingAgent(
+ tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
+ model=model,
+ name="search_agent",
+ description="This is an agent that can do web search.",
+)
+
+manager_agent = CodeAgent(
+ tools=[],
+ model=model,
+ managed_agents=[search_agent],
+)
+manager_agent.run(
+ "If the US keeps its 2024 growth rate, how many years will it take for the GDP to double?"
+)
+```
+Voilà!
+
+此时访问 `http://0.0.0.0:6006/projects/` 即可查看运行记录：
+
+
+
+ๅฆๅพๆ็คบ๏ผCodeAgent ่ฐ็จไบๅ ถๆ็ฎก็ ToolCallingAgent๏ผๆณจ๏ผๆ็ฎกAgentไนๅฏไปฅๆฏๅฆไธไธช CodeAgent๏ผๆง่ก็พๅฝ2024ๅนด็ปๆตๅข้ฟ็็็ฝ็ปๆ็ดขใๆ็ฎกAgent่ฟๅๆฅๅๅ๏ผ็ฎก็Agentๆ นๆฎ็ปๆ่ฎก็ฎๅบ็ปๆต็ฟปๅๅจๆ๏ผๆฏไธๆฏๅพๆบ่ฝ๏ผ
+
+## 使用 Langfuse 配置遥测
+
+ๆฌ้จๅๆผ็คบๅฆไฝ้่ฟ `SmolagentsInstrumentor` ไฝฟ็จ **Langfuse** ็ๆงๅ่ฐ่ฏ Hugging Face **smolagents**ใ
+
+> **Langfuse ๆฏไปไน๏ผ** [Langfuse](https://langfuse.com) ๆฏ้ขๅLLMๅทฅ็จ็ๅผๆบๅนณๅฐ๏ผๆไพAI Agent็่ฟฝ่ธชไธ็ๆงๅ่ฝ๏ผๅธฎๅฉๅผๅ่ ่ฐ่ฏใๅๆๅไผๅไบงๅใ่ฏฅๅนณๅฐ้่ฟๅ็้ๆใOpenTelemetry ๅ SDKs ไธๅ็ฑปๅทฅๅ ทๆกๆถๅฏนๆฅใ
+
+### 步骤 1: 安装依赖
+
+```python
+%pip install smolagents
+%pip install opentelemetry-sdk opentelemetry-exporter-otlp openinference-instrumentation-smolagents
+```
+
+### 步骤 2: 配置环境变量
+
+่ฎพ็ฝฎ Langfuse API ๅฏ้ฅ๏ผๅนถ้ ็ฝฎ OpenTelemetry ็ซฏ็นๅฐ่ฟฝ่ธชๆฐๆฎๅ้่ณ Langfuseใ้่ฟๆณจๅ [Langfuse Cloud](https://cloud.langfuse.com) ๆ [่ชๆ็ฎก Langfuse](https://langfuse.com/self-hosting) ่ทๅ API ๅฏ้ฅใ
+
+ๅๆถ้ๆทปๅ [Hugging Face ไปค็](https://huggingface.co/settings/tokens) (`HF_TOKEN`) ไฝไธบ็ฏๅขๅ้๏ผ
+```python
+import os
+import base64
+
+LANGFUSE_PUBLIC_KEY="pk-lf-..."
+LANGFUSE_SECRET_KEY="sk-lf-..."
+LANGFUSE_AUTH=base64.b64encode(f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()).decode()
+
+os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://cloud.langfuse.com/api/public/otel" # EU data region
+# os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://us.cloud.langfuse.com/api/public/otel" # US data region
+os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {LANGFUSE_AUTH}"
+
+# your Hugging Face token
+os.environ["HF_TOKEN"] = "hf_..."
+```
+
+### 步骤 3: 初始化 `SmolagentsInstrumentor`
+
+ๅจๅบ็จ็จๅบไปฃ็ ๆง่กๅๅๅงๅ `SmolagentsInstrumentor`ใ้ ็ฝฎ `tracer_provider` ๅนถๆทปๅ span processor ๅฐ่ฟฝ่ธชๆฐๆฎๅฏผๅบ่ณ Langfuseใ`OTLPSpanExporter()` ไผ่ชๅจไฝฟ็จ็ฏๅขๅ้ไธญ้ ็ฝฎ็็ซฏ็นๅ่ฏทๆฑๅคดใ
+
+
+```python
+from opentelemetry.sdk.trace import TracerProvider
+
+from openinference.instrumentation.smolagents import SmolagentsInstrumentor
+from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+
+trace_provider = TracerProvider()
+trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
+
+SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
+```
+
+### 步骤 4: 运行 smolagent
+
+```python
+from smolagents import (
+ CodeAgent,
+ ToolCallingAgent,
+ DuckDuckGoSearchTool,
+ VisitWebpageTool,
+ InferenceClientModel,
+)
+
+model = InferenceClientModel(
+ model_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
+)
+
+search_agent = ToolCallingAgent(
+ tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
+ model=model,
+ name="search_agent",
+ description="This is an agent that can do web search.",
+)
+
+manager_agent = CodeAgent(
+ tools=[],
+ model=model,
+ managed_agents=[search_agent],
+)
+manager_agent.run(
+ "How can Langfuse be used to monitor and improve the reasoning and decision-making of smolagents when they execute multi-step tasks, like dynamically adjusting a recipe based on user feedback or available ingredients?"
+)
+```
+
+### 步骤 5: 在 Langfuse 中查看追踪记录
+
+่ฟ่กAgentๅ๏ผๆจๅฏไปฅๅจ [Langfuse](https://cloud.langfuse.com) ๅนณๅฐๆฅ็ smolagents ๅบ็จ็ๆ็่ฟฝ่ธช่ฎฐๅฝใ่ฟไบ่ฎฐๅฝไผ่ฏฆ็ปๅฑ็คบLLM็ไบคไบๆญฅ้ชค๏ผๅธฎๅฉๆจ่ฐ่ฏๅไผๅAIไปฃ็ใ
+
+
+
+_[Langfuse ๅ ฌๅผ็คบไพ่ฟฝ่ธช](https://cloud.langfuse.com/project/cloramnkj0002jz088vzn1ja4/traces/ce5160f9bfd5a6cd63b07d2bfcec6f54?timestamp=2025-02-11T09%3A25%3A45.163Z&display=details)_
\ No newline at end of file
diff --git a/docs/source/zh/tutorials/memory.mdx b/docs/source/zh/tutorials/memory.mdx
new file mode 100644
index 000000000..de2bdc8c3
--- /dev/null
+++ b/docs/source/zh/tutorials/memory.mdx
@@ -0,0 +1,131 @@
+# ๐ ็ฎก็Agent็่ฎฐๅฟ
+
+[[open-in-colab]]
+
+ๅฝๆ น็ปๅบ๏ผAgentๅฏไปฅๅฎไนไธบ็ฑๅ ไธช็ฎๅ็ปไปถๆๆ๏ผๅฎๆฅๆๅทฅๅ ทใๆ็คบ่ฏใๆ้่ฆ็ๆฏ๏ผๅฎๅ ทๅคๅฏน่ฟๅพๆญฅ้ชค็่ฎฐๅฟ๏ผ่ฝๅค่ฟฝๆบฏๅฎๆด็่งๅใๆง่กๅ้่ฏฏๅๅฒใ
+
+### ๅๆพAgent็่ฎฐๅฟ
+
+ๆไปฌๆไพไบๅค้กนๅ่ฝๆฅๅฎกๆฅAgent็่ฟๅพ่ฟ่ก่ฎฐๅฝใ
+
+ๆจๅฏไปฅ้่ฟๆ่ฃ ๏ผinstrumentation๏ผๅจๅฏ่งๅ็้ขไธญๆฅ็Agent็่ฟ่ก่ฟ็จ๏ผ่ฏฅ็้ขๆฏๆๅฏน็นๅฎๆญฅ้ชค่ฟ่ก็ผฉๆพๆไฝ๏ผๅ ทไฝๆนๆณๅ่ง[ๆ่ฃ ๆๅ](./inspect_runs)ใ
+
+ๆจไนๅฏไปฅไฝฟ็จ`agent.replay()`ๆนๆณๅฎ็ฐๅๆพ๏ผ
+
+ๅฝAgentๅฎๆ่ฟ่กๅ๏ผ
+```py
+from smolagents import InferenceClientModel, CodeAgent
+
+agent = CodeAgent(tools=[], model=InferenceClientModel(), verbosity_level=0)
+
+result = agent.run("What's the 20th Fibonacci number?")
+```
+
+่ฅ่ฆๅๆพๆ่ฟไธๆฌก่ฟ่ก๏ผๅช้ไฝฟ็จ๏ผ
+```py
+agent.replay()
+```
+
+### ๅจๆไฟฎๆนAgent็่ฎฐๅฟ
+
+่ฎธๅค้ซ็บงๅบ็จๅบๆฏ้่ฆๅฏนAgent็่ฎฐๅฟ่ฟ่กๅจๆไฟฎๆนใ
+
+ๆจๅฏไปฅ้่ฟไปฅไธๆนๅผ่ฎฟ้ฎAgent็่ฎฐๅฟ๏ผ
+
+```py
+from smolagents import ActionStep
+
+system_prompt_step = agent.memory.system_prompt
+print("The system prompt given to the agent was:")
+print(system_prompt_step.system_prompt)
+
+task_step = agent.memory.steps[0]
+print("\n\nThe first task step was:")
+print(task_step.task)
+
+for step in agent.memory.steps:
+ if isinstance(step, ActionStep):
+ if step.error is not None:
+ print(f"\nStep {step.step_number} got this error:\n{step.error}\n")
+ else:
+ print(f"\nStep {step.step_number} got these observations:\n{step.observations}\n")
+```
+
+ไฝฟ็จ`agent.memory.get_full_steps()`ๅฏ่ทๅๅฎๆดๆญฅ้ชคๅญๅ ธๆฐๆฎใ
+
+ๆจ่ฟๅฏไปฅ้่ฟๆญฅ้ชคๅ่ฐ๏ผstep callbacks๏ผๅฎ็ฐ่ฎฐๅฟ็ๅจๆไฟฎๆนใ
+
+ๆญฅ้ชคๅ่ฐๅฝๆฐๅฏ้่ฟๅๆฐ็ดๆฅ่ฎฟ้ฎ`agent`ๅฏน่ฑก๏ผๅ ๆญค่ฝๅค่ฎฟ้ฎๆๆ่ฎฐๅฟๆญฅ้ชคๅนถๆ นๆฎ้่ฆ่ฟ่กไฟฎๆนใไพๅฆ๏ผๅ่ฎพๆจๆญฃๅจ็ๆง็ฝ้กตๆต่งAgentๆฏไธชๆญฅ้ชค็ๅฑๅนๆชๅพ๏ผๅธๆไฟ็ๆๆฐๆชๅพๅๆถๅ ้คๆงๆญฅ้ชค็ๅพ็ไปฅ่็tokenๆถ่ใ
+
+ๅฏๅ่ไปฅไธไปฃ็ ็คบไพ๏ผ
+_ๆณจ๏ผๆญคไปฃ็ ็ๆฎตไธๅฎๆด๏ผ้จๅๅฏผๅ ฅ่ฏญๅฅๅๅฏน่ฑกๅฎไนๅทฒ็ฒพ็ฎ๏ผๅฎๆดไปฃ็ ่ฏท่ฎฟ้ฎ[ๅๅง่ๆฌ](https://github.com/huggingface/smolagents/blob/main/src/smolagents/vision_web_browser.py)_
+
+```py
+import helium
+from PIL import Image
+from io import BytesIO
+from time import sleep
+
+def update_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
+ sleep(1.0) # Let JavaScript animations happen before taking the screenshot
+ driver = helium.get_driver()
+ latest_step = memory_step.step_number
+ for previous_memory_step in agent.memory.steps: # Remove previous screenshots from logs for lean processing
+ if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= latest_step - 2:
+ previous_memory_step.observations_images = None
+ png_bytes = driver.get_screenshot_as_png()
+ image = Image.open(BytesIO(png_bytes))
+ memory_step.observations_images = [image.copy()]
+```
+
+ๆๅๅจๅๅงๅAgentๆถ๏ผๅฐๆญคๅฝๆฐไผ ๅ ฅ`step_callbacks`ๅๆฐ๏ผ
+
+```py
+CodeAgent(
+ tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
+ model=model,
+ additional_authorized_imports=["helium"],
+ step_callbacks=[update_screenshot],
+ max_steps=20,
+ verbosity_level=2,
+)
+```
+
+่ฏท่ฎฟ้ฎๆไปฌ็ [vision web browser code](https://github.com/huggingface/smolagents/blob/main/src/smolagents/vision_web_browser.py) ๆฅ็ๅฎๆดๅฏ่ฟ่ก็คบไพใ
+
+### 分步运行 Agents
+
+ๅฝๆจ้่ฆๅค็่ๆถๆฐๅคฉ็ๅทฅๅ ท่ฐ็จๆถ๏ผ่ฟ็งๆนๅผ็นๅซๆ็จ๏ผๆจๅฏไปฅ้ๆญฅๆง่กAgentsใ่ฟ่ฟๅ ่ฎธๆจๅจๆฏไธๆญฅๆดๆฐ่ฎฐๅฟใ
+
+```py
+from smolagents import InferenceClientModel, CodeAgent, ActionStep, TaskStep
+
+agent = CodeAgent(tools=[], model=InferenceClientModel(), verbosity_level=1)
+print(agent.memory.system_prompt)
+
+task = "What is the 20th Fibonacci number?"
+
+# You could modify the memory as needed here by inputting the memory of another agent.
+# agent.memory.steps = previous_agent.memory.steps
+
+# Let's start a new task!
+agent.memory.steps.append(TaskStep(task=task, task_images=[]))
+
+final_answer = None
+step_number = 1
+while final_answer is None and step_number <= 10:
+ memory_step = ActionStep(
+ step_number=step_number,
+ observations_images=[],
+ )
+ # Run one step.
+ final_answer = agent.step(memory_step)
+ agent.memory.steps.append(memory_step)
+ step_number += 1
+
+ # Change the memory as you please!
+ # For instance to update the latest step:
+ # agent.memory.steps[-1] = ...
+
+print("The final answer is:", final_answer)
+```
\ No newline at end of file
diff --git a/docs/source/zh/tutorials/secure_code_execution.mdx b/docs/source/zh/tutorials/secure_code_execution.mdx
index 6017aefb9..93e80986a 100644
--- a/docs/source/zh/tutorials/secure_code_execution.mdx
+++ b/docs/source/zh/tutorials/secure_code_execution.mdx
@@ -1,18 +1,3 @@
-
# ๅฎๅ จไปฃ็ ๆง่ก
[[open-in-colab]]
@@ -41,7 +26,7 @@ rendered properly in your Markdown viewer.
### ๆฌๅฐ Python ่งฃ้ๅจ
้ป่ฎคๆ ๅตไธ๏ผ`CodeAgent` ไผๅจไฝ ็็ฏๅขไธญ่ฟ่ก LLM ็ๆ็ไปฃ็ ใ
-่ฟไธชๆง่กไธๆฏ็ฑๆฎ้็ Python ่งฃ้ๅจๅฎๆ็๏ผๆไปฌไป้ถๅผๅง้ๆฐๆๅปบไบไธไธชๆดๅฎๅ จ็ `LocalPythonInterpreter`ใ
+่ฟไธชๆง่กไธๆฏ็ฑๆฎ้็ Python ่งฃ้ๅจๅฎๆ็๏ผๆไปฌไป้ถๅผๅง้ๆฐๆๅปบไบไธไธชๆดๅฎๅ จ็ `LocalPythonExecutor`ใ
่ฟไธช่งฃ้ๅจ้่ฟไปฅไธๆนๅผ่ฎพ่ฎกไปฅ็กฎไฟๅฎๅ จ๏ผ
- ๅฐๅฏผๅ ฅ้ๅถไธบ็จๆทๆพๅผไผ ้็ๅ่กจ
- ้ๅถๆไฝๆฌกๆฐไปฅ้ฒๆญขๆ ้ๅพช็ฏๅ่ตๆบ่จ่
@@ -64,16 +49,16 @@ rendered properly in your Markdown viewer.
็ฐๅจไฝ ๅทฒ็ปๅๅคๅฅฝไบ๏ผ
-่ฆๅฐไปฃ็ ๆง่กๅจ่ฎพ็ฝฎไธบ E2B๏ผๅช้ๅจๅๅงๅ `CodeAgent` ๆถไผ ้ๆ ๅฟ `use_e2b_executor=True`ใ
+่ฆๅฐไปฃ็ ๆง่กๅจ่ฎพ็ฝฎไธบ E2B๏ผๅช้ๅจๅๅงๅ `CodeAgent` ๆถไผ ้ๆ ๅฟ `executor_type="e2b"`ใ
่ฏทๆณจๆ๏ผไฝ ๅบ่ฏฅๅฐๆๆๅทฅๅ ท็ไพ่ต้กนๆทปๅ ๅฐ `additional_authorized_imports` ไธญ๏ผไปฅไพฟๆง่กๅจๅฎ่ฃ ๅฎไปฌใ
```py
-from smolagents import CodeAgent, VisitWebpageTool, HfApiModel
+from smolagents import CodeAgent, VisitWebpageTool, InferenceClientModel
agent = CodeAgent(
tools = [VisitWebpageTool()],
- model=HfApiModel(),
+ model=InferenceClientModel(),
additional_authorized_imports=["requests", "markdownify"],
- use_e2b_executor=True
+ executor_type="e2b"
)
agent.run("What was Abraham Lincoln's preferred pet?")
diff --git a/docs/source/zh/tutorials/tools.mdx b/docs/source/zh/tutorials/tools.mdx
index e62f6b660..9256bd0a3 100644
--- a/docs/source/zh/tutorials/tools.mdx
+++ b/docs/source/zh/tutorials/tools.mdx
@@ -1,18 +1,3 @@
-
# ๅทฅๅ ท
[[open-in-colab]]
@@ -133,9 +118,9 @@ image_generation_tool("A sunny beach")
็ถๅไฝ ๅฏไปฅๅไฝฟ็จไปปไฝๅ ถไปๅทฅๅ ทไธๆ ทไฝฟ็จ่ฟไธชๅทฅๅ ทใไพๅฆ๏ผ่ฎฉๆไปฌๆน่ฟๆ็คบ `A rabbit wearing a space suit` ๅนถ็ๆๅฎ็ๅพ็ใ
```python
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
-model = HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct")
+model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")
agent = CodeAgent(tools=[image_generation_tool], model=model)
agent.run(
@@ -181,9 +166,9 @@ agent.run("How many more blocks (also denoted as layers) are in BERT base encode
่ฎฉๆไปฌๅฐ `model_download_tool` ๆทปๅ ๅฐไธไธชไป ไฝฟ็จ้ป่ฎคๅทฅๅ ท็ฎฑๅๅงๅ็็ฐๆ agent ไธญใ
```python
-from smolagents import HfApiModel
+from smolagents import InferenceClientModel
-model = HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct")
+model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct")
agent = CodeAgent(tools=[], model=model, add_base_tools=True)
agent.tools[model_download_tool.name] = model_download_tool
diff --git a/e2b.Dockerfile b/e2b.Dockerfile
deleted file mode 100644
index cd6dd29c8..000000000
--- a/e2b.Dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-# You can use most Debian-based base images
-FROM e2bdev/code-interpreter:latest
-
-# Install dependencies and customize sandbox
-RUN pip install git+https://github.com/huggingface/smolagents.git
\ No newline at end of file
diff --git a/examples/agent_from_any_llm.py b/examples/agent_from_any_llm.py
index 86f45effb..bc421274c 100644
--- a/examples/agent_from_any_llm.py
+++ b/examples/agent_from_any_llm.py
@@ -1,18 +1,19 @@
-from typing import Optional
-
-from smolagents import HfApiModel, LiteLLMModel, TransformersModel, tool
+from smolagents import InferenceClientModel, LiteLLMModel, OpenAIServerModel, TransformersModel, tool
from smolagents.agents import CodeAgent, ToolCallingAgent
# Choose which inference type to use!
-available_inferences = ["hf_api", "transformers", "ollama", "litellm"]
-chosen_inference = "transformers"
+available_inferences = ["hf_api", "hf_api_provider", "transformers", "ollama", "litellm", "openai"]
+chosen_inference = "hf_api_provider"
print(f"Chose model: '{chosen_inference}'")
if chosen_inference == "hf_api":
- model = HfApiModel(model_id="meta-llama/Llama-3.3-70B-Instruct")
+ model = InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct")
+
+elif chosen_inference == "hf_api_provider":
+ model = InferenceClientModel(provider="together")
elif chosen_inference == "transformers":
model = TransformersModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct", device_map="auto", max_new_tokens=1000)
@@ -29,9 +30,13 @@
# For anthropic: change model_id below to 'anthropic/claude-3-5-sonnet-latest'
model = LiteLLMModel(model_id="gpt-4o")
+elif chosen_inference == "openai":
+    # OpenAIServerModel targets OpenAI-compatible endpoints; for Anthropic models use the "litellm" option above
+ model = OpenAIServerModel(model_id="gpt-4o")
+
@tool
-def get_weather(location: str, celsius: Optional[bool] = False) -> str:
+def get_weather(location: str, celsius: bool | None = False) -> str:
"""
Get weather in the next days at given location.
Secretly this tool does not care about the location, it hates the weather everywhere.
@@ -43,10 +48,10 @@ def get_weather(location: str, celsius: Optional[bool] = False) -> str:
return "The weather is UNGODLY with torrential rains and temperatures below -10ยฐC"
-agent = ToolCallingAgent(tools=[get_weather], model=model)
+agent = ToolCallingAgent(tools=[get_weather], model=model, verbosity_level=2)
print("ToolCallingAgent:", agent.run("What's the weather like in Paris?"))
-agent = CodeAgent(tools=[get_weather], model=model)
+agent = CodeAgent(tools=[get_weather], model=model, verbosity_level=2)
print("CodeAgent:", agent.run("What's the weather like in Paris?"))
diff --git a/examples/benchmark.ipynb b/examples/benchmark.ipynb
deleted file mode 100644
index 79f0ae0a1..000000000
--- a/examples/benchmark.ipynb
+++ /dev/null
@@ -1,1195 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
- ]
- }
- ],
- "source": [
- "!pip install -e .. datasets sympy numpy matplotlib seaborn -q # Install dev version of smolagents + some packages"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Constants and utilities/tools"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Benchmark date\n",
- "# - set a concrete date:\n",
- "DATE = \"2024-12-26\"\n",
- "# - or use default: today\n",
- "# DATE = None\n",
- "\n",
- "# Evaluation dataset\n",
- "# - the dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents-benchmark/benchmark-v1\n",
- "EVAL_DATASET = \"smolagents-benchmark/benchmark-v1\"\n",
- "\n",
- "# Answers dataset: it must be a gated dataset; required to score the answers\n",
- "ANSWERS_DATASET = \"smolagents-benchmark/answers\"\n",
- "# Whether to push the answers dataset to the Hub\n",
- "PUSH_ANSWERS_DATASET_TO_HUB = True\n",
- "\n",
- "# Results dataset\n",
- "RESULTS_DATASET = \"smolagents-benchmark/results\"\n",
- "# Whether to push the results dataset to the Hub\n",
- "PUSH_RESULTS_DATASET_TO_HUB = True\n",
- "\n",
- "\n",
- "import datetime\n",
- "import json\n",
- "import os\n",
- "import re\n",
- "import string\n",
- "import time\n",
- "import warnings\n",
- "from typing import List\n",
- "\n",
- "import datasets\n",
- "from dotenv import load_dotenv\n",
- "from tqdm import tqdm\n",
- "\n",
- "from smolagents import (\n",
- " AgentError,\n",
- " CodeAgent,\n",
- " GoogleSearchTool,\n",
- " HfApiModel,\n",
- " PythonInterpreterTool,\n",
- " ToolCallingAgent,\n",
- " VisitWebpageTool,\n",
- ")\n",
- "from smolagents.agents import ActionStep\n",
- "\n",
- "\n",
- "load_dotenv()\n",
- "os.makedirs(\"output\", exist_ok=True)\n",
- "\n",
- "\n",
- "def serialize_agent_error(obj):\n",
- " if isinstance(obj, AgentError):\n",
- " return {\"error_type\": obj.__class__.__name__, \"message\": obj.message}\n",
- " else:\n",
- " return str(obj)\n",
- "\n",
- "\n",
- "def answer_questions(\n",
- " eval_ds,\n",
- " agent,\n",
- " model_id,\n",
- " action_type,\n",
- " is_vanilla_llm=False,\n",
- " date=DATE,\n",
- " output_dir=\"output\",\n",
- " push_to_hub_dataset=ANSWERS_DATASET if PUSH_ANSWERS_DATASET_TO_HUB else None,\n",
- "):\n",
- " date = date or datetime.date.today().isoformat()\n",
- "\n",
- " for task in eval_ds:\n",
- " file_name = f\"output/{model_id.replace('/', '__')}__{action_type}__{task}__{date}.jsonl\"\n",
- " answered_questions = []\n",
- " if os.path.exists(file_name):\n",
- " with open(file_name, \"r\") as f:\n",
- " for line in f:\n",
- " answered_questions.append(json.loads(line)[\"question\"])\n",
- "\n",
- " for _, example in tqdm(enumerate(eval_ds[task]), total=len(eval_ds[task])):\n",
- " try:\n",
- " question = example[\"question\"]\n",
- " if example[\"source\"] == \"SimpleQA\":\n",
- " question += \" Answer with only the final number.\"\n",
- " if example[\"source\"] == \"MATH\":\n",
- " question += \" Write code, not latex.\"\n",
- " if question in answered_questions:\n",
- " continue\n",
- " start_time = time.time()\n",
- "\n",
- " if is_vanilla_llm:\n",
- " llm = agent\n",
- " answer = str(llm([{\"role\": \"user\", \"content\": question}]).content)\n",
- " token_count = {\n",
- " \"input\": llm.last_input_token_count,\n",
- " \"output\": llm.last_output_token_count,\n",
- " }\n",
- " intermediate_steps = str([])\n",
- " else:\n",
- " answer = str(agent.run(question))\n",
- " token_count = agent.monitor.get_total_token_counts()\n",
- " intermediate_steps = str(agent.logs)\n",
- " # Remove memory from logs to make them more compact.\n",
- " for step in agent.logs:\n",
- " if isinstance(step, ActionStep):\n",
- " step.agent_memory = None\n",
- "\n",
- " end_time = time.time()\n",
- " annotated_example = {\n",
- " \"model_id\": model_id,\n",
- " \"agent_action_type\": action_type,\n",
- " \"question\": question,\n",
- " \"answer\": answer,\n",
- " \"true_answer\": example[\"true_answer\"],\n",
- " \"source\": example[\"source\"],\n",
- " \"intermediate_steps\": intermediate_steps,\n",
- " \"start_time\": start_time,\n",
- " \"end_time\": end_time,\n",
- " \"token_counts\": token_count,\n",
- " }\n",
- "\n",
- " with open(file_name, \"a\") as f:\n",
- " json.dump(annotated_example, f, default=serialize_agent_error)\n",
- " f.write(\"\\n\") # add a newline for JSONL format\n",
- " except Exception as e:\n",
- " print(\"Failed:\", e)\n",
- "\n",
- " if push_to_hub_dataset:\n",
- " ds = datasets.Dataset.from_pandas(pd.read_json(file_name, lines=True), split=\"test\", preserve_index=False)\n",
- " config = f\"{model_id.replace('/', '__')}__{action_type}__{task}\"\n",
- " data_dir = f\"{model_id}/{action_type}/{task}/{date}\"\n",
- " ds.push_to_hub(\n",
- " push_to_hub_dataset,\n",
- " config_name=config,\n",
- " data_dir=data_dir,\n",
- " split=\"test\",\n",
- " commit_message=f\"Upload {config}\",\n",
- " )\n",
- "\n",
- "\n",
- "def normalize_number_str(number_str: str) -> float:\n",
- " # we replace these common units and commas to allow\n",
- " # conversion to float\n",
- " for char in [\"$\", \"%\", \",\"]:\n",
- " number_str = number_str.replace(char, \"\")\n",
- " try:\n",
- " return float(number_str)\n",
- " except ValueError:\n",
- " return float(\"inf\")\n",
- "\n",
- "\n",
- "def split_string(\n",
- " s: str,\n",
- " char_list: list[str] = [\",\", \";\"],\n",
- ") -> list[str]:\n",
- " pattern = f\"[{''.join(char_list)}]\"\n",
- " return re.split(pattern, s)\n",
- "\n",
- "\n",
- "def is_float(element: any) -> bool:\n",
- " try:\n",
- " float(element)\n",
- " return True\n",
- " except ValueError:\n",
- " return False\n",
- "\n",
- "\n",
- "def normalize_str(input_str, remove_punct=True) -> str:\n",
- " \"\"\"\n",
- " Normalize a string by:\n",
- " - Removing all white spaces\n",
- " - Optionally removing punctuation (if remove_punct is True)\n",
- " - Converting to lowercase\n",
- " Parameters:\n",
- " - input_str: str, the string to normalize\n",
- " - remove_punct: bool, whether to remove punctuation (default: True)\n",
- " Returns:\n",
- " - str, the normalized string\n",
- " \"\"\"\n",
- " # Remove all white spaces. Required e.g for seagull vs. sea gull\n",
- " no_spaces = re.sub(r\"\\s\", \"\", input_str)\n",
- "\n",
- " # Remove punctuation, if specified.\n",
- " if remove_punct:\n",
- " translator = str.maketrans(\"\", \"\", string.punctuation)\n",
- " return no_spaces.lower().translate(translator)\n",
- " else:\n",
- " return no_spaces.lower()\n",
- "\n",
- "\n",
- "def extract_numbers(text: str) -> List[str]:\n",
- " \"\"\"This pattern matches:\n",
- " - Optional negative sign\n",
- " - Numbers with optional comma thousand separators\n",
- " - Optional decimal points with decimal numbers\n",
- " \"\"\"\n",
- " pattern = r\"-?(?:\\d{1,3}(?:,\\d{3})+|\\d+)(?:\\.\\d+)?\"\n",
- "\n",
- " return [el.replace(\",\", \"\") for el in re.findall(pattern, text)]\n",
- "\n",
- "\n",
- "def get_question_score_gaia(\n",
- " model_answer: str,\n",
- " ground_truth: str,\n",
- ") -> bool:\n",
- " \"\"\"Scoring function used to score functions from the GAIA benchmark\"\"\"\n",
- " if is_float(ground_truth):\n",
- " normalized_answer = normalize_number_str(str(model_answer))\n",
- " return normalized_answer == float(ground_truth)\n",
- "\n",
- " elif any(char in ground_truth for char in [\",\", \";\"]): # if gt is a list\n",
- " # question with the fish: normalization removes punct\n",
- " gt_elems = split_string(ground_truth)\n",
- " ma_elems = split_string(model_answer)\n",
- "\n",
- " if len(gt_elems) != len(ma_elems): # check length is the same\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- " return False\n",
- "\n",
- " comparisons = []\n",
- " for ma_elem, gt_elem in zip(ma_elems, gt_elems): # compare each element as float or str\n",
- " if is_float(gt_elem):\n",
- " normalized_ma_elem = normalize_number_str(ma_elem)\n",
- " comparisons.append(normalized_ma_elem == float(gt_elem))\n",
- " else:\n",
- " # we do not remove punct since comparisons can include punct\n",
- " comparisons.append(\n",
- " normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)\n",
- " )\n",
- " return all(comparisons)\n",
- "\n",
- " else: # if gt is a str\n",
- " return normalize_str(model_answer) == normalize_str(ground_truth)\n",
- "\n",
- "\n",
- "def get_correct(row):\n",
- " if row[\"source\"] == \"MATH\": # Checks the last number in answer\n",
- " numbers_answer = extract_numbers(str(row[\"answer\"]))\n",
- " if len(numbers_answer) == 0:\n",
- " return False\n",
- " return float(numbers_answer[-1]) == float(row[\"true_answer\"])\n",
- " else:\n",
- " return get_question_score_gaia(str(row[\"answer\"]), str(row[\"true_answer\"]))\n",
- "\n",
- "\n",
- "def score_answers(\n",
- " answers_subsets,\n",
- " answers_dataset=ANSWERS_DATASET,\n",
- " date=DATE,\n",
- " push_to_hub_dataset=RESULTS_DATASET if PUSH_RESULTS_DATASET_TO_HUB else None,\n",
- " set_default=True,\n",
- "):\n",
- " if not answers_dataset:\n",
- " raise ValueError(\"Pass 'answers_dataset' to load the answers from it\")\n",
- " date = date or datetime.date.today().isoformat()\n",
- " results = []\n",
- " for answers_subset in answers_subsets:\n",
- " *model_id, action_type, task = answers_subset.split(\"__\")\n",
- " model_id = \"/\".join(model_id)\n",
- " ds = datasets.load_dataset(answers_dataset, answers_subset, split=\"test\")\n",
- " df = ds.to_pandas()\n",
- " df[\"correct\"] = df.apply(get_correct, axis=1)\n",
- " acc = df[\"correct\"].mean().item()\n",
- " result = df.loc[0, [\"model_id\", \"agent_action_type\", \"source\"]].to_dict()\n",
- " result[\"acc\"] = acc\n",
- " results.append(result)\n",
- " df = pd.DataFrame(results)\n",
- "\n",
- " if push_to_hub_dataset:\n",
- " ds = datasets.Dataset.from_pandas(df)\n",
- " config = date\n",
- " set_default = set_default\n",
- " ds.push_to_hub(\n",
- " push_to_hub_dataset, config_name=config, set_default=set_default, commit_message=f\"Upload {config} results\"\n",
- " )\n",
- " return df"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Evaluation dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['gaia', 'math', 'simpleqa']\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
question
\n",
- "
source
\n",
- "
true_answer
\n",
- "
true_reasoning
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
0
\n",
- "
What year was the municipality of Ramiriquรญ, B...
\n",
- "
SimpleQA
\n",
- "
1541
\n",
- "
['https://en.wikipedia.org/wiki/Ramiriqu%C3%AD...
\n",
- "
\n",
- "
\n",
- "
1
\n",
- "
In what year did Hjalmar Hvam invent a mechani...
\n",
- "
SimpleQA
\n",
- "
1937
\n",
- "
['https://www.kgw.com/article/features/portlan...
\n",
- "
\n",
- "
\n",
- "
2
\n",
- "
In which year did Fayaz A. Malik (an Indian ph...
\n",
- "
SimpleQA
\n",
- "
2009
\n",
- "
['https://en.wikipedia.org/wiki/Fayaz_A._Malik...
\n",
- "
\n",
- "
\n",
- "
3
\n",
- "
In which year was John B. Goodenough elected a...
\n",
- "
SimpleQA
\n",
- "
2010
\n",
- "
['https://en.wikipedia.org/wiki/John_B._Gooden...
\n",
- "
\n",
- "
\n",
- "
4
\n",
- "
In which year did Atul Gawande earn an M.A. in...
\n",
- "
SimpleQA
\n",
- "
1989
\n",
- "
['https://en.wikipedia.org/wiki/Atul_Gawande',...
\n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " question source true_answer \\\n",
- "0 What year was the municipality of Ramiriquรญ, B... SimpleQA 1541 \n",
- "1 In what year did Hjalmar Hvam invent a mechani... SimpleQA 1937 \n",
- "2 In which year did Fayaz A. Malik (an Indian ph... SimpleQA 2009 \n",
- "3 In which year was John B. Goodenough elected a... SimpleQA 2010 \n",
- "4 In which year did Atul Gawande earn an M.A. in... SimpleQA 1989 \n",
- "\n",
- " true_reasoning \n",
- "0 ['https://en.wikipedia.org/wiki/Ramiriqu%C3%AD... \n",
- "1 ['https://www.kgw.com/article/features/portlan... \n",
- "2 ['https://en.wikipedia.org/wiki/Fayaz_A._Malik... \n",
- "3 ['https://en.wikipedia.org/wiki/John_B._Gooden... \n",
- "4 ['https://en.wikipedia.org/wiki/Atul_Gawande',... "
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import pandas as pd\n",
- "\n",
- "\n",
- "# Choose the tasks to evaluate on:\n",
- "# tasks = [\"gaia\"]\n",
- "# or evaluate on all tasks: [\"gaia\", \"math\", \"simpleqa\"]\n",
- "tasks = datasets.get_dataset_config_names(EVAL_DATASET)\n",
- "print(tasks)\n",
- "\n",
- "\n",
- "eval_ds = {task: datasets.load_dataset(EVAL_DATASET, task, split=\"test\") for task in tasks}\n",
- "pd.DataFrame(eval_ds[\"simpleqa\"]).head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Benchmark agents\n",
- "\n",
- "### Open models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "open_model_ids = [\n",
- " \"meta-llama/Llama-3.3-70B-Instruct\",\n",
- " # \"Qwen/QwQ-32B-Preview\",\n",
- " \"Qwen/Qwen2.5-72B-Instruct\",\n",
- " \"Qwen/Qwen2.5-Coder-32B-Instruct\",\n",
- " \"meta-llama/Llama-3.2-3B-Instruct\",\n",
- " \"meta-llama/Llama-3.1-8B-Instruct\",\n",
- " \"mistralai/Mistral-Nemo-Instruct-2407\",\n",
- " # \"HuggingFaceTB/SmolLM2-1.7B-Instruct\",\n",
- " # \"meta-llama/Llama-3.1-70B-Instruct\",\n",
- "]\n",
- "\n",
- "\n",
- "for model_id in open_model_ids:\n",
- " print(f\"Evaluating '{model_id}'...\")\n",
- " # action_type = \"tool-calling\"\n",
- " # agent = ToolCallingAgent(\n",
- " # tools=[GoogleSearchTool(), VisitWebpageTool(), PythonInterpreterTool()],\n",
- " # model=HfApiModel(model_id),\n",
- " # max_steps=10,\n",
- " # )\n",
- " # answer_questions(eval_ds, agent, model_id, action_type)\n",
- "\n",
- " action_type = \"code\"\n",
- " agent = CodeAgent(\n",
- " tools=[GoogleSearchTool(), VisitWebpageTool()],\n",
- " model=HfApiModel(model_id),\n",
- " additional_authorized_imports=[\"numpy\", \"sympy\"],\n",
- " max_steps=10,\n",
- " )\n",
- " answer_questions(eval_ds, agent, model_id, action_type)\n",
- "\n",
- " # Also evaluate vanilla model\n",
- " action_type = \"vanilla\"\n",
- " llm = HfApiModel(model_id)\n",
- " answer_questions(eval_ds, llm, model_id, action_type, is_vanilla_llm=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Closed models"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from smolagents import LiteLLMModel\n",
- "\n",
- "\n",
- "litellm_model_ids = [\"gpt-4o\", \"anthropic/claude-3-5-sonnet-latest\"]\n",
- "\n",
- "\n",
- "for model_id in litellm_model_ids:\n",
- " print(f\"Evaluating '{model_id}'...\")\n",
- " action_type = \"tool-calling\"\n",
- " agent = ToolCallingAgent(\n",
- " tools=[\n",
- " GoogleSearchTool(),\n",
- " VisitWebpageTool(),\n",
- " PythonInterpreterTool([\"numpy\", \"sympy\"]),\n",
- " ],\n",
- " model=LiteLLMModel(model_id),\n",
- " max_steps=10,\n",
- " )\n",
- " answer_questions(eval_ds, agent, model_id, action_type)\n",
- "\n",
- " action_type = \"code\"\n",
- " agent = CodeAgent(\n",
- " tools=[GoogleSearchTool(), VisitWebpageTool()],\n",
- " model=LiteLLMModel(model_id),\n",
- " additional_authorized_imports=[\"numpy\", \"sympy\"],\n",
- " max_steps=10,\n",
- " )\n",
- " answer_questions(eval_ds, agent, model_id, action_type)\n",
- "\n",
- " # Also evaluate vanilla model\n",
- " action_type = \"vanilla\"\n",
- " llm = LiteLLMModel(model_id)\n",
- " answer_questions(eval_ds, llm, model_id, action_type, is_vanilla_llm=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "# import glob\n",
- "# import json\n",
- "\n",
- "# jsonl_files = glob.glob(f\"output/*.jsonl\")\n",
- "\n",
- "# for file_path in jsonl_files:\n",
- "# if \"-Nemo-\" in file_path and \"-vanilla-\" in file_path:\n",
- "# print(file_path)\n",
- "# # Read all lines and filter out SimpleQA sources\n",
- "# filtered_lines = []\n",
- "# removed = 0\n",
- "# with open(file_path, \"r\", encoding=\"utf-8\") as f:\n",
- "# for line in f:\n",
- "# try:\n",
- "# data = json.loads(line.strip())\n",
- "# data[\"answer\"] = data[\"answer\"][\"content\"]\n",
- "# # if not any([question in data[\"question\"] for question in eval_ds[\"question\"]]):\n",
- "# # removed +=1\n",
- "# # else:\n",
- "# filtered_lines.append(json.dumps(data) + \"\\n\")\n",
- "# except json.JSONDecodeError:\n",
- "# print(\"Invalid line:\", line)\n",
- "# continue # Skip invalid JSON lines\n",
- "# print(f\"Removed {removed} lines.\")\n",
- "# # Write filtered content back to the same file\n",
- "# with open(\n",
- "# str(file_path).replace(\"-vanilla-\", \"-vanilla2-\"), \"w\", encoding=\"utf-8\"\n",
- "# ) as f:\n",
- "# f.writelines(filtered_lines)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Score answers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Number of answers_subsets 54\n",
- "Example of answers_subset Qwen__Qwen2.5-72B-Instruct__code__gaia\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
- "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "
"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "ename": "",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
- "\u001b[1;31mView Jupyter log for further details."
- ]
- }
- ],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "from matplotlib.legend_handler import HandlerTuple # Added import\n",
- "\n",
- "\n",
- "# Assuming pivot_df is your original dataframe\n",
- "models = pivot_df[\"model_id\"].unique()\n",
- "sources = pivot_df[\"source\"].unique()\n",
- "\n",
- "# Create figure and axis\n",
- "plt.style.use(\"seaborn-v0_8-white\")\n",
- "fig, ax = plt.subplots(figsize=(15, 6))\n",
- "\n",
- "# Set the width of each bar group and positions of the bars\n",
- "width = 0.15 # width of each bar\n",
- "spacing = 0.02 # space between bars within a group\n",
- "group_spacing = 0.2 # space between model groups\n",
- "\n",
- "# Calculate positions for the bars\n",
- "num_sources = len(sources)\n",
- "total_width_per_group = (width + spacing) * num_sources * 2 # *2 for agent and vanilla\n",
- "x = np.arange(len(models)) * (total_width_per_group + group_spacing)\n",
- "\n",
- "# Plot bars for each source\n",
- "for i, source in enumerate(sources):\n",
- " source_data = pivot_df[pivot_df[\"source\"] == source]\n",
- " agent_scores = [\n",
- " source_data[source_data[\"model_id\"] == model][\"code\"].values[0]\n",
- " if len(source_data[source_data[\"model_id\"] == model]) > 0\n",
- " else np.nan\n",
- " for model in models\n",
- " ]\n",
- " vanilla_scores = [\n",
- " source_data[source_data[\"model_id\"] == model][\"vanilla\"].values[0]\n",
- " if len(source_data[source_data[\"model_id\"] == model]) > 0\n",
- " else np.nan\n",
- " for model in models\n",
- " ]\n",
- "\n",
- " # Position calculation for each pair of bars\n",
- " pos = x + i * (width * 2 + spacing)\n",
- "\n",
- " agent_bars = ax.bar(pos, agent_scores, width, label=f\"{source} (Agent)\", alpha=0.8)\n",
- " vanilla_bars = ax.bar(\n",
- " pos + width * 0.6,\n",
- " vanilla_scores,\n",
- " width,\n",
- " hatch=\"////\",\n",
- " alpha=0.5,\n",
- " hatch_linewidth=2,\n",
- " label=f\"{source} (Vanilla)\",\n",
- " color=\"white\",\n",
- " edgecolor=agent_bars[0].get_facecolor(),\n",
- " )\n",
- "\n",
- "# Customize the plot\n",
- "ax.set_ylabel(\"Score\")\n",
- "ax.set_title(\"Model Performance Comparison\")\n",
- "\n",
- "# Set x-axis ticks in the middle of each group\n",
- "group_centers = x + (total_width_per_group - spacing) / 2\n",
- "ax.set_xticks(group_centers)\n",
- "\n",
- "# Wrap long model names to prevent overlap\n",
- "wrapped_labels = [\"\\n\".join(model.split(\"/\")) for model in models]\n",
- "ax.set_xticklabels(wrapped_labels, rotation=0, ha=\"center\")\n",
- "\n",
- "# Modify legend to combine agent and vanilla entries\n",
- "handles, labels = ax.get_legend_handles_labels()\n",
- "unique_sources = sources\n",
- "legend_elements = [\n",
- " (handles[i * 2], handles[i * 2 + 1], labels[i * 2].replace(\" (Agent)\", \"\")) for i in range(len(unique_sources))\n",
- "]\n",
- "custom_legend = ax.legend(\n",
- " [(agent_handle, vanilla_handle) for agent_handle, vanilla_handle, _ in legend_elements],\n",
- " [label for _, _, label in legend_elements],\n",
- " handler_map={tuple: HandlerTuple(ndivide=None)},\n",
- " bbox_to_anchor=(1.05, 1),\n",
- " loc=\"upper left\",\n",
- ")\n",
- "\n",
- "ax.yaxis.grid(True, linestyle=\"--\", alpha=0.3)\n",
- "ax.set_ylim(bottom=0)\n",
- "plt.tight_layout()\n",
- "ax.spines[\"top\"].set_visible(False)\n",
- "ax.spines[\"right\"].set_visible(False)\n",
- "\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "ename": "NameError",
- "evalue": "name 'formatted_df' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[12], line 45\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m mathjax_table\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# Usage (after running your previous data processing code):\u001b[39;00m\n\u001b[0;32m---> 45\u001b[0m mathjax_table \u001b[38;5;241m=\u001b[39m create_mathjax_table(pivot_df, \u001b[43mformatted_df\u001b[49m)\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28mprint\u001b[39m(mathjax_table)\n",
- "\u001b[0;31mNameError\u001b[0m: name 'formatted_df' is not defined"
- ]
- }
- ],
- "source": [
- "def create_mathjax_table(pivot_df, formatted_df):\n",
- " # Start the matrix environment with 4 columns\n",
- " # l for left-aligned model and task, c for centered numbers\n",
- " mathjax_table = \"\\\\begin{array}{llcc}\\n\"\n",
- " mathjax_table += \"\\\\text{Model} & \\\\text{Task} & \\\\text{Agent} & \\\\text{Vanilla} \\\\\\\\\\n\"\n",
- " mathjax_table += \"\\\\hline\\n\"\n",
- "\n",
- " # Sort the DataFrame by model_id and source\n",
- " formatted_df = formatted_df.sort_values([\"model_id\", \"source\"])\n",
- "\n",
- " current_model = None\n",
- " for _, row in formatted_df.iterrows():\n",
- " model = row[\"model_id\"]\n",
- " source = row[\"source\"]\n",
- "\n",
- " # Add a horizontal line between different models\n",
- " if current_model is not None and current_model != model:\n",
- " mathjax_table += \"\\\\hline\\n\"\n",
- "\n",
- " # Format model name\n",
- " model_display = model.replace(\"_\", \"\\\\_\")\n",
- " if \"Qwen\" in model or \"anthropic\" in model:\n",
- " model_display = f\"\\\\textit{{{model_display}}}\"\n",
- "\n",
- " # If it's the same model as previous row, use empty space\n",
- " if current_model == model:\n",
- " model_display = \"\\\\;\"\n",
- "\n",
- " # Add the data row\n",
- " mathjax_table += f\"{model_display} & {source} & {row['agent']} & {row['vanilla']} \\\\\\\\\\n\"\n",
- "\n",
- " current_model = model\n",
- "\n",
- " mathjax_table += \"\\\\hline\\n\"\n",
- " mathjax_table += \"\\\\end{array}\"\n",
- "\n",
- " return mathjax_table\n",
- "\n",
- "\n",
- "# Usage (after running your previous data processing code):\n",
- "# mathjax_table = create_mathjax_table(pivot_df, formatted_df)\n",
- "# print(mathjax_table)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "test",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/examples/e2b_example.py b/examples/e2b_example.py
deleted file mode 100644
index 18354a372..000000000
--- a/examples/e2b_example.py
+++ /dev/null
@@ -1,53 +0,0 @@
-from dotenv import load_dotenv
-
-from smolagents import CodeAgent, HfApiModel, Tool
-from smolagents.default_tools import VisitWebpageTool
-
-
-load_dotenv()
-
-
-class GetCatImageTool(Tool):
- name = "get_cat_image"
- description = "Get a cat image"
- inputs = {}
- output_type = "image"
-
- def __init__(self):
- super().__init__()
- self.url = "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png"
-
- def forward(self):
- from io import BytesIO
-
- import requests
- from PIL import Image
-
- response = requests.get(self.url)
-
- return Image.open(BytesIO(response.content))
-
-
-get_cat_image = GetCatImageTool()
-
-agent = CodeAgent(
- tools=[get_cat_image, VisitWebpageTool()],
- model=HfApiModel(),
- additional_authorized_imports=[
- "Pillow",
- "requests",
- "markdownify",
- ], # "duckduckgo-search",
- use_e2b_executor=True,
-)
-
-agent.run(
- "Calculate how much is 2+2, then return me an image of a cat. Directly use the image provided in your state.",
- additional_args={"cat_image": get_cat_image()},
-) # Asking to directly return the image from state tests that additional_args are properly sent to server.
-
-# Try the agent in a Gradio UI
-from smolagents import GradioUI
-
-
-GradioUI(agent).launch()
diff --git a/examples/gradio_ui.py b/examples/gradio_ui.py
new file mode 100644
index 000000000..81c56a1f2
--- /dev/null
+++ b/examples/gradio_ui.py
@@ -0,0 +1,25 @@
+from io import BytesIO
+
+import requests
+from PIL import Image
+
+from smolagents import CodeAgent, GradioUI, InferenceClientModel
+
+
+def add_agent_image(memory_step, agent):
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/smolagents.png"
+ response = requests.get(url)
+ memory_step.observations_images = [Image.open(BytesIO(response.content))]
+
+
+agent = CodeAgent(
+ tools=[],
+ model=InferenceClientModel(),
+ verbosity_level=1,
+ planning_interval=3,
+ name="example_agent",
+ description="This is an example agent that has no tools but will always see an agent image at the end of its step.",
+ step_callbacks=[add_agent_image],
+)
+
+GradioUI(agent, file_upload_folder="./data").launch()
diff --git a/examples/gradio_upload.py b/examples/gradio_upload.py
deleted file mode 100644
index 746013627..000000000
--- a/examples/gradio_upload.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from smolagents import CodeAgent, GradioUI, HfApiModel
-
-
-agent = CodeAgent(tools=[], model=HfApiModel(), max_steps=4, verbosity_level=1)
-
-GradioUI(agent, file_upload_folder="./data").launch()
diff --git a/examples/inspect_multiagent_run.py b/examples/inspect_multiagent_run.py
index 8c1c98d46..95032cd34 100644
--- a/examples/inspect_multiagent_run.py
+++ b/examples/inspect_multiagent_run.py
@@ -9,14 +9,14 @@
from smolagents import (
CodeAgent,
DuckDuckGoSearchTool,
- HfApiModel,
+ InferenceClientModel,
ToolCallingAgent,
VisitWebpageTool,
)
# Then we run the agentic part!
-model = HfApiModel()
+model = InferenceClientModel()
search_agent = ToolCallingAgent(
tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
diff --git a/examples/multi_llm_agent.py b/examples/multi_llm_agent.py
new file mode 100644
index 000000000..186fa06f8
--- /dev/null
+++ b/examples/multi_llm_agent.py
@@ -0,0 +1,47 @@
+import os
+
+from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMRouterModel
+
+
+os.environ["OPENAI_API_KEY"] = ""
+os.environ["AWS_ACCESS_KEY_ID"] = ""
+os.environ["AWS_SECRET_ACCESS_KEY"] = ""
+os.environ["AWS_REGION"] = ""
+
+llm_loadbalancer_model_list = [
+ {
+ "model_name": "model-group-1",
+ "litellm_params": {
+ "model": "gpt-4o-mini",
+ "api_key": os.getenv("OPENAI_API_KEY"),
+ },
+ },
+ {
+ "model_name": "model-group-1",
+ "litellm_params": {
+ "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+ "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
+ "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
+ "aws_region_name": os.getenv("AWS_REGION"),
+ },
+ },
+ # {
+ # "model_name": "model-group-2",
+ # "litellm_params": {
+ # "model": "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+ # "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
+ # "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
+ # "aws_region_name": os.getenv("AWS_REGION"),
+ # },
+ # },
+]
+
+
+model = LiteLLMRouterModel(
+ model_id="model-group-1",
+ model_list=llm_loadbalancer_model_list,
+ client_kwargs={"routing_strategy": "simple-shuffle"},
+)
+agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model)
+
+agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?")
diff --git a/examples/multiple_tools.py b/examples/multiple_tools.py
index 39ed90767..a2685541f 100644
--- a/examples/multiple_tools.py
+++ b/examples/multiple_tools.py
@@ -1,13 +1,11 @@
-from typing import Optional
-
import requests
# from smolagents.agents import ToolCallingAgent
-from smolagents import CodeAgent, HfApiModel, tool
+from smolagents import CodeAgent, InferenceClientModel, tool
# Choose which LLM engine to use!
-model = HfApiModel()
+model = InferenceClientModel()
# model = TransformersModel(model_id="meta-llama/Llama-3.2-2B-Instruct")
# For anthropic: change model_id below to 'anthropic/claude-3-5-sonnet-20240620'
@@ -15,7 +13,7 @@
@tool
-def get_weather(location: str, celsius: Optional[bool] = False) -> str:
+def get_weather(location: str, celsius: bool | None = False) -> str:
"""
Get the current weather at the given location using the WeatherStack API.
diff --git a/examples/open_deep_research/README.md b/examples/open_deep_research/README.md
index 915bfc894..c2c799616 100644
--- a/examples/open_deep_research/README.md
+++ b/examples/open_deep_research/README.md
@@ -1,22 +1,54 @@
# Open Deep Research
-Welcome to this open replication of [OpenAI's Deep Research](https://openai.com/index/introducing-deep-research/)!
+Welcome to this open replication of [OpenAI's Deep Research](https://openai.com/index/introducing-deep-research/)! This agent attempts to replicate OpenAI's model and achieve similar performance on research tasks.
-Read more about this implementation's goal and methods [in our blog post](https://huggingface.co/blog/open-deep-research).
+Read more about this implementation's goal and methods in our [blog post](https://huggingface.co/blog/open-deep-research).
-This agent achieves 55% pass@1 on GAIA validation set, vs 67% for Deep Research.
-To install it, first run
+This agent achieves **55% pass@1** on the GAIA validation set, compared to **67%** for the original Deep Research.
+
+## Setup
+
+To get started, follow the steps below:
+
+### Clone the repository
+
+```bash
+git clone https://github.com/huggingface/smolagents.git
+cd smolagents/examples/open_deep_research
+```
+
+### Install dependencies
+
+Run the following command to install the required dependencies from the `requirements.txt` file:
+
```bash
pip install -r requirements.txt
```
-And install smolagents dev version
+### Install the development version of `smolagents`
+
```bash
-pip install smolagents[dev]
+pip install -e ../../.[dev]
```
+### Set up environment variables
+
+The agent uses the `GoogleSearchTool` for web search, which requires an environment variable with the corresponding API key, based on the selected provider:
+- `SERPAPI_API_KEY` for SerpApi: [Sign up here to get a key](https://serpapi.com/users/sign_up)
+- `SERPER_API_KEY` for Serper: [Sign up here to get a key](https://serper.dev/signup)
+
+Depending on the model you want to use, you may need to set environment variables.
+For example, to use the default `o1` model, you need to set the `OPENAI_API_KEY` environment variable.
+[Sign up here to get a key](https://platform.openai.com/signup).
+
+> [!WARNING]
+> The use of the default `o1` model is restricted to tier-3 access: https://help.openai.com/en/articles/10362446-api-access-to-o1-and-o3-mini
+
+
+## Usage
+
Then you're good to go! Run the run.py script, as in:
```bash
python run.py --model-id "o1" "Your question here!"
-```
+```
\ No newline at end of file
diff --git a/examples/open_deep_research/analysis.ipynb b/examples/open_deep_research/analysis.ipynb
index 04f315fdd..ccb6a1d54 100644
--- a/examples/open_deep_research/analysis.ipynb
+++ b/examples/open_deep_research/analysis.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -11,19 +11,9 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/aymeric/venv/gaia/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n",
- "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import os\n",
"\n",
@@ -38,12 +28,12 @@
"\n",
"pd.set_option(\"max_colwidth\", None)\n",
"\n",
- "OUTPUT_DIR = \"output\""
+ "OUTPUT_DIR = \"../../output\""
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -52,29 +42,6 @@
"eval_df = pd.DataFrame(eval_ds)"
]
},
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "2 86\n",
- "1 53\n",
- "3 26\n",
- "Name: count, dtype: int64"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pd.Series(eval_ds[\"task\"]).value_counts()"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -84,7 +51,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -98,254 +65,14 @@
" results.append(df)\n",
"\n",
"result_df = pd.concat(results)\n",
- "result_df = result_df.drop(columns=[\"start_time\", \"end_time\"])\n",
"result_df[\"prediction\"] = result_df[\"prediction\"].fillna(\"No prediction\")"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "String cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 2 High fantasy A Song of Ice and Fire cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String 94 CFM for Cheater cannot be normalized to number str.\n",
- "String 93 CFM for Cheater beater cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 3 or 4 cannot be normalized to number str.\n",
- "String No year cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 250 for Cheater cannot be normalized to number str.\n",
- "String 220 for Cheater beater cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String 776 ft/min for Cheater cannot be normalized to number str.\n",
- "String 768 ft/min for Cheater beater cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String CFM number for Cheater: not listed cannot be normalized to number str.\n",
- "String CFM number for Cheater beater: 665 ft/min cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 1.46 ร cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String August 1: 0 August 2: 0 August 3: 0 August 4: 0 August 5: 0 August 6: 0 August 7: 0 August 8: 0 August 9: 0 August 10: 0 August 11: 0 August 12: 0 August 13: 0 August 14: 0 August 15: 0 August 16: 0 August 17: 0 August 18: 0 August 19: 0 August 20: 0 August 21: 0 August 22: 0 August 23: 0 August 24: 0 August 25: 0 August 26: 0 August 27: 0 August 28: 0 August 29: 0 August 30: 0 August 31: 0 cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String 120 for Cheater cannot be normalized to number str.\n",
- "String 103 for Cheater beater cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 120.28 for Cheater cannot be normalized to number str.\n",
- "String 119.04 for Cheater beater cannot be normalized to number str.\n",
- "String 3 or 4 cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive sold for 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 2730-2740 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 89706.00 USD cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String No prediction cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 6 The Lord of the Rings (book) J. R. R. Tolkien Author American literature Fantasy literature Publishers A Song of Ice and Fire cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 1.46 ร cannot be normalized to number str.\n",
- "String cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 94.5 for Cheater cannot be normalized to number str.\n",
- "String 93.5 for Cheater beater cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 776 for Cheater cannot be normalized to number str.\n",
- "String Not specified for Cheater Beater cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 5.75 for Cheater cannot be normalized to number str.\n",
- "String 5.22 for Cheater Beater cannot be normalized to number str.\n",
- "String 2017 Komo Mai Drive sold for 900000 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String 33101 28557 cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "String Unable to determine cannot be normalized to number str.\n",
- "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n",
- "Close call: Rockhopper Penguins vs Rockhopper penguin\n",
- "Close call: INT. THE CASTLE vs THE CASTLE\n",
- "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n",
- "Close call: The World of the Twenty First Century 1994 vs The World of the Twenty First Century\n",
- "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n",
- "Close call: Wes Craven's A Nightmare on Elm Street vs A Nightmare on Elm Street\n",
- "Close call: God said let there be dragons vs Here be dragons\n",
- "Close call: rockhopper penguins vs Rockhopper penguin\n",
- "Close call: Harbinger, This Fire, Tidal vs Harbinger, Tidal\n",
- "Close call: EC 3.1.3.1;EC 1.11.1.7 vs 3.1.3.1; 1.11.1.7\n",
- "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n",
- "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n",
- "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n",
- "Close call: Out of the Silent Planet by C.S. Lewis vs Out of the Silent Planet\n",
- "Close call: broccoli, celery, fresh basil, green beans, lettuce, sweet potatoes vs broccoli, celery, fresh basil, lettuce, sweet potatoes\n",
- "Close call: To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/aymeric/Documents/Code/smolagents/examples/open_deep_research/scripts/gaia_scorer.py:52: UserWarning: Answer lists have different lengths, returning False.\n",
- " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import re\n",
"from collections import Counter\n",
@@ -395,12 +122,21 @@
" return total_count\n",
"\n",
"\n",
+ "def get_durations(row):\n",
+ " # start_datetime = datetime.strptime(row['start_time'], \"%Y-%m-%d %H:%M:%S\")\n",
+ " # end_datetime = datetime.strptime(row['end_time'], \"%Y-%m-%d %H:%M:%S\")\n",
+ "\n",
+ " duration_timedelta = row[\"end_time\"] - row[\"start_time\"]\n",
+ " return int(duration_timedelta.total_seconds())\n",
+ "\n",
+ "\n",
+ "result_df[\"duration\"] = result_df.apply(get_durations, axis=1)\n",
"# result_df[\"tool_calls\"] = result_df[\"intermediate_steps\"].apply(sum_tool_calls)"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -425,43 +161,9 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "agent_name\n",
- "code_gpt4o_03_february_text 165\n",
- "code_o1_03_february_ablation-toolcalling-manager 165\n",
- "code_o1_01_february_text 165\n",
- "code_o3-mini_03_february_remove-navigational 165\n",
- "code_o1_04_february_submission5 165\n",
- "code_o1_03_february_text_high-reasoning-effort 165\n",
- "code_o1_03_february_remove-navigational 164\n",
- "code_o1_03_february_fix-print-outputs 164\n",
- "code_o1_04_february_submission 162\n",
- "code_o1_03_february_goodoldtext-unbroken 161\n",
- "code_gpt4o_03_february_goodoldtext-unbroken 159\n",
- "code_gpt4o_03_february_magenticbrowser 159\n",
- "code_o1_03_february_fix-print-outputs2 156\n",
- "code_gpt4o_03_february_magenticbrowser2 156\n",
- "code_o1_04_february_submission-medium 125\n",
- "code_o1_29-01_text 105\n",
- "code_llama-3 90\n",
- "code_o1_22-01_managedagent-summary_planning 67\n",
- "code_o1_25-01_visioon 53\n",
- "code_o1_04_february_submission3 49\n",
- "code_qwen-coder-32B_03_february_text 43\n",
- "code_o1_04_february_submission4 6\n",
- "Name: count, dtype: int64"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"result_df[\"agent_name\"].value_counts()"
]
@@ -475,440 +177,37 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "agent_name\n",
- "code_gpt4o_03_february_text 165\n",
- "code_o1_03_february_ablation-toolcalling-manager 165\n",
- "code_o1_01_february_text 165\n",
- "code_o3-mini_03_february_remove-navigational 165\n",
- "code_o1_04_february_submission5 165\n",
- "code_o1_03_february_text_high-reasoning-effort 165\n",
- "code_o1_03_february_remove-navigational 164\n",
- "code_o1_03_february_fix-print-outputs 164\n",
- "code_o1_04_february_submission 162\n",
- "code_o1_03_february_goodoldtext-unbroken 161\n",
- "code_gpt4o_03_february_goodoldtext-unbroken 159\n",
- "code_gpt4o_03_february_magenticbrowser 159\n",
- "code_o1_03_february_fix-print-outputs2 156\n",
- "code_gpt4o_03_february_magenticbrowser2 156\n",
- "code_o1_04_february_submission-medium 125\n",
- "code_o1_29-01_text 105\n",
- "code_llama-3 90\n",
- "code_o1_22-01_managedagent-summary_planning 67\n",
- "code_o1_25-01_visioon 53\n",
- "code_o1_04_february_submission3 49\n",
- "code_qwen-coder-32B_03_february_text 43\n",
- "code_o1_04_february_submission4 6\n",
- "Name: count, dtype: int64"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "agent_name task\n",
- "code_gpt4o_03_february_goodoldtext-unbroken 2 84\n",
- " 1 53\n",
- " 3 22\n",
- "code_gpt4o_03_february_magenticbrowser 2 83\n",
- " 1 52\n",
- " ..\n",
- "code_o3-mini_03_february_remove-navigational 1 53\n",
- " 3 26\n",
- "code_qwen-coder-32B_03_february_text 2 22\n",
- " 1 14\n",
- " 3 7\n",
- "Name: count, Length: 65, dtype: int64"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Total length: 2809 - is complete: False\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "o1_vision = \"code_o1_25-01_visioon\"\n",
- "o1_next = \"code_o1_29-01_text\"\n",
- "o1 = \"code_o1_01_february_text\"\n",
- "\n",
- "list_versions = [o1, o1_vision, o1_next]\n",
- "\n",
- "# submission_selection_name = \"react_code_llama3-70b_02-05_full-gaia-validation-code\"\n",
"sel_df = result_df\n",
"# sel_df = sel_df.loc[\n",
"# (result_df[\"agent_name\"].isin(list_versions))\n",
- "# # & (~result_df[\"question\"].isin(UNSOLVED_QUESTIONS))\n",
"# ]\n",
"sel_df = sel_df.reset_index(drop=True)\n",
"display(sel_df[\"agent_name\"].value_counts())\n",
"sel_df = sel_df.drop_duplicates(subset=[\"agent_name\", \"question\"])\n",
"display(sel_df.groupby(\"agent_name\")[[\"task\"]].value_counts())\n",
- "print(\"Total length:\", len(sel_df), \"- is complete:\", len(sel_df) == 165)\n",
- "# assert sel_df[\"question\"].value_counts().max() == len(list_versions), \"Some questions are duplicate!\""
+ "print(\"Total length:\", len(sel_df), \"- is complete:\", len(sel_df) == 165)"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'Average score:'"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "
"
- ],
- "text/plain": [
- " is_correct \\\n",
- "agent_name task \n",
- "code_gpt4o_03_february_goodoldtext-unbroken 1 0.452830 \n",
- " 2 0.380952 \n",
- " 3 0.227273 \n",
- "code_gpt4o_03_february_magenticbrowser 1 0.480769 \n",
- " 2 0.349398 \n",
- "... ... \n",
- "code_o3-mini_03_february_remove-navigational 2 0.232558 \n",
- " 3 0.153846 \n",
- "code_qwen-coder-32B_03_february_text 1 0.357143 \n",
- " 2 0.136364 \n",
- " 3 0.142857 \n",
- "\n",
- " is_near_correct \\\n",
- "agent_name task \n",
- "code_gpt4o_03_february_goodoldtext-unbroken 1 0.452830 \n",
- " 2 0.392857 \n",
- " 3 0.227273 \n",
- "code_gpt4o_03_february_magenticbrowser 1 0.480769 \n",
- " 2 0.361446 \n",
- "... ... \n",
- "code_o3-mini_03_february_remove-navigational 2 0.244186 \n",
- " 3 0.153846 \n",
- "code_qwen-coder-32B_03_february_text 1 0.357143 \n",
- " 2 0.136364 \n",
- " 3 0.142857 \n",
- "\n",
- " count_steps count \n",
- "agent_name task \n",
- "code_gpt4o_03_february_goodoldtext-unbroken 1 7.000000 53 \n",
- " 2 8.511905 84 \n",
- " 3 10.409091 22 \n",
- "code_gpt4o_03_february_magenticbrowser 1 7.153846 52 \n",
- " 2 8.168675 83 \n",
- "... ... ... \n",
- "code_o3-mini_03_february_remove-navigational 2 4.976744 86 \n",
- " 3 6.615385 26 \n",
- "code_qwen-coder-32B_03_february_text 1 5.428571 14 \n",
- " 2 6.409091 22 \n",
- " 3 6.571429 7 \n",
- "\n",
- "[65 rows x 4 columns]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"display(\"Average score:\", sel_df.groupby(\"agent_name\")[[\"is_correct\"]].mean().round(3))\n",
"display(\n",
- " sel_df.groupby([\"agent_name\", \"task\"])[[\"is_correct\", \"is_near_correct\", \"count_steps\", \"question\"]]\n",
+ " sel_df.groupby([\"agent_name\", \"task\"])[[\"is_correct\", \"is_near_correct\", \"count_steps\", \"question\", \"duration\"]]\n",
" .agg(\n",
" {\n",
" \"is_correct\": \"mean\",\n",
" \"is_near_correct\": \"mean\",\n",
" \"count_steps\": \"mean\",\n",
" \"question\": \"count\",\n",
+ " \"duration\": \"mean\",\n",
" }\n",
" )\n",
" .rename(columns={\"question\": \"count\"})\n",
@@ -917,9851 +216,9 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.plotly.v1+json": {
- "config": {
- "plotlyServerURL": "https://plot.ly"
- },
- "data": [
- {
- "customdata": [
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ]
- ],
- "hovertemplate": "agent_name=code_gpt4o_03_february_goodoldtext-unbroken index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_gpt4o_03_february_goodoldtext-unbroken",
- "line": {
- "color": "#636efa",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_gpt4o_03_february_goodoldtext-unbroken",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4A",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADoP1VVVVVVVeU/MzMzMzMz4z9ddNFFF13kP1VVVVVVVeU/dmIndmIn5j8lSZIkSZLkPzMzMzMzM+M/AAAAAAAA4j/T0tLS0tLiP3Icx3Ecx+E/bCivobyG4j+amZmZmZnhP5IkSZIkSeI/dNFFF1104T8LWchCFrLgP1VVVVVVVeE/7FG4HoXr4T+xEzuxEzvhP3sJ7SW0l+A/AAAAAAAA4D/d0wjLPY3gPxEREREREeE/hBBCCCGE4D8AAAAAAADgPwgffPDBB98/AAAAAAAA4D9f8RVf8RXfP47jOI7jON4/fdYNpshn3T8bymsor6HcP1y+5Vu+5ds/zczMzMzM3D8ZnI/B+RjcPz3P8zzP89w/EnfEHXFH3D+jiy666KLbPxzHcRzHcdw/05ve9KY33T94Nuo7G/XdP1VVVVVVVd0/L6fg5RS83D9xPQrXo3DdP93c3Nzc3Nw/7MRO7MRO3D8iNcF4K/vcPxPaS2gvod0/F1100UUX3T8lSZIkSZLcPx/BfQT3Edw/GmG5pxGW2z91Xx5bETTcP7y7u7u7u9s/Q7CONu9T3D/fe++9997bP9u2bdu2bds/AAAAAAAA2z9bqZVaqZXaPyebbLLJJts/NSbSA5Wz2z88PDw8PDzcP8y1A3PtwNw/fMVXfMVX3D8LmwOJVtjcPxzHcRzHcdw/4MCBAwcO3D/QusEU+azbP08b6LSBTts/KK+hvIby2j++Y2pg75jaPxqkQRqkQdo/2TMQlY7s2T+amZmZmZnZP+Dp1vywSNk/+hicj8H52D+q82sPuazYP0mSJEmSJNk/2djY2NjY2D9T1pQ1ZU3ZPzv0m61Dv9k/L7rooosu2j+e8YxnPOPZP5qZmZmZmdk/WqAFWqAF2j+c3vSmN73ZP3bZZZdddtk/Z6O+s1Hf2T+amZmZmZnZPwAAAAAAANo/Grab5Ulk2j+IxvrQWB/aPywFav1Kgdo/PQrXo3A92j8ZvhEFJp3aP/v6+vr6+to/G0PTHey32j87sRM7sRPbP9u2bdu2bds/ln0OqQnG2z8T6J26loPbPya0l9BeQts/JrBpP1kC2z/D2jesfcPaP5ax/Y5eGds/27Zt27Zt2z80+bJBky/bPyivobyG8to/q8FzBIq22j8+jbDc0wjbP5u1WZu1Wds/BA0ndV8e2z+bCOSaCOTaPzMzMzMzM9s/hYn3I6f52j+f4pIhWEfbPw8b6bCRDts/W2uttdZa2z/ZzvdT46XbP/y+7/u+79s/7na73W632z8AAAAAAADcP/KGvCFvyNs/HLmRG7mR2z8j+oDq2FvbPyebbLLJJts/27Zt27Zt2z9YYyI9UDnbP1uwBVuwBds/09LS0tLS2j/TVwljs6DaP6c3velNb9o/D+jGPH202j87qIM6qIPaP2le/ImEU9o/gkQrbA4k2j9r/N08QvXZP3Icx3Ecx9k/mpmZmZmZ2T/Lli1btmzZP2x21CLkr9k/I591gyny2T9SkPx5lcXZP5qZmZmZmdk/y7hl3DJu2T82lNdQXkPZP9ouhNkuhNk/EWflJ8RZ2T8wWf6S5S/ZP2mQBmmQBtk/fo/ICcLd2D9rcRPmd7XYP3bpMX+vjdg/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "Of the cities within the United States where U.S. "
- ]
- ],
- "hovertemplate": "agent_name=code_gpt4o_03_february_magenticbrowser index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_gpt4o_03_february_magenticbrowser",
- "line": {
- "color": "#EF553B",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_gpt4o_03_february_magenticbrowser",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4A",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACamZmZmZnJP1VVVVVVVcU/kiRJkiRJwj8AAAAAAADAPxzHcRzHcbw/mpmZmZmZyT900UUXXXTRPwAAAAAAANA/FDuxEzux0z+3bdu2bdvWP1VVVVVVVdU/AAAAAAAA1D/T0tLS0tLSP3Icx3Ecx9E/eQ3lNZTX0D8AAAAAAADQP5IkSZIkSdI/dNFFF1100T84velNb3rTP1VVVVVVVdU/exSuR+F61D8UO7ETO7HTP2gvob2E9tI/kiRJkiRJ0j8Jyz2NsNzTPzMzMzMzM9M/lVJKKaWU0j8AAAAAAADUP1VVVVVVVdU/tbS0tLS01D/UQR3UQR3UP+Q4juM4jtM/HEyRz7rB1D9RXkN5DeXVP1VVVVVVVdU/ZmZmZmZm1j/blahdidrVP7dt27Zt29Y/lTVlTVlT1j9GF1100UXXPxdswRZswdY/etOb3vSm1z9dQUyuICbXPwAAAAAAANg/4eUUvJyC1z8K16NwPQrXP9jX19fX19c/J3ZiJ3Zi1z9ln0NqgvHWP0xoL6G9hNY/RhdddNFF1z+3bdu2bdvWP0xnMZ3FdNY/fBphuacR1j/QcFL35bHVP1VVVVVVVdU/yRCso8371D+ttdZaa63VP1VVVVVVVdU/AAAAAAAA1T/VSq3USq3UP1VVVVVVVdU/0gOVs1v41T+mpaWlpaXVP1VVVVVVVdU/Fl/xFV/x1T8g0QqbA4nWP47jOI7jONY/r169evXq1T/JZ91ginzWPwrXo3A9Ctc/ymsor6G81j8oxFn5CXHWP3ZiJ3ZiJ9Y/Xi1uwvyu1j9mZmZmZmbWP6QMPN2aH9Y/25WoXYna1T80dX7tIZfVP1VVVVVVVdU/FRUVFRUV1T82ZU1ZU9bUPy+QSfECmdQ/XXTRRRdd1D9CEYpQhCLUP5Q+6ZM+6dM/VEZlVEZl1D9DFrKQhSzUP6WUUkoppdQ/Ut/ZqO9s1D8mTv2eW+LUP1VVVVVVVdU/1g86KvDF1T9jfWisD43VP1DrVwrU+tU/w/UoXI/C1T+bB7nrZ4vVP/b19fX19dU/2xia7mC/1T+e2Imd2InVP1VVVVVVVdU/2eeQmmC81T/XcnCzX4jVP9FeQnsJ7dU//mQJbNpP1j8c1r5h7RvWP49eGdvv6NU/btu2bdu21T96amGlpxbWP0xnMZ3FdNY/bTV4jkDR1j9Y7mmE5Z7WP9ZmbdZmbdY/QcNJ3ZfH1j/XRCDXRCDXP3d3d3d3d9c/RhdddNFF1z8RrKPN+xTXP+UWT27x5NY/Ouecc8451z8K16NwPQrXP9d1Xdd1Xdc/7PV6vV6v1z8AAAAAAIDXP/QFfUFf0Nc/GHqhF3qh1z/f2jDNXfDXP8IHH3zwwdc/9oDZA2YP2D9JD1TObuHXP0J7Ce0ltNc/iIeHh4eH1z82C6o9J9PXP4K5dmCuHdg/6qPVJETx1z+ogzqogzrYP2C3x1qGDtg/Zfx2qSfj1z/MknJAZLjXP+Q4juM4jtc/J0p2baJk1z+6c+fOnTvXP+HlFLycgtc/n3WDKfJZ1z99GzBU0zHXP3d3d3d3d9c/uj5dn65P1z+H8hrKayjXP1esAVesAdc/t23btm3b1j+21lprrbXWPwdpkAZpkNY/dRhlKp5r1j9eLW7C/K7WP+EMCCV3itY/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ]
- ],
- "hovertemplate": "agent_name=code_gpt4o_03_february_magenticbrowser2 index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_gpt4o_03_february_magenticbrowser2",
- "line": {
- "color": "#00cc96",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_gpt4o_03_february_magenticbrowser2",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsA",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/AAAAAAAA4D8XXXTRRRfdPwAAAAAAAOA/sRM7sRM74T+SJEmSJEniPxEREREREeE/AAAAAAAA4D8eHh4eHh7ePwAAAAAAAOA/DeU1lNdQ3j/NzMzMzMzcP57neZ7ned4/F1100UUX3T+96U1vetPbP1VVVVVVVd0/KVyPwvUo3D87sRM7sRPbPy+hvYT2Eto/27Zt27Zt2z9huacRlnvaP5qZmZmZmdk/11prrbXW2j8AAAAAAADcPxdddNFFF90/PDw8PDw83D/btm3btm3bP6uqqqqqqto/0LrBFPms2z8or6G8hvLaPxqkQRqkQdo/mpmZmZmZ2T+J2pWoXYnaP9u2bdu2bds/EnfEHXFH3D8XXXTRRRfdPxzHcRzHcdw/velNb3rT2z9yBTG5gpjcP1VVVVVVVd0/g5dT8HIK3j9xPQrXo3DdP93c3Nzc3Nw/7MRO7MRO3D8iNcF4K/vcPxzHcRzHcdw/7RvWvmHt2z8lSZIkSZLcPx/BfQT3Edw/1AjLPY2w3D91Xx5bETTcP83MzMzMzNw/532KS4Zg3T/nnHPOOefcP13XdV3Xdd0/AAAAAAAA3T/dyI3cyI3cPxdddNFFF90/rDGRHqic3T8tLS0tLS3dP8y1A3PtwNw/fMVXfMVX3D8yfrvUk/HbPxzHcRzHcdw/4MCBAwcO3D/QusEU+azbP08b6LSBTts/KK+hvIby2j/btm3btm3bP1y+5Vu+5ds/FzdhfleL2z8zMzMzMzPbP35YpAw83do/idqVqF2J2j/ksmKghDfaP3qe53me59k/mpmZmZmZ2T9T1pQ1ZU3ZPxUvkEnxAtk/dNFFF1102T+TlaxkJSvZP5qZmZmZmdk/GZVRGZVR2T+RhSxkIQvZP3bZZZdddtk/5QpicgUx2T+amZmZmZnZP1VVVVVVVdk/mYbtZnkS2T801ofG+tDYPzbZZJNNNtk/9ihcj8L12D+qeZC7frbYPxkZGRkZGdk/i/gEUsl52T+xEzuxEzvZP5qZmZmZmdk/fg6pCcZb2T/7hVhRGh/ZPzmO4ziO49g/koq51Rmp2D9wWPuGtW/YP6+M7Xf0ytg/JUmSJEmS2D/pqYWVnlrYPzqL6Syms9g/iHG/Lql82D/LPY2w3NPYP9mJndiJndg/6r48tiJo2D9YoTNYoTPYPwAAAAAAANg/Kky8HznN1z/jkiFYR5vXP2pXonYlatc/vvfee++91z+q8dJNYhDYP/h93/d939c/DAaDwWAw2D8AAAAAAADYPxT2hD1hT9g/+IEf+IEf2D/pA6pjb23YPz744IMPPtg/qYilIpaK2D8m0gOVs1vYP9iCLdiCLdg/AAAAAAAA2D9Q7TmZvkrYP4mfUeJnlNg/TGV71wHd2D/5iq/4iq/YP2JyBTG5gtg/0QqbA4lW2D/ZiZ3YiZ3YPzmO4ziO49g/0nmLIZ232D/EiBEjRozYPzTWh8b60Ng/YYp81g2m2D/oVRZntHvYP1K4HoXrUdg/waJgUbAo2D8AAAAAAADYP9jX19fX19c/1cDeMTWw1z+JV5F4FYnXPyd2Yid2Ytc/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "According to the World Bank, which countries had g"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ]
- ],
- "hovertemplate": "agent_name=code_gpt4o_03_february_text index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_gpt4o_03_february_text",
- "line": {
- "color": "#ab63fa",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_gpt4o_03_february_text",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVdU/AAAAAAAA4D8zMzMzMzPjP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADkP3Icx3Ecx+E/AAAAAAAA4D8XXXTRRRfdPwAAAAAAAOA/sRM7sRM74T8AAAAAAADgP97d3d3d3d0/AAAAAAAA3D9aWlpaWlraPzmO4ziO49g/KK+hvIby2j+amZmZmZnZP9u2bdu2bds/L7rooosu2j+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T/ZiZ3YiZ3YPy+hvYT2Eto/27Zt27Zt2z/UCMs9jbDcP7y7u7u7u9s/55xzzjnn3D8AAAAAAADcPxdddNFFF90/PDw8PDw83D8d1EEd1EHdP47jOI7jON4/KvJZN5gi3z8N5TWU11DeP9/yLd/yLd8/AAAAAAAA4D84H4PzMTjfPwAAAAAAAOA/0Bf0BX1B3z8AAAAAAADgP5/0SZ/0Sd8/6k1vetOb3j94Nuo7G/XdP1VVVVVVVd0/L6fg5RS83D8pXI/C9SjcP93c3Nzc3Nw/7MRO7MRO3D+WfQ6pCcbbPya0l9BeQts/7RvWvmHt2z8lSZIkSZLcPx/BfQT3Edw/1AjLPY2w3D91Xx5bETTcP7y7u7u7u9s/Q7CONu9T3D/fe++9997bP9u2bdu2bds/AAAAAAAA2z8cuZEbuZHbPx988MEHH9w/NSbSA5Wz2z9LS0tLS0vbP73pTW9609s/27Zt27Zt2z8yfrvUk/HbP+Q4juM4jts/2bJly5Yt2z/QusEU+azbP08b6LSBTts/KK+hvIby2j++Y2pg75jaPxqkQRqkQdo/2TMQlY7s2T9mZmZmZmbaPy+hvYT2Eto/idqVqF2J2j+CEt5o6vzaP6uqqqqqqto/WlpaWlpa2j+zpqwpa8raP2G5pxGWe9o/L7rooosu2j+e8YxnPOPZP/qkT/qkT9o/WqAFWqAF2j+c3vSmN73ZPyeaaKKJJto/Z6O+s1Hf2T+amZmZmZnZPwAAAAAAANo/Wp5EpmG72T+IxvrQWB/aPzFvZ0jM29k/mpmZmZmZ2T96kLt+tljZP7q5ubm5udk/i/gEUsl52T+KndiJndjZP5qZmZmZmdk/fg6pCcZb2T+B3qlrObjZP7SX0F5Ce9k/XJ2RirnV2T+amZmZmZnZP+mVsf2OXtk/btu2bdu22T+hyZcNmnzZPzGdxXQW09k/mpmZmZmZ2T+oEZZ7GmHZP1qbtVmbtdk/lLovj60I2j8arNAZrNDZPyIiIiIiIto/vB85zdfq2T/8FJcMwTraP4nalahdido/U0oppZRS2j/pJjEIrBzaP3qe53me59k/bTabzWaz2T8AAAAAAIDZP3PGnDFnzNk/mpmZmZmZ2T8GfxUnpOTZP7LJJptsstk/wp8Jfyb82T+/GhPpgcrZP5qZmZmZmdk/aWlpaWlp2T+fk+mrhLHZP6BR4meU+Nk/rSYhir/I2T+amZmZmZnZP2bogN0ea9k/FjYHEq2w2T/lgMhwr4LZP3Icx3Ecx9k/famg1ZcK2j/SpEmTJk3aP4jG+tBYH9o/DqbIZ91g2j8h+fMqizPaPwc6baDTBto/z2pntbPa2T/zGsprKK/ZP9ouhNkuhNk/EWflJ8RZ2T8wWf6S5S/ZP2mQBmmQBtk/fo/ICcLd2D9rcRPmd7XYP3bpMX+vjdg/ZmZmZmZm2D+vUkzQXaXYP5Ey8HRrftg/GFuCb/NX2D8yOB+D8zHYPwyYxoBpDNg/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "What is the area of the green polygon in the attac"
- ]
- ],
- "hovertemplate": "agent_name=code_llama-3 index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_llama-3",
- "line": {
- "color": "#FFA15A",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_llama-3",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZ",
- "dtype": "i1"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABEREREREbE/AAAAAAAAsD8eHh4eHh6uPxzHcRzHcaw/KK+hvIbyqj+amZmZmZmpPxiGYRiGYag/RhdddNFFpz9kIQtZyEKmP1VVVVVVVaU/exSuR+F6pD8UO7ETO7GjP2gvob2E9qI/kiRJkiRJoj+WexphuaehPxEREREREaE/hBBCCCGEoD8AAAAAAACgPwgffPDBB58/Hh4eHh4erj8d1EEd1EGtPxzHcRzHcaw/0LrBFPmsqz8or6G8hvKqPxqkQRqkQao/MzMzMzMzsz+7ErUrUbuyP5IkSZIkSbI/d8QdcUfcsT900UUXXXSxPxEREREREbE/ZCELWchCtj9XEJMriMm1P1VVVVVVVbU/OQUvp+DltD97FK5H4Xq0PxQUFBQUFLQ/FDuxEzuxsz/BeCv7HFKzP2gvob2E9rI/nhLkKUGesj+SJEmSJEmyP3AfwX0E97E/fBphuacRtj/QcFL35bG1P1VVVVVVVbU/yRCso837tD/GGGOMMca4PxiGYRiGYbg/AAAAAAAAuD8YeqEXeqG3P0YXXXTRRbc/jYn0QOXstj+XlpaWlpa2P2QhC1nIQrY/Fl/xFV/xtT9ItMLmQKK1P1VVVVVVVbU/qFChQoUKtT8cTJHPusG0P3sUrkfherQ/XkN5DeU1tD/Oyk+Is/KzP5dv+ZZv+bY/Xi1uwvyutj9mZmZmZma2P6QMPN2aH7Y/25WoXYnatT80dX7tIZe1P1VVVVVVVbU/FRUVFRUVtT82ZU1ZU9a0Py+QSfECmbQ/XXTRRRddtD9CEYpQhCK0P5Q+6ZM+6bM/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "According to the World Bank, which countries had g"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ]
- ],
- "hovertemplate": "agent_name=code_o1_01_february_text index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_01_february_text",
- "line": {
- "color": "#19d3f3",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_01_february_text",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA6D+amZmZmZnpP1VVVVVVVeU/t23btm3b5j8AAAAAAADoP1VVVVVVVeU/ZmZmZmZm5j9ddNFFF13kP6uqqqqqquI/FDuxEzux4z+SJEmSJEniPxEREREREeE/AAAAAAAA4j/x8PDw8PDgPwAAAAAAAOA/DeU1lNdQ3j/NzMzMzMzcP57neZ7ned4/F1100UUX3T+96U1vetPbP6uqqqqqqto/KVyPwvUo3D+e2Imd2IndPxzHcRzHcdw/btu2bdu23T9HWO5phOXePwAAAAAAAOA/hBBCCCGE4D8AAAAAAADgP3zwwQcffOA/AAAAAAAA4D9QB3VQB3XgPzmO4ziO4+A/whT5rBtM4T95DeU1lNfgP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhPzEMwzAMw+A/R9wRd8Qd4T900UUXXXThPxEREREREeE/C1nIQhay4D/E5ApicgXhP6uqqqqqquA/FbycgpdT4D+kcD0K16PgP/Hw8PDw8OA/sRM7sRM74T9vZZ9DaoLhP3Icx3Ecx+E/CfKUIE8J4j9u27Zt27bhP3AfwX0E9+E/lnsaYbmn4T8NJ3VfHlvhPxEREREREeE/DcE62rxP4T+MMcYYY4zhP1EURVEUReE/AAAAAAAA4T/RC73QC73gP/jggw8++OA/TKQHKme34D/x8PDw8PDgPxM/o8TPKOE/8RVf8RVf4T8OJFphcyDhPzmO4ziO4+A/iREjRowY4T/CFPmsG0zhPxEREREREeE/NpTXUF5D4T/lJ8RZ+QnhP7ETO7ETO+E/BqLSkT0D4T8zMzMzMzPhPyNl4OnW/OA/LFG7ErUr4T9T59ceclnhP0mSJEmSJOE/8fDw8PDw4D8w6Av6gr7gP93TCMs9jeA/XXTRRRdd4D8DF7jABS7gPwAAAAAAAOA/0AIt0AIt4D+GLGQhC1ngP4QQQgghhOA/QUyuICZX4D+yAmGkHSvgP1VVVVVVVeA/8MXVDzoq4D8VvJyCl1PgP3+lQK1fKeA/UrgehetR4D8cUWDSqXngP6GgoKCgoOA/9lttDE134D/sxE7sxE7gP3ACJ3ACJ+A/463sc0hN4D8hVpTGRybgPwAAAAAAAOA/WQKb9pMl4D8AAAAAAADgP04CcaHmJOA/kiRJkiRJ4D/3QwJvPyTgP34E9xHcR+A/AkVbDZ4j4D/uaYTlnkbgPzACIzACI+A/AAAAAAAA4D/gKLvfKLvfP3d3d3d3d98/jmVQKky83z8uGYJ1tHnfPzgfg/MxON8/+N5777333j8IrBxaZDvfP7/v+77v+94/0Ofz+Xw+3z8AAAAAAIDfP/AH/AF/wN8/IPiBH/iB3z97a8M0d8HfP4QPPvjgg98/c/TN0TdH3z9FeqBydgvfP5/0SZ/0Sd8/Dw8PDw8P3z/ZLKj2nEzfP/EzSvyMEt8/Rs6w4FLZ3j9f8RVf8RXfP31no76zUd8/lPHbpZ6M3z89QvWZtsbfPwAAAAAAAOA/Dnj84YDH3z8AAAAAAADgP/LX7KhFyN8/AAAAAAAA4D+ZS4QnBcnfPwAAAAAAAOA//iZ/k7/J3z8AAAAAAADgPyB1yh91yt8/cVZ+QpyV3z9hHxf2cWHfP9/yLd/yLd8/PiInCHdj3z9hfleLmzDfPzqkJhhvZd8/mpmZmZmZ3z+P5g82Hs3fP1ikDDzdmt8/sxpFHDpp3z+cj8H5GJzfP2vfsPYNa98/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "According to github, when was Regression added to "
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "According to the World Bank, which countries had g"
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ]
- ],
- "hovertemplate": "agent_name=code_o1_03_february_ablation-toolcalling-manager index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_03_february_ablation-toolcalling-manager",
- "line": {
- "color": "#FF6692",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_03_february_ablation-toolcalling-manager",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAAAAAFVVVVVVVdU/AAAAAAAA4D8zMzMzMzPjP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADkP3Icx3Ecx+E/AAAAAAAA4D900UUXXXThPwAAAAAAAOA/ntiJndiJ3T/btm3btm3bP97d3d3d3d0/AAAAAAAA3D9aWlpaWlraPxzHcRzHcdw/KK+hvIby2j/NzMzMzMzcP57neZ7ned4/F1100UUX3T+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T87sRM7sRPbPxzHcRzHcdw/27Zt27Zt2z9huacRlnvaP7y7u7u7u9s/11prrbXW2j8AAAAAAADcPyebbLLJJts/WlpaWlpa2j/btm3btm3bP6uqqqqqqto/I591gyny2T8or6G8hvLaP1y+5Vu+5ds/MzMzMzMz2z+J2pWoXYnaP3qe53me59k/s6asKWvK2j8vuuiiiy7aP1uwBVuwBds/velNb3rT2z9t1Hc26jvbP6uqqqqqqto/iMb60Fgf2j+amZmZmZnZPxkZGRkZGdk/2Ymd2Imd2D9+DqkJxlvZPy+hvYT2Eto/mpmZmZmZ2T9JkiRJkiTZPzqL6Syms9g/7mmE5Z5G2D+yFUHDSd3XP3d3d3d3d9c/EayjzfsU1z+21lprrbXWP5ZlWZZlWdY/AAAAAAAA1z9XaqVWaqXWP0422WSTTdY/jYn0QOXs1j+Ih4eHh4fXP3PtwFw7MNc/t23btm3b1j8g0QqbA4nWP47jOI7jONY/r169evXq1T/yWTeYIp/VPzCW/GLJL9Y/UV5DeQ3l1T8KcVZ+QpzVP3ZiJ3ZiJ9Y/v6vFTZjf1T9mZmZmZmbWP/PDImXg6dY/onYlalei1j/S1Pm1h1zWP4ZhGIZhGNY/1tXV1dXV1T9lTVlT1pTVP1VVVVVVVdU/F1100UUX1T9ObWpTm9rUP/VJn/RJn9Q/lVEZlVEZ1T9Ob3rTm97UP1VVVVVVVdU/1Hc26jsb1T8mTv2eW+LUP1VVVVVVVdU/Ffji6gcd1T85BS+n4OXUP1VVVVVVVdU/H4XrUbge1T+bB7nrZ4vVP/b19fX19dU/Iz6BVHJe1j92Yid2YifWP9dojdZojdY/n0NqgvFW1j9dy8HNfiHWP0xoL6G9hNY/Y251Rirm1j+x9g1r37DWPwJxoeYkENc/t23btm3b1j9WemphpafWP0xnMZ3FdNY/ZCELWchC1j9Y7mmE5Z7WP9ZmbdZmbdY/CRpO6r481j9W6AxW6AzWP2ZmZmZmZtY/fa2eHQI31j9t3qe4ZAjWP2DW+2W9X9Y/MsYYY4wx1j+6SQwCK4fWP7dt27Zt29Y/q9VqtVqt1j8AAAAAAIDWP7UlbUlb0tY/N3IjN3Ij1z/LiD6gOvbWP8omm2yyydY/3Wl1p9Wd1j+NifRA5ezWP61z5QHJOtc/iIeHh4eH1z8cKRrij1vXP3PtwFw7MNc/iOIvcoYF1z+3bdu2bdvWP1uGDtjtsdY/INEKmwOJ1j/Am0eoPtPWP6uqqqqqqtY/QzpvMaTz1j+2bNmyZcvWP6lFyF+zo9Y/yWfdYIp81j+vsjij3cPWP5020GkDndY/tNpZ7ax21j8N5TWU11DWP9aAK9aAK9Y/mRrYO6YG1j/iVSReReLVP3ZiJ3ZiJ9Y/SS9/2kID1j+/q8VNmN/VP9nnkJpgvNU/mpmZmZmZ1T+hu0oxQXfVP1VVVVVVVdU/0j5IBtQz1T8TtStRuxLVP/KUIE8J8tQ/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ]
- ],
- "hovertemplate": "agent_name=code_o1_03_february_fix-print-outputs index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_03_february_fix-print-outputs",
- "line": {
- "color": "#B6E880",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_03_february_fix-print-outputs",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAA==",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADkP1VVVVVVVeU/ZmZmZmZm5j9GF1100UXnP1VVVVVVVeU/dmIndmIn5j+3bdu2bdvmP1VVVVVVVeU/AAAAAAAA5j+1tLS0tLTkP1VVVVVVVeU/UV5DeQ3l5T9mZmZmZmbmP1VVVVVVVeU/XXTRRRdd5D9Ob3rTm97kPwAAAAAAAOQ/MzMzMzMz4z9iJ3ZiJ3biP3Icx3Ecx+E/SZIkSZIk4T+WexphuafhPyIiIiIiIuI/jDHGGGOM4T8AAAAAAADhP3TRRRdddOE/4uHh4eHh4T+SJEmSJEniP3Icx3Ecx+E/mCKfdYMp4j/zGsprKK/hP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhP2IYhmEYhuE/d8QdcUfc4T8vuuiiiy7iP9InfdInfeI/IQtZyEIW4j9HfWejvrPhPwAAAAAAAOI/aKwPjfWh4T9I4XoUrkfhP5KRkZGRkeE/sRM7sRM74T9vZZ9DaoLhP3Icx3Ecx+E/dNFFF1104T9JkiRJkiThP3UW01lMZ+E/uacRlnsa4T/VfXlsRdDgPxEREREREeE/DcE62rxP4T8IIYQQQgjhPzEMwzAMw+A/AAAAAAAA4T/RC73QC73gP3zwwQcffOA/TKQHKme34D94eHh4eHjgPwtZyEIWsuA/oQ7qoA7q4D8OJFphcyDhP1VVVVVVVeE/jBgxYsSI4T/CFPmsG0zhP36x5BdLfuE/8xrKayiv4T8De8fUwN7hP9IgDdIgDeI/pSN7BqLS4T+amZmZmZnhP8rA0635YeE/LFG7ErUr4T9T59ceclnhP0mSJEmSJOE/UVFRUVFR4T9f0Bf0BX3hP5Z7GmG5p+E/dNFFF1104T8UoQhFKELhPxEREREREeE/sRM7sRM74T8WspCFLGThPzTRRBNNNOE/BTG5gphc4T9BGGnHCoThP6uqqqqqquE/UoEvrn7Q4T99aKwPjfXhP29nSMzbGeI/PQrXo3A94j+LleEbUWDiPzIyMjIyMuI/Kjkvi/gE4j92Yid2YifiP7If+7Ef++E/UhOMt7LP4T+zX4gVpfHhP3Icx3Ecx+E/1hmpmFud4T+/Ye0b1r7hP18Z2+/oleE/btu2bdu24T+c6xjFuY7hP/Maymsor+E/HYGirQbP4T+E5Z5GWO7hP9IgDdIgDeI/RdBwUvfl4T9Sdr9Rdr/hP97d3d3d3eE/WQalwsT74T/ep7hkCNbhP/QxOB+D8+E/zjnnnHPO4T9Ei2zn+6nhP2IYhmEYhuE/aTQajUaj4T8AAAAAAMDhP2fMGXPGnOE/oRd6oRd64T9gxQkpeZbhP3TRRRdddOE/LBWxVMRS4T8qZ7fwqzHhP9wUo4a/TeE/aWlpaWlp4T/Ircs74EjhPxaykIUsZOE/P1pNQhR/4T+amZmZmZnhP8afSDileeE/RStsDiRa4T+xEzuxEzvhP8dxHMdxHOE/smsTJbs24T+JESNGjBjhPz801ofG+uA/TJHPusEU4T83YKimYy7hP0jhehSuR+E/ianEVGIq4T/YUF5DeQ3hP9F7JtF7JuE/5SfEWfkJ4T/uUN0O1e3gPyEN0iAN0uA/DVjSy5+24D+fgah0ZM/gP2dAKLlTtOA/mpmZmZmZ4D+aP9h4NH/gP6hb88MiZeA/axRx6KR94D+WqF2J2pXgPw==",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_03_february_fix-print-outputs2 index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_03_february_fix-print-outputs2",
- "line": {
- "color": "#FF97FF",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_03_february_fix-print-outputs2",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsA",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA6D+amZmZmZnpP6uqqqqqquo/t23btm3b5j8AAAAAAADoPzmO4ziO4+g/mpmZmZmZ6T9GF1100UXnP1VVVVVVVeU/dmIndmIn5j8lSZIkSZLkP1VVVVVVVeU/AAAAAAAA5j+1tLS0tLTkP+Q4juM4juM/bCivobyG4j8zMzMzMzPjP/Q8z/M8z+M/6aKLLrro4j84velNb3rjP6uqqqqqquI/MzMzMzMz4z8UO7ETO7HjP2gvob2E9uI/27Zt27Zt4z8Jyz2NsNzjP0REREREROQ/nXPOOeec4z8AAAAAAADjP2WTTTbZZOM/09LS0tLS4j8zMzMzMzPjP+Q4juM4juM/bzBFPusG4z/lNZTXUF7jPxQ7sRM7seM/AAAAAAAA5D9L1K5E7UrkPyVJkiRJkuQ/NmVNWVPW5D9ddNFFF13kP/VJn/RJn+Q/QxaykIUs5D/PRn1no77jPwAAAAAAAOQ/5hS8nIKX4z8zMzMzMzPjP3Nzc3Nzc+M/O7ETO7ET4z/BeCv7HFLjP2gvob2E9uI/MzMzMzMz4z+3bdu2bdviP2wor6G8huI/c08jLPc04j9+eWxF0HDiP6uqqqqqquI/sI4271Nc4j+VUkoppZTiP7Msy7Isy+I/AAAAAAAA4z8zMzMzMzPjP+miiy666OI/w6/GRHqg4j/T0tLS0tLiPzDXDsy1A+M/MzMzMzMz4z+/XerJ+O3iP6uqqqqqquI/kyZNmjRp4j+DKfJZN5jiP8aSXyz5xeI/bCivobyG4j+SJEmSJEniP9IgDdIgDeI/dWTPQFQ64j9mZmZmZmbiP8HTrflhkeI/uxK1K1G74j+Ops6vPeTiP8MwDMMwDOM/09LS0tLS4j+/oC/oC/riP+MFMileIOM/6aKLLrro4j8xhznMYQ7jPzMzMzMzM+M/0y/90i/94j+ykIUsZCHjP+2yyy677OI/TK4gJlcQ4z/PLXHq99ziP6uqqqqqquI/8yQyDdvN4j+n4OUUvJziP2r9SoFav+I/4XoUrkfh4j/zIHf9bLHiP4OCgoKCguI/cl4W8Qmk4j9iJ3ZiJ3biP5IkSZIkSeI/NcF4K/sc4j/2C7GiND7iP+0ltJfQXuI/OyMVc6sz4j9UgjwlyFPiPzUngbhQc+I/kiRJkiRJ4j94+yGBtx/iP+4juI/gPuI/IQtZyEIW4j9zTyMs9zTiP9IgDdIgDeI/4qTuy2Mr4j+SJEmSJEniP2ZmZmZmZuI/y6BUmHg/4j9Hm/cpLhniP/QxOB+D8+E/zjnnnHPO4T/sUbgehevhP3Icx3Ecx+E/aTQajUaj4T8AAAAAAMDhP3fEHXFH3OE/gh/4gR/44T/lWUb0AdXhP/DBBx988OE/3xx9c/TN4T92C78aE+nhPz0gWefKA+I/Hh4eHh4e4j/8cevyDjjiPyV+RomfUeI/oBvz9NFq4j+SJEmSJEniP8oVxOQKYuI/U0/Gb5d64j9EhnsVzJLiPxzHcRzHceI/bBMluzZR4j8SI0aMGDHiP5IkSZIkSeI/mCKfdYMp4j/TMZcITwriPyIiIiIiIuI/kuZIc6Q54j+vobyG8hriP1Kn/FGn/OE/y0+Is/IT4j/2cWEfF/bhP4qd2Imd2OE/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "According to the World Bank, which countries had g"
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_03_february_goodoldtext-unbroken index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_03_february_goodoldtext-unbroken",
- "line": {
- "color": "#FECB52",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_03_february_goodoldtext-unbroken",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAA==",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADoPzmO4ziO4+g/ZmZmZmZm5j9ddNFFF13kP1VVVVVVVeU/dmIndmIn5j+3bdu2bdvmP3d3d3d3d+c/AAAAAAAA5j+XlpaWlpbmP8dxHMdxHOc/UV5DeQ3l5T9mZmZmZmbmP7dt27Zt2+Y/0UUXXXTR5T9Ob3rTm97kP1VVVVVVVeU/w/UoXI/C5T/FTuzETuzkP1VVVVVVVeU/btu2bdu25T98GmG5pxHmP1VVVVVVVeU/rbXWWmut5T8AAAAAAADlP1VVVVVVVeU/tbS0tLS05D91UAd1UAflPxzHcRzHceQ/HEyRz7rB5D/YUF5DeQ3lPzVIgzRIg+Q/zczMzMzM5D9L1K5E7UrkPyVJkiRJkuQ/NmVNWVPW5D9ddNFFF13kP/VJn/RJn+Q/QxaykIUs5D/PRn1no77jPwAAAAAAAOQ/5hS8nIKX4z/Xo3A9CtfjP3Nzc3Nzc+M/O7ETO7ET4z/7HFITjLfiP+0ltJfQXuI/CfKUIE8J4j9u27Zt27bhP3AfwX0E9+E/lnsaYbmn4T8NJ3VfHlvhP5qZmZmZmeE/3qe4ZAjW4T8RQgghhBDiP3Icx3Ecx+E/AAAAAAAA4j+SG7mRG7nhP/DBBx988OE/CCpnt/Cr4T/i4eHh4eHhPyELWchCFuI/kiRJkiRJ4j9TT8Zvl3riP47jOI7jOOI/kyZNmjRp4j+YIp91gyniP+xRuB6F6+E/r6G8hvIa4j8De8fUwN7hP9IgDdIgDeI/dWTPQFQ64j8AAAAAAADiPxl4ujU/LOI/9DE4H4Pz4T/xRlPn1x7iP5IkSZIkSeI/cnJycnJy4j+PuCPuiDviP7xAJsULZOI/jC666KKL4j8rWclKVrLiP4Mt2IIt2OI/0y/90i/94j+ykIUsZCHjP+2yyy677OI/C2JyBTG54j/PLXHq99ziP6uqqqqqquI/8yQyDdvN4j+8nIKXU/DiP2r9SoFav+I/4XoUrkfh4j9brAzfiALjPyMjIyMjI+M/FvEJpJLz4j9P7MRO7MTiP3Mpl3Ipl+I/GG9ln0Nq4j85uNkvxIriP+0ltJfQXuI/OyMVc6sz4j9UgjwlyFPiP5gin3WDKeI/AAAAAAAA4j+Kcx2jONfhP3AfwX0E9+E/IQtZyEIW4j+E5Z5GWO7hP3Icx3Ecx+E/RdBwUvfl4T9yTQRyTQTiP97d3d3d3eE/52v17BC44T/ep7hkCNbhP7GRDhvpsOE/zjnnnHPO4T9Ei2zn+6nhP2IYhmEYhuE/WSwWi8Vi4T8AAAAAAIDhP2fMGXPGnOE/khu5kRu54T9gxQkpeZbhP3TRRRdddOE/BhkXZFyQ4T8IKme38KvhP3Icx3Ecx+E/pqWlpaWl4T/ij1uXd8DhPxolfkaJn+E/l8r2rgO64T+amZmZmZnhP8afSDileeE/ezJ+u9ST4T900UUXXXThP+Q4juM4juE/pPMWQzpv4T+MGDFixIjhP1uE/DU7auE/whT5rBtM4T83YKimYy7hP0jhehSuR+E/ianEVGIq4T/YUF5DeQ3hP9F7JtF7JuE/rfyEOCs/4T8j8SoSryLhP7ETO7ETO+E/OUG4G/se4T8GotKRPQPhP+vSY/5eG+E/MzMzMzMz4T/ti6jW2RfhPw==",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "What is the latest chronological year date written"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_03_february_remove-navigational index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_03_february_remove-navigational",
- "line": {
- "color": "#636efa",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_03_february_remove-navigational",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAA==",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADkP1VVVVVVVeU/MzMzMzMz4z9ddNFFF13kP1VVVVVVVeU/FDuxEzux4z+SJEmSJEniPzMzMzMzM+M/AAAAAAAA4j/x8PDw8PDgPwAAAAAAAOA/DeU1lNdQ3j8AAAAAAADgPzEMwzAMw+A/dNFFF1104T8hC1nIQhbiP6uqqqqqquI/7FG4HoXr4T+xEzuxEzvhP3Icx3Ecx+E/SZIkSZIk4T+WexphuafhPyIiIiIiIuI/lVJKKaWU4j8AAAAAAADiP22yySabbOI/09LS0tLS4j+SJEmSJEniP3Icx3Ecx+E/mCKfdYMp4j/zGsprKK/hP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhPzEMwzAMw+A/GPQFfUFf4D+66KKLLrrgPxEREREREeE/FrKQhSxk4T/E5ApicgXhP1VVVVVVVeE/aKwPjfWh4T/sUbgehevhP5KRkZGRkeE/sRM7sRM74T+pCcZb2efgP/cS2ktoL+E/37D2DWvf4D9JkiRJkiThP3UW01lMZ+E/lnsaYbmn4T8NJ3VfHlvhP5qZmZmZmeE/DcE62rxP4T8IIYQQQgjhP1EURVEUReE/AAAAAAAA4T/RC73QC73gP/jggw8++OA/TKQHKme34D/x8PDw8PDgPwtZyEIWsuA/oQ7qoA7q4D8OJFphcyDhP1VVVVVVVeE/jBgxYsSI4T/CFPmsG0zhPxEREREREeE/eQ3lNZTX4D/lJ8RZ+QnhP7ETO7ETO+E/1uImzO9q4T8zMzMzMzPhPyNl4OnW/OA/yOB8DM7H4D+FN5o6v/bgP0mSJEmSJOE/UVFRUVFR4T9f0Bf0BX3hPwOZFC+QSeE/dNFFF1104T8UoQhFKELhP8EWbMEWbOE/UhmVURmV4T8WspCFLGThPzTRRBNNNOE/xOQKYnIF4T95DeU1lNfgP6uqqqqqquA/sd0sTyLT4D8qeDkFL6fgP3o7Q2LezuA/9ihcj8L14D/sZ4uV4RvhP0FBQUFBQeE/PoFUcl4W4T+xEzuxEzvhP/EVX/EVX+E/b2WfQ2qC4T9ws1+IFaXhP3Icx3Ecx+E/1hmpmFud4T900UUXXXThP8IU+awbTOE/27Zt27Zt4T+c6xjFuY7hP3UW01lMZ+E/FG01eI5A4T+oEZZ7GmHhP7ETO7ETO+E/cVL35bEV4T/x8PDw8PDgP83MzMzMzOA/HgI3lkGp4D/S5n2KS4bgP6cQaAqBpuA/xhhjjDHG4D+kcD0K16PgPzEMwzAMw+A/OBwOh8Ph4D8AAAAAAADhP0fcEXfEHeE/sRM7sRM74T9WnJCSZxnhP/jggw8++OA/UxFLRSwV4T+7hV+NifTgPxEREREREeE/8fDw8PDw4D+7vAOOFA3hPw/MtQNz7eA/jnn6aDUJ4T9JkiRJkiThP8TkCmJyBeE/DiRaYXMg4T+xEzuxEzvhP1VVVVVVVeE/pPMWQzpv4T8LFSpUqFDhP01c6d6AMuE/whT5rBtM4T+eFCR/XmXhP36x5BdLfuE/jVvGLeOW4T+U11BeQ3nhP5KRkZGRkeE/dNFFF1104T+MMcYYY4zhP0IapEEapOE/+x6RE4S74T8+A1HpyJ7hP29ln0NqguE/ZmZmZmZm4T9epZigu0rhP/cS2ktoL+E/fJu/wqxG4T8sUbsStSvhPw==",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "According to the World Bank, which countries had g"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_03_february_text_high-reasoning-effort index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_03_february_text_high-reasoning-effort",
- "line": {
- "color": "#EF553B",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_03_february_text_high-reasoning-effort",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVdU/AAAAAAAA4D+amZmZmZnZPwAAAAAAAOA/27Zt27Zt2z8AAAAAAADYPxzHcRzHcdw/AAAAAAAA4D900UUXXXThPwAAAAAAAOA/sRM7sRM74T8AAAAAAADgP97d3d3d3d0/AAAAAAAA3D8eHh4eHh7ePxzHcRzHcdw/KK+hvIby2j+amZmZmZnZP9u2bdu2bds/L7rooosu2j+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T/ZiZ3YiZ3YPy+hvYT2Eto/27Zt27Zt2z9huacRlnvaP5qZmZmZmdk/xhhjjDHG2D8AAAAAAADaPyebbLLJJts/PDw8PDw83D8d1EEd1EHdPxzHcRzHcdw/0LrBFPms2z8or6G8hvLaPxqkQRqkQdo/mpmZmZmZ2T+J2pWoXYnaP9u2bdu2bds/s6asKWvK2j+jiy666KLbP1uwBVuwBds/velNb3rT2z9t1Hc26jvbPwAAAAAAANw/27Zt27Zt2z/hehSuR+HaP5ybm5ubm9s/O7ETO7ET2z8KxlvZ55DaPya0l9BeQts/w9o3rH3D2j/btm3btm3bPx/BfQT3Edw/GmG5pxGW2z8EDSd1Xx7bP7y7u7u7u9s/Q7CONu9T3D/nnHPOOefcPxzHcRzHcdw/AAAAAAAA3D/dyI3cyI3cPx988MEHH9w/NSbSA5Wz2z9LS0tLS0vbP64dmGsH5to/27Zt27Zt2z8yfrvUk/HbPxzHcRzHcdw/4MCBAwcO3D/QusEU+azbPylcj8L1KNw/oryG8hrK2z/5CXFWfkLcP33Lt3zLt9w/VDqyZyAq3T/NzMzMzMzcP2t+WKQMPN0/qV2J2pWo3T/3kMuKgRLeP27btm3btt0/Hh4eHh4e3j9xR9wRd8TdPyCT4gUyKd4/jC666KKL3j/vda973evePz/pkz7pk94/3uM93uM93j/f9KY3vendP5dddtlll90/eDbqOxv13T9G2rECYaTdPwAAAAAAAN4/3ixPItOw3T+Dl1PwcgrePw2JeTtDYt4/uB6F61G43j/IXT9brAzfP19fX19fX98/FEgl52UR3z8ndmIndmLfPyD7sR/7sd8/OqQmGG9l3z83+4VYURrfPwntJbSX0N4/hOjxXTiI3j/WvmHtG9beP/DolbH9jt4/kiRJkiRJ3j9bWOmphZXePwnuI7iP4N4/6k1vetOb3j9HWO5phOXePx/qoR7qod4/VwQNJ3Vf3j9fzKdezKfeP2ZmZmZmZt4/4MYyKBUm3j+KS4ZgHW3ePy6e3OLJLd4/dM4555xz3j+4HoXrUbjeP57neZ7ned4/j8fj8Xg83j8AAAAAAADeP3FH3BF3xN0/ntiJndiJ3T9Ux97aMM3dP5NNNtlkk90/Wt1pdafV3T+K9EDl7BbeP97d3d3d3d0/Hh4eHh4e3j+kaIg/bl3eP+JnlPgZJd4/le1dB3Rj3j++4iu+4iveP3rxJxJOad4/u9ST8dul3j+qz7Q1/m7eP47jOI7jON4/Y0jnLYZ03j/16tWrV6/eP7o3oExc6d4/PusGU+Sz3j8uEZ4UJH/eP7gehetRuN4/+MJ74b3w3j+H8hrKayjfP59J9J5J9N4/4qz8hDgr3z/43nvvvffeP0/sxE7sxN4/EjlBuBv73j9hfleLmzDfPzqkJhhvZd8/mpmZmZmZ3z+P5g82Hs3fP1ikDDzdmt8/sxpFHDpp3z84H4PzMTjfPwgffPDBB98/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_04_february_submission index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_04_february_submission",
- "line": {
- "color": "#00cc96",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_04_february_submission",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEA",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D+amZmZmZnpP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/AAAAAAAA4D900UUXXXThPwAAAAAAAOA/sRM7sRM74T+SJEmSJEniPxEREREREeE/AAAAAAAA4j/T0tLS0tLiP+Q4juM4juM/XkN5DeU15D/NzMzMzMzkP/Q8z/M8z+M/XXTRRRdd5D84velNb3rjP6uqqqqqquI/7FG4HoXr4T+xEzuxEzvhP3Icx3Ecx+E/kiRJkiRJ4j9PIyz3NMLiPzMzMzMzM+M/lVJKKaWU4j8AAAAAAADjP22yySabbOI/09LS0tLS4j8zMzMzMzPjP6uqqqqqquI/bzBFPusG4z9sKK+hvIbiP9IgDdIgDeI/ZmZmZmZm4j+7ErUrUbviP5IkSZIkSeI/p6wpa8qa4j/poosuuujiP9InfdInfeI/IQtZyEIW4j9HfWejvrPhP1VVVVVVVeE/PzTWh8b64D9I4XoUrkfhP/Hw8PDw8OA/2Ymd2Imd4D+pCcZb2efgP3sJ7SW0l+A/SpCnBHlK4D8AAAAAAADgP34E9xHcR+A/3dMIyz2N4D+c1H15bEXgPwAAAAAAAOA/afM+xSVD4D+EEEIIIYTgPxAEQRAEQeA/AAAAAACA4D/RC73QC73gP3zwwQcffOA/b+FXYyI94D94eHh4eHjgPwtZyEIWsuA/oQ7qoA7q4D8OJFphcyDhP1VVVVVVVeE/jBgxYsSI4T/CFPmsG0zhPxEREREREeE/eQ3lNZTX4D9WfkKclZ/gP5AGaZAGaeA/N2F+V4ub4D/NzMzMzMzgP3sJ7SW0l+A/yOB8DM7H4D+2h1xWDJTgPxiGYRiGYeA/kZCQkJCQ4D8w6Av6gr7gP93TCMs9jeA/uuiiiy664D8Oc5jDHObgP2ELtmALtuA/cQiHcAiH4D+GLGQhC1ngPyywwAILLOA/QUyuICZX4D8WCCPtWIHgP1VVVVVVVeA/8MXVDzoq4D8VvJyCl1PgP3+lQK1fKeA/AAAAAAAA4D+YdGoe5K7fP19fX19fX98/XG0MTXew3z8ndmIndmLfPyD7sR/7sd8/AAAAAAAA4D+9U9dycLPfPwAAAAAAAOA/TvvJEti03z9r37D2DWvfPyryWTeYIt8/27Zt27Zt3z8SePshgbffPwT3EdxHcN8//HVJ5cO43z8jLPc0wnLfP6D7uZ/7ud8/yFYEDSd13z/gKLvfKLvfPwAAAAAAAOA/jmVQKky83z8AAAAAAADgPyHQFAJNIeA/AAAAAAAA4D9YObTIdr7fP9/3fd/3fd8/0Ofz+Xw+3z8AAAAAAADfP9AX9AV9Qd8/P/ADP/AD3z9xQkqeZUTfPwgffPDBB98/c/TN0TdH3z9FeqBydgvfP5/0SZ/0Sd8/Dw8PDw8P3z/ZLKj2nEzfP/EzSvyMEt8/964DujFP3z9f8RVf8RXfP3usZeiA3d4/u9ST8dul3j8wS8oBkeHeP8dxHMdxHN8/Kmj1pYJW3z/58ePHjx/fP7o3oExc6d4/KvJZN5gi3z/L4ox2D1vfP5NfLPnFkt8//iZ/k7/J3z9DeQ3lNZTfPyB1yh91yt8/cVZ+QpyV3z/LX7L8JcvfPwAAAAAAAOA/S3r50xYa4D8AAAAAAADgP742Yl16zN8/AAAAAAAA4D+P5g82Hs3fP1ikDDzdmt8/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_04_february_submission-medium index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_04_february_submission-medium",
- "line": {
- "color": "#ab63fa",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_04_february_submission-medium",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eHl6e3w=",
- "dtype": "i1"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADkP3Icx3Ecx+E/MzMzMzMz4z9ddNFFF13kP1VVVVVVVeU/dmIndmIn5j8lSZIkSZLkP1VVVVVVVeU/AAAAAAAA5D/T0tLS0tLiP3Icx3Ecx+E/bCivobyG4j+amZmZmZnhP5IkSZIkSeI/6aKLLrro4j8hC1nIQhbiP1VVVVVVVeE/7FG4HoXr4T+xEzuxEzvhP3Icx3Ecx+E/kiRJkiRJ4j9PIyz3NMLiPyIiIiIiIuI/lVJKKaWU4j8AAAAAAADiP3TRRRdddOE/4uHh4eHh4T/xFV/xFV/hPzmO4ziO4+A/6wZT5LNu4D95DeU1lNfgP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhPzEMwzAMw+A/R9wRd8Qd4T+66KKLLrrgP7AFW7AFW+A/C1nIQhay4D/E5ApicgXhP6uqqqqqquA/FbycgpdT4D+kcD0K16PgP/Hw8PDw8OA/2Ymd2Imd4D+pCcZb2efgP3sJ7SW0l+A/37D2DWvf4D8lSZIkSZLgP3kN5TWU1+A/3dMIyz2N4D+c1H15bEXgPwAAAAAAAOA/afM+xSVD4D+EEEIIIYTgPzEMwzAMw+A/AAAAAACA4D/wAz/wAz/gP3zwwQcffOA/b+FXYyI94D94eHh4eHjgPwRz7cBcO+A/UAd1UAd14D82BxKtsDngPxzHcRzHceA/ggMHDhw44D8AAAAAAADgP5NfLPnFkt8/AAAAAAAA4D/H1MDeMTXgPwAAAAAAAOA/Mb+rxU2Y3z8AAAAAAADgP1ikDDzdmt8/OB+D8zE43z+U8EZT59feP57neZ7ned4/Hh4eHh4e3j+hL+gL+oLePyCT4gUyKd4/jC666KKL3j/vda973evePz/pkz7pk94/3uM93uM93j/f9KY3vendP5dddtlll90/eDbqOxv13T9G2rECYaTdPwAAAAAAAN4/nkSmYbtZ3j+sD431obHePw2JeTtDYt4/FK5H4XoU3j8pMOnUPMjdPx4eHh4eHt4/zCI+gVRy3j92Yid2YifeP57neZ7ned4/dEhNMN7K3j83+4VYURrfP4X2EtpLaN8/6fFdOIge3z9r37D2DWvfP2P7Hb0ytt8/27Zt27Zt3z8SePshgbffPwAAAAAAAOA//HVJ5cO43z8jLPc0wnLfP9/yLd/yLd8/yFYEDSd13z+fejGfejHfP+/u7u7u7t4/xfuR03yt3j9cMgTraPPePzgfg/MxON8/fO+999573z8IrBxaZDvfPw==",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "When you take the average of the standard populati"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_04_february_submission3 index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_04_february_submission3",
- "line": {
- "color": "#FFA15A",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_04_february_submission3",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMA==",
- "dtype": "i1"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADkP1VVVVVVVeU/ZmZmZmZm5j9ddNFFF13kP6uqqqqqquI/sRM7sRM74T8AAAAAAADgPxEREREREeE/AAAAAAAA4j/x8PDw8PDgPwAAAAAAAOA/DeU1lNdQ3j8AAAAAAADgP57neZ7ned4/AAAAAAAA4D/qTW9605vePwAAAAAAAOA/pHA9Ctej4D8AAAAAAADgPwntJbSX0N4/btu2bdu23T9HWO5phOXePwAAAAAAAOA/hBBCCCGE4D8AAAAAAADgPwgffPDBB98/AAAAAAAA4D9QB3VQB3XgPwAAAAAAAOA/6wZT5LNu4D8AAAAAAADgP5AGaZAGaeA/AAAAAAAA4D9kcD4G52PgPwAAAAAAAOA/0Bf0BX1B3z8AAAAAAADgP7AFW7AFW+A/AAAAAAAA4D99Z6O+s1HfPwAAAAAAAOA/1ofG+tBY3z8=",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_04_february_submission4 index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_04_february_submission4",
- "line": {
- "color": "#19d3f3",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_04_february_submission4",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAECAwQF",
- "dtype": "i1"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "According to the World Bank, which countries had g"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ]
- ],
- "hovertemplate": "agent_name=code_o1_04_february_submission5 index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_04_february_submission5",
- "line": {
- "color": "#FF6692",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_04_february_submission5",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADoPzmO4ziO4+g/mpmZmZmZ6T8vuuiiiy7qP6uqqqqqquo/O7ETO7ET6z9JkiRJkiTpP5qZmZmZmek/AAAAAAAA6j94eHh4eHjoP8dxHMdxHOc/Q3kN5TWU5z8AAAAAAADoPxiGYRiGYeg/RhdddNFF5z9kIQtZyELmP1VVVVVVVeU/exSuR+F65D8UO7ETO7HjP2gvob2E9uI/27Zt27Zt4z8Jyz2NsNzjPzMzMzMzM+M/lVJKKaWU4j8AAAAAAADjP22yySabbOI/09LS0tLS4j+SJEmSJEniP6uqqqqqquI/bzBFPusG4z/lNZTXUF7jPxQ7sRM7seM/AAAAAAAA5D9L1K5E7UrkP/Q8z/M8z+M/Bn1BX9AX5D+jiy666KLjPzMzMzMzM+M/OL3pTW964z9MriAmVxDjP1VVVVVVVeM/5hS8nIKX4z/Xo3A9CtfjPxQUFBQUFOQ/7MRO7MRO5D9NMN7KPofkP9pLaC+hveQ/XXTRRRdd5D8lSZIkSZLkP15DeQ3lNeQ/5p5GWO5p5D9fHlsRNJzkP0REREREROQ/JkOwjjbv4z+dc84555zjP/Q8z/M8z+M/AAAAAACA4z8zMzMzMzPjP+miiy666OI/oHJ2C78a4z9LS0tLS0vjPzDXDsy1A+M/4yu+4iu+4j9TT8Zvl3riP47jOI7jOOI/kB8/fvz44T+YIp91gyniP1nyiyW/WOI/bCivobyG4j8hzspPiLPiP/Mt3/It3+I/E+Z3tbgJ4z/NzMzMzMziP8HTrflhkeI/V6J2JWpX4j+/9pDLioHiP5IkSZIkSeI/EhISEhIS4j+PuCPuiDviP7xAJsULZOI/L7rooosu4j8g/ehHP/rhPyIiIiIiIuI/kiRJkiRJ4j+nN73pTW/iP5VSSimllOI/yhXE5Api4j9sKK+hvIbiP1VVVVVVVeI/EpmG7WZ54j+n4OUUvJziP2r9SoFav+I/j8L1KFyP4j/zIHf9bLHiP9PS0tLS0uI/cl4W8Qmk4j9iJ3ZiJ3biP5IkSZIkSeI/GG9ln0Nq4j/2C7GiND7iP+0ltJfQXuI/OyMVc6sz4j8J8pQgTwniP5gin3WDKeI/kiRJkiRJ4j94+yGBtx/iP3AfwX0E9+E/HYGirQbP4T+E5Z5GWO7hP9IgDdIgDeI/4qTuy2Mr4j9yTQRyTQTiPyIiIiIiIuI/y6BUmHg/4j+wjjbvU1ziPzbSYSMdNuI/U0oppZRS4j+TGARWDi3iP4IgCIIgCOI/iUQikUgk4j8AAAAAAADiP3fEHXFH3OE/gh/4gR/44T9q7oK/ihPiPy+66KKLLuI/kiRJkiRJ4j/l7BZ+NSbiPz0gWefKA+I/Hh4eHh4e4j/8cevyDjjiPyV+RomfUeI/oBvz9NFq4j87qIM6qIPiP8oVxOQKYuI/HUi0wuZA4j++CmZJOSDiP47jOI7jOOI/eoshnbcY4j8SI0aMGDHiP5IkSZIkSeI/DqbIZ91g4j+imo65RHjiP1nyiyW/WOI/kuZIc6Q54j8N5TWU11DiPxK9ZxK9Z+I/kiRJkiRJ4j8rEq8i8SriP9IgDdIgDeI/kROEu7Hv4T8NRKUjewbiP/P32oh16eE/zczMzMzM4T+w8Wj+YOPhP0bKwNOt+eE/yYB6pnLd4T/C+Ricj8HhP6YxYBoDpuE/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_22-01_managedagent-summary_planning index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_22-01_managedagent-summary_planning",
- "line": {
- "color": "#B6E880",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_22-01_managedagent-summary_planning",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQg==",
- "dtype": "i1"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADkP3Icx3Ecx+E/AAAAAAAA4D8XXXTRRRfdPwAAAAAAAOA/sRM7sRM74T8AAAAAAADgPxEREREREeE/AAAAAAAA4D8eHh4eHh7ePxzHcRzHcdw/KK+hvIby2j+amZmZmZnZPxiGYRiGYdg/RhdddNFF1z+RhSxkIQvZPwAAAAAAANg/mpmZmZmZ2T/ZiZ3YiZ3YP0J7Ce0ltNc/t23btm3b1j98GmG5pxHWP1VVVVVVVdU/pZRSSiml1D8AAAAAAADUP2WTTTbZZNM/tbS0tLS01D8WX/EVX/HVP1VVVVVVVdU/yWfdYIp81j9DeQ3lNZTXP9mJndiJndg/mpmZmZmZ2T/6GJyPwfnYP3qe53me59k/s6asKWvK2j8vuuiiiy7aP5qZmZmZmdk/pze96U1v2j9t1Hc26jvbPwAAAAAAANw/27Zt27Zt2z/hehSuR+HaP1paWlpaWto/O7ETO7ET2z+WfQ6pCcbbPxzHcRzHcdw/F1100UUX3T8lSZIkSZLcPxbTWUxnMd0/jbDc0wjL3T/msRVBw0ndP83MzMzMzNw/Q7CONu9T3D/fe++9997bP9u2bdu2bds/AAAAAAAA2z9bqZVaqZXaPyebbLLJJts/eqBydgu/2j8=",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "In the year 2022, and before December, what does \""
- ]
- ],
- "hovertemplate": "agent_name=code_o1_25-01_visioon index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_25-01_visioon",
- "line": {
- "color": "#FF97FF",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_25-01_visioon",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ=",
- "dtype": "i1"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVdU/AAAAAAAA0D+amZmZmZnJP1VVVVVVVdU/27Zt27Zt2z8AAAAAAADYP1VVVVVVVdU/MzMzMzMz0z900UUXXXTRP1VVVVVVVdU/2Ymd2Imd2D+3bdu2bdvWP5qZmZmZmdk/AAAAAAAA2D+XlpaWlpbWPzmO4ziO49g/Q3kN5TWU1z9mZmZmZmbWP1VVVVVVVdU/XXTRRRdd1D9kIQtZyELWP1VVVVVVVdU/exSuR+F61D92Yid2YifWP1VVVVVVVdU/JUmSJEmS1D8Jyz2NsNzTPzMzMzMzM9M/lVJKKaWU0j8AAAAAAADSP3TRRRdddNE/09LS0tLS0j/UQR3UQR3UP+Q4juM4jtM/HEyRz7rB1D9RXkN5DeXVP1VVVVVVVdU/ZmZmZmZm1j/blahdidrVP1VVVVVVVdU/lTVlTVlT1j/RRRdddNHVP1VVVVVVVdU/ZCELWchC1j9dQUyuICbXP6uqqqqqqtY/jfWhsT401j/D9Shcj8LVP1VVVVVVVdU/xU7sxE7s1D/Z55CaYLzVPw==",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "According to the World Bank, which countries had g"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ]
- ],
- "hovertemplate": "agent_name=code_o1_29-01_text index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o1_29-01_text",
- "line": {
- "color": "#FECB52",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o1_29-01_text",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdo",
- "dtype": "i1"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA4D+amZmZmZnZPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/mpmZmZmZ2T9GF1100UXXP1VVVVVVVdU/2Ymd2Imd2D+3bdu2bdvWP5qZmZmZmdk/AAAAAAAA3D9aWlpaWlraPzmO4ziO49g/Q3kN5TWU1z9mZmZmZmbWP1VVVVVVVdU/XXTRRRdd1D9kIQtZyELWP1VVVVVVVdU/exSuR+F61D8UO7ETO7HTP2gvob2E9tI/kiRJkiRJ0j+WexphuafRPxEREREREdE/hBBCCCGE0D8AAAAAAADQPwgffPDBB88/8fDw8PDw0D+SJEmSJEnSP+Q4juM4jtM/HEyRz7rB1D9RXkN5DeXVP5dv+ZZv+dY/AAAAAAAA2D9qV6J2JWrXPxiGYRiGYdg/9AV9QV/Q1z9GF1100UXXPxdswRZswdY/etOb3vSm1z9icgUxuYLYPwAAAAAAANg/4eUUvJyC1z8K16NwPQrXP5eWlpaWltY/dmIndmIn1j9ln0NqgvHWP0J7Ce0ltNc/cFj7hrVv2D9JkiRJkiTZPzGdxXQW09k/YbmnEZZ72j+Uui+PrQjaP5qZmZmZmdk/WEeb9yku2T/GGGOMMcbYPxiGYRiGYdg/AAAAAAAA2D8YeqEXeqHXPz744IMPPtg/SQ9Uzm7h1z+Ih4eHh4fXP4K5dmCuHdg/+Yqv+Iqv2D/RCpsDiVbYPwAAAAAAANg/vXr16tWr1z+fdYMp8lnXP+UXS36x5Nc/Q3kN5TWU1z9kamDvmBrYP9mJndiJndg/OrJnICod2T/NzMzMzMzYP5Ey8HRrftg/Mjgfg/Mx2D+q82sPuazYPxiGYRiGYdg/GBgYGBgY2D8k7og74o7YP+5phOWeRtg/AAAAAAAA2D983ete97rXP9iCLdiCLdg/2Ymd2Imd2D+GLGQhC1nYP8YYY4wxxtg/YnIFMbmC2D8LhJF2rEDYPwAAAAAAANg/2G6WJ5Fp2D801ofG+tDYPzbZZJNNNtk/mpmZmZmZ2T96kLt+tljZP7q5ubm5udk/i/gEUsl52T+xEzuxEzvZP9mP/diP/dg/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "What's the last line of the rhyme under the flavor"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "In Valentina Reโs contribution to the 2017 book โW"
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "Compute the check digit the Tropicos ID for the Or"
- ],
- [
- "Could you help me out with this assignment? Our pr"
- ],
- [
- "Given this table defining * on the set S = {a, b, "
- ],
- [
- "What time was the Tri-Rail train that carried the "
- ],
- [
- "In the fictional language of Tizin, basic sentence"
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "I was trying to remember how well the Cheater Beat"
- ],
- [
- "The attached file contains a list of vendors in th"
- ],
- [
- "Review the chess position provided in the image. I"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "Who nominated the only Featured Article on English"
- ],
- [
- "The Latin root of the Yola word \"gimlie\" shares a "
- ],
- [
- "The attached file shows a list of books in the col"
- ],
- [
- "According to Google Finance, when was the first ye"
- ],
- [
- "Using bass clef notes, what is the age of someone "
- ],
- [
- "On a leap day before the year 2008, a joke was rem"
- ],
- [
- "On July 15, 2008, Phys.org published an article ab"
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "If there is anything that doesn't make sense in th"
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "The following numbers function similarly to ISBN 1"
- ],
- [
- "In the year 2022, and before December, what does \""
- ],
- [
- "What is the volume in milliliters of a system comp"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "The attached file lists accommodations in the reso"
- ],
- [
- "In the NIH translation of the original 1913 Michae"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ],
- [
- "You are Van Helsing, a renowned vampire hunter. A "
- ],
- [
- "Find the value of x to the nearest tenth: Lx = (d/"
- ],
- [
- "You are a telecommunications engineer who wants to"
- ],
- [
- "According to Box Office Mojo's 2020 Worldwide Box "
- ],
- [
- "How many applicants for the job in the PDF are onl"
- ],
- [
- "As of the 2020 census, what was the population dif"
- ],
- [
- "The Metropolitan Museum of Art has a portrait in i"
- ],
- [
- "How many slides in this PowerPoint presentation me"
- ],
- [
- "This is a secret message my friend gave me. It say"
- ],
- [
- "According to wikipedia, how many Asian countries s"
- ],
- [
- "The work referenced in footnote 397 of Federico La"
- ],
- [
- "I was referencing each of the tables in the file f"
- ],
- [
- "In Nature journal's Scientific Reports conference "
- ],
- [
- "The attached file shows the locomotives in the col"
- ],
- [
- "How many nonindigenous crocodiles were found in Fl"
- ],
- [
- "As a comma separated list with no whitespace, usin"
- ],
- [
- "According to the World Bank, which countries had g"
- ],
- [
- "The attached spreadsheet contains the sales of men"
- ],
- [
- "Who composed the song that was performed by a roos"
- ],
- [
- "I'm making a grocery list for my mom, but she's a "
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "In the 2018 VSCode blog post on replit.com, what w"
- ],
- [
- "Look at the attached image. The quiz is scored as "
- ],
- [
- "What writer is quoted by Merriam-Webster for the W"
- ],
- [
- "Examine the video at https://www.youtube.com/watch"
- ],
- [
- "Hi, I'm making a pie but I could use some help wit"
- ],
- [
- "In the Scikit-Learn July 2017 changelog, what othe"
- ],
- [
- "You are given this Excel file as a map. You start "
- ],
- [
- "How many images are there in the latest 2022 Lego "
- ],
- [
- "The attached image contains a Python script. Run t"
- ],
- [
- "I thought we could try a fun word puzzle together "
- ],
- [
- "On ScienceDirect, what is the difference to 3 deci"
- ],
- [
- "What is the final numeric output from the attached"
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "How many more blocks (also denoted as layers) in B"
- ],
- [
- "The longest-lived vertebrate is named after an isl"
- ],
- [
- "On the DeepFruits fruit detection graph on Connect"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "The attached PDF lists accommodations in the resor"
- ],
- [
- "This spreadsheet contains a list of clients for a "
- ],
- [
- "How many times was a Twitter/X post cited as a ref"
- ],
- [
- "During the first week of August 2015, one of the N"
- ],
- [
- "What is the surname of the equine veterinarian men"
- ],
- [
- "The YouTube channel Game Grumps began a Letโs Play"
- ],
- [
- "What is the last word before the second chorus of "
- ],
- [
- "Who did the actor who played Ray in the Polish-lan"
- ],
- [
- "I have the Standard plan in the image below, and I"
- ],
- [
- "In the endnote found in the second-to-last paragra"
- ],
- [
- "The book with the doi 10.1353/book.24372 concerns "
- ],
- [
- "Pull out the sentence in the following 5x7 block o"
- ],
- [
- "What is the latest chronological year date written"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- "Eva Draconis has a personal website which can be a"
- ],
- [
- "How many at bats did the Yankee with the most walk"
- ],
- [
- "According to Girls Who Code, how long did it take "
- ],
- [
- "The attached spreadsheet contains a list of books "
- ],
- [
- "How many pages if the 2023 IPCC report (85 pages v"
- ],
- [
- "It's May 2023, and I'm about to drive across the U"
- ],
- [
- "In Audre Lordeโs poem โFather Son and Holy Ghostโ,"
- ],
- [
- "On Cornell Law School website's legal information "
- ],
- [
- "How many edits were made to the Wikipedia page on "
- ],
- [
- "Consider the following symbols: ๐ ๐๐\n\nThis is a n"
- ],
- [
- "On the BBC Earth YouTube video of the Top 5 Sillie"
- ],
- [
- "What is the absolute difference in tens of thousan"
- ],
- [
- "The attached spreadsheet lists the locomotives own"
- ],
- [
- "The attached file lists the locomotives owned by a"
- ],
- [
- "Iโm thinking about selling my home, so I want to l"
- ],
- [
- "When was a picture of St. Thomas Aquinas first add"
- ],
- [
- "As of August 2023, who is the only winner of the U"
- ],
- [
- "Take the gender split from the 2011 Bulgarian cens"
- ],
- [
- "All of the individuals who formally held the posit"
- ],
- [
- "Hi, I was out sick from my classes on Friday, so I"
- ],
- [
- "If this whole pint is made up of ice cream, how ma"
- ],
- [
- "Which of the fruits shown in the 2008 painting \"Em"
- ],
- [
- "What country had the least number of athletes at t"
- ],
- [
- "In the YouTube 360 VR video from March 2018 narrat"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "Where were the Vietnamese specimens described by K"
- ],
- [
- "The cover of the August 2021 issue of Vogue shows "
- ],
- [
- "I'd like to learn more about some popular reality "
- ],
- [
- "I read a paper about multiwavelength observations "
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$,"
- ],
- [
- "A standard Rubikโs cube has been broken into cubes"
- ],
- [
- "According to the USGS, in what year was the Americ"
- ],
- [
- "The attached Excel file contains the sales of menu"
- ],
- [
- "I'm curious about how much information is availabl"
- ],
- [
- "What percentage of the total penguin population ac"
- ],
- [
- "As of May 2023, how many stops are between South S"
- ],
- [
- "According to Openreview.net, at the NeurIPS 2022 C"
- ],
- [
- "Of the cities within the United States where U.S. "
- ],
- [
- "Who are the pitchers with the number before and af"
- ],
- [
- "In the 2015 Metropolitan Museum of Art exhibition "
- ],
- [
- "On June 6, 2023, an article by Carolyn Collins Pet"
- ],
- [
- "What is the area of the green polygon in the attac"
- ],
- [
- "What is the first name of the only Malko Competiti"
- ],
- [
- "The brand that makes these harnesses the dogs are "
- ],
- [
- "The year is 2022. I am at the National Air and Spa"
- ],
- [
- "What was the actual enrollment count of the clinic"
- ],
- [
- "What was the complete title of the book in which t"
- ],
- [
- "Bob was invited to participate in a game show, and"
- ],
- [
- "In NASA's Astronomy Picture of the Day on 2006 Jan"
- ],
- [
- "At the two-minute mark in the YouTube video upload"
- ],
- [
- "In the film Goldfinger, what color was the object "
- ],
- [
- "A 5-man group made up of one tank, one healer, and"
- ]
- ],
- "hovertemplate": "agent_name=code_o3-mini_03_february_remove-navigational index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_o3-mini_03_february_remove-navigational",
- "line": {
- "color": "#636efa",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_o3-mini_03_february_remove-navigational",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA",
- "dtype": "i2"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVdU/AAAAAAAA0D+amZmZmZnJP1VVVVVVVcU/kiRJkiRJwj8AAAAAAADAPxzHcRzHccw/mpmZmZmZyT9GF1100UXHPwAAAAAAANA/FDuxEzux0z+SJEmSJEnSP1VVVVVVVdU/AAAAAAAA1D/T0tLS0tLSP3Icx3Ecx9E/XkN5DeU11D9mZmZmZmbWP1VVVVVVVdU/RhdddNFF1z9kIQtZyELWP1VVVVVVVdU/exSuR+F61D8UO7ETO7HTP2gvob2E9tI/kiRJkiRJ0j+WexphuafRPzMzMzMzM9M/lVJKKaWU0j8AAAAAAADSP2WTTTbZZNM/09LS0tLS0j+SJEmSJEnSP3Icx3Ecx9E/whT5rBtM0T9sKK+hvIbSP9IgDdIgDdI/mpmZmZmZ0T+7ErUrUbvSP5IkSZIkSdI/1pQ1ZU1Z0z9ddNFFF13UP5Q+6ZM+6dM/OL3pTW960z9MriAmVxDTP6uqqqqqqtI/kiRJkiRJ0j8zMzMzMzPTP9PS0tLS0tI/FDuxEzux0z/BeCv7HFLTP19CewntJdQ/yFOCPCXI0z/btm3btm3TP2cxncV0FtM/Ccs9jbDc0z/vy2MrgobTPzMzMzMzM9M/JkOwjjbv0z+llFJKKaXUP1VVVVVVVdU/AAAAAAAA1T+WWqmVWqnVP1VVVVVVVdU/F341JtID1T+mpaWlpaXVP1VVVVVVVdU/Fl/xFV/x1T9ItMLmQKLVP1VVVVVVVdU/r169evXq1T/yWTeYIp/VP1VVVVVVVdU/2FBeQ3kN1T/sHVMDe8fUP1VVVVVVVdU/ICod2TMQ1T/NzMzMzMzUPwaebs0Pi9Q/S9SuRO1K1D/6tYdcVgzUPyVJkiRJktQ/VFRUVFRU1D8GfUFf0BfUPwnLPY2w3NM/o4suuuii0z83talNbWrTP5Q+6ZM+6dM/VEZlVEZl1D9DFrKQhSzUP6WUUkoppdQ/Ut/ZqO9s1D8mTv2eW+LUP1VVVVVVVdU/Ffji6gcd1T85BS+n4OXUP1VVVVVVVdU/H4XrUbge1T/L8I0oMOnUP7W0tLS0tNQ/k/OyiE8g1T/FTuzETuzUP5VLuZRLudQ/E4y3ss8h1T9RGh+ZQO/UP1VVVVVVVdU/NFIxtzoj1T+HtW9Y+4bVP1VVVVVVVdU/SZIkSZIk1T/DSk8trPTUP1pMZzGdxdQ/SeXDuF+X1D/mnkZY7mnUP9RDPdRDPdQ/J3VfHlsR1D+U3W+U3W/UP0RERERERNQ/69khcGMZ1D8mQ7CONu/TP0vUrkTtStQ/IYQQQggh1D97FK5H4XrUPxRFURRFUdQ/CoVCoVAo1D8AAAAAAADUP/aEPWFP2NM/FDuxEzux0z+Hae6Cv4rTP+GDDz744NM/qzut7rS60z9+NSbSA5XTP/42xajhb9M/S0tLS0tL0z8xNguqPSfTPzDXDsy1A9M/UfxFzrDg0j8zMzMzMzPTP0yuICZXENM/K2wOJFph0z8UO7ETO7HTPwAAAAAAANQ/Ccs9jbDc0z+hQoUKFSrUPwJl4kr3BtQ/RT7rBlPk0z8M1XTMJcLTP6DTBjptoNM/n65P16fr0z+ivIbyGsrTP1T+qFP+qNM/zspPiLPy0z/SExw9wdHTPxQ7sRM7sdM/Qbgb+x6R0z/jJszvanHTP8F4K/scUtM/MzMzMzMz0z9Wigm6qxTTP2gvob2E9tI/n6lcd7zY0j+7ErUrUbvSP54S5ClBntI/",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "customdata": [
- [
- "A paper about AI regulation that was originally su"
- ],
- [
- "If we assume all articles published by Nature in 2"
- ],
- [
- "In Unlambda, what exact charcter or text needs to "
- ],
- [
- "Iโm researching species that became invasive after"
- ],
- [
- "The attached spreadsheet shows the inventory for a"
- ],
- [
- "How many studio albums were published by Mercedes "
- ],
- [
- "If Eliud Kipchoge could maintain his record-making"
- ],
- [
- "The object in the British Museum's collection with"
- ],
- [
- "According to github, when was Regression added to "
- ],
- [
- "Here's a fun riddle that I think you'll enjoy.\n\nYo"
- ],
- [
- "Using the Biopython library in Python, parse the P"
- ],
- [
- "What are the EC numbers of the two most commonly u"
- ],
- [
- "In July 2, 1959 United States standards for grades"
- ],
- [
- "In April of 1977, who was the Prime Minister of th"
- ],
- [
- "Use density measures from the chemistry materials "
- ],
- [
- "What was the volume in m^3 of the fish bag that wa"
- ],
- [
- "What is the average number of pre-2020 works on th"
- ],
- [
- "In the video https://www.youtube.com/watch?v=L1vXC"
- ],
- [
- "Of the authors (First M. Last) that worked on the "
- ],
- [
- "When you take the average of the standard populati"
- ],
- [
- "Assuming scientists in the famous youtube video Th"
- ],
- [
- "In Series 9, Episode 11 of Doctor Who, the Doctor "
- ],
- [
- "In terms of geographical distance between capital "
- ],
- [
- "In the NCATS PubChem compound database for Food Ad"
- ],
- [
- "I need to fact-check a citation. This is the citat"
- ],
- [
- "Which contributor to the version of OpenCV where s"
- ],
- [
- "What integer-rounded percentage of the total lengt"
- ],
- [
- "An office held a Secret Santa gift exchange where "
- ],
- [
- "What is the maximum length in meters of #9 in the "
- ],
- [
- "What two-word type of model did Manash Pratim Kash"
- ],
- [
- "What animals that were mentioned in both Ilias Lag"
- ],
- [
- "How many High Energy Physics - Lattice articles li"
- ],
- [
- "The photograph in the Whitney Museum of American A"
- ],
- [
- ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti"
- ],
- [
- "What is the minimum number of page links a person "
- ],
- [
- "Each cell in the attached spreadsheet represents a"
- ],
- [
- "Which of the text elements under CATEGORIES in the"
- ],
- [
- "I went to Virtue restaurant & bar in Chicago for m"
- ],
- [
- "ยฌ(A โง B) โ (ยฌA โจ ยฌB)\nยฌ(A โจ B) โ (ยฌA โง ยฌB)\n(A โ B) "
- ],
- [
- "My family reunion is this week, and I was assigned"
- ],
- [
- "In Emily Midkiff's June 2014 article in a journal "
- ],
- [
- "It is 1999. Before you party like it is 1999, plea"
- ],
- [
- "Under DDC 633 on Bielefeld University Library's BA"
- ]
- ],
- "hovertemplate": "agent_name=code_qwen-coder-32B_03_february_text index=%{x} is_correct=%{y} question=%{customdata[0]}",
- "legendgroup": "code_qwen-coder-32B_03_february_text",
- "line": {
- "color": "#EF553B",
- "dash": "solid"
- },
- "marker": {
- "symbol": "circle"
- },
- "mode": "lines",
- "name": "code_qwen-coder-32B_03_february_text",
- "showlegend": true,
- "type": "scattergl",
- "x": {
- "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKg==",
- "dtype": "i1"
- },
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVdU/AAAAAAAA0D+amZmZmZnZP1VVVVVVVdU/kiRJkiRJ0j8AAAAAAADQPxzHcRzHccw/mpmZmZmZyT9GF1100UXHP1VVVVVVVcU/FDuxEzuxwz+SJEmSJEnCP5qZmZmZmck/AAAAAAAA0D8eHh4eHh7OPxzHcRzHccw/KK+hvIbyyj+amZmZmZnJP57neZ7nec4/F1100UUXzT+96U1vetPLP6uqqqqqqso/mpmZmZmZyT/ZiZ3YiZ3IP0J7Ce0ltMc/t23btm3bxj98GmG5pxHGP1VVVVVVVcU/pZRSSimlxD8AAAAAAADEP2WTTTbZZMM/l5aWlpaWxj8WX/EVX/HFPzmO4ziO48g/doMp8lk3yD9DeQ3lNZTHPxqkQRqkQco/zczMzMzMzD8ZnI/B+RjMP9u2bdu2bcs/s6asKWvKyj8=",
- "dtype": "f8"
- },
- "yaxis": "y"
- }
- ],
- "layout": {
- "legend": {
- "title": {
- "text": "agent_name"
- },
- "tracegroupgap": 0
- },
- "margin": {
- "t": 60
- },
- "template": {
- "data": {
- "bar": [
- {
- "error_x": {
- "color": "#2a3f5f"
- },
- "error_y": {
- "color": "#2a3f5f"
- },
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- },
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "bar"
- }
- ],
- "barpolar": [
- {
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- },
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "barpolar"
- }
- ],
- "carpet": [
- {
- "aaxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "baxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "type": "carpet"
- }
- ],
- "choropleth": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "choropleth"
- }
- ],
- "contour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "contour"
- }
- ],
- "contourcarpet": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "contourcarpet"
- }
- ],
- "heatmap": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmap"
- }
- ],
- "histogram": [
- {
- "marker": {
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "histogram"
- }
- ],
- "histogram2d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2d"
- }
- ],
- "histogram2dcontour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2dcontour"
- }
- ],
- "mesh3d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "mesh3d"
- }
- ],
- "parcoords": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "parcoords"
- }
- ],
- "pie": [
- {
- "automargin": true,
- "type": "pie"
- }
- ],
- "scatter": [
- {
- "fillpattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- },
- "type": "scatter"
- }
- ],
- "scatter3d": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatter3d"
- }
- ],
- "scattercarpet": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattercarpet"
- }
- ],
- "scattergeo": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergeo"
- }
- ],
- "scattergl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergl"
- }
- ],
- "scattermap": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattermap"
- }
- ],
- "scattermapbox": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattermapbox"
- }
- ],
- "scatterpolar": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolar"
- }
- ],
- "scatterpolargl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolargl"
- }
- ],
- "scatterternary": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterternary"
- }
- ],
- "surface": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "surface"
- }
- ],
- "table": [
- {
- "cells": {
- "fill": {
- "color": "#EBF0F8"
- },
- "line": {
- "color": "white"
- }
- },
- "header": {
- "fill": {
- "color": "#C8D4E3"
- },
- "line": {
- "color": "white"
- }
- },
- "type": "table"
- }
- ]
- },
- "layout": {
- "annotationdefaults": {
- "arrowcolor": "#2a3f5f",
- "arrowhead": 0,
- "arrowwidth": 1
- },
- "autotypenumbers": "strict",
- "coloraxis": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "colorscale": {
- "diverging": [
- [
- 0,
- "#8e0152"
- ],
- [
- 0.1,
- "#c51b7d"
- ],
- [
- 0.2,
- "#de77ae"
- ],
- [
- 0.3,
- "#f1b6da"
- ],
- [
- 0.4,
- "#fde0ef"
- ],
- [
- 0.5,
- "#f7f7f7"
- ],
- [
- 0.6,
- "#e6f5d0"
- ],
- [
- 0.7,
- "#b8e186"
- ],
- [
- 0.8,
- "#7fbc41"
- ],
- [
- 0.9,
- "#4d9221"
- ],
- [
- 1,
- "#276419"
- ]
- ],
- "sequential": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "sequentialminus": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ]
- },
- "colorway": [
- "#636efa",
- "#EF553B",
- "#00cc96",
- "#ab63fa",
- "#FFA15A",
- "#19d3f3",
- "#FF6692",
- "#B6E880",
- "#FF97FF",
- "#FECB52"
- ],
- "font": {
- "color": "#2a3f5f"
- },
- "geo": {
- "bgcolor": "white",
- "lakecolor": "white",
- "landcolor": "#E5ECF6",
- "showlakes": true,
- "showland": true,
- "subunitcolor": "white"
- },
- "hoverlabel": {
- "align": "left"
- },
- "hovermode": "closest",
- "mapbox": {
- "style": "light"
- },
- "paper_bgcolor": "white",
- "plot_bgcolor": "#E5ECF6",
- "polar": {
- "angularaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "radialaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "scene": {
- "xaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "yaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "zaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- }
- },
- "shapedefaults": {
- "line": {
- "color": "#2a3f5f"
- }
- },
- "ternary": {
- "aaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "baxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "caxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "title": {
- "x": 0.05
- },
- "xaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- },
- "yaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- }
- }
- },
- "xaxis": {
- "anchor": "y",
- "domain": [
- 0,
- 1
- ],
- "title": {
- "text": "index"
- }
- },
- "yaxis": {
- "anchor": "x",
- "domain": [
- 0,
- 1
- ],
- "title": {
- "text": "is_correct"
- }
- }
- }
- }
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"import plotly.express as px\n",
"\n",
@@ -10788,9 +245,6 @@
"\n",
"\n",
"cumulative_df[\"question\"] = cumulative_df.apply(find_question, axis=1)\n",
- "# cumulative_df[\"question\"] = [el[:50] for el in sel_df[\"question\"].values]\n",
- "\n",
- "# cumulative_df[\"is_correct\"] = cumulative_df[\"is_correct\"] * (165 - 68) / 165\n",
"\n",
"px.line(\n",
" cumulative_df,\n",
@@ -10810,19 +264,11 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "165\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "sel_df = result_df.loc[result_df[\"agent_name\"] == o1]\n",
+ "sel_df = result_df.loc[result_df[\"agent_name\"] == \"o1\"]\n",
"print(len(sel_df))"
]
},
@@ -10835,56 +281,9 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n",
- "\n",
- "\n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- "\n",
- "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n",
- "\n",
- "\n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- "\n",
- "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n",
- "\n",
- "\n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- "\n",
- "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n",
- "\n",
- "\n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- "\n",
- "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:11: SettingWithCopyWarning:\n",
- "\n",
- "\n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import numpy as np\n",
"\n",
@@ -10916,890 +315,9 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.plotly.v1+json": {
- "config": {
- "plotlyServerURL": "https://plot.ly"
- },
- "data": [
- {
- "hovertemplate": "is_correct=False variable=%{x} Average count=%{y}",
- "legendgroup": "False",
- "marker": {
- "color": "#636efa",
- "pattern": {
- "shape": ""
- }
- },
- "name": "False",
- "orientation": "v",
- "showlegend": true,
- "textposition": "outside",
- "type": "bar",
- "x": [
- "AgentParsingError",
- "AgentExecutionError",
- "AgentMaxIterationsError",
- "AgentGenerationError",
- "Count steps"
- ],
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACSJEmSJEkMQA==",
- "dtype": "f8"
- },
- "yaxis": "y"
- },
- {
- "hovertemplate": "is_correct=True variable=%{x} Average count=%{y}",
- "legendgroup": "True",
- "marker": {
- "color": "#EF553B",
- "pattern": {
- "shape": ""
- }
- },
- "name": "True",
- "orientation": "v",
- "showlegend": true,
- "textposition": "outside",
- "type": "bar",
- "x": [
- "AgentParsingError",
- "AgentExecutionError",
- "AgentMaxIterationsError",
- "AgentGenerationError",
- "Count steps"
- ],
- "xaxis": "x",
- "y": {
- "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABPt+aHRcoIQA==",
- "dtype": "f8"
- },
- "yaxis": "y"
- }
- ],
- "layout": {
- "bargroupgap": 0,
- "barmode": "group",
- "height": 500,
- "legend": {
- "title": {
- "text": "is_correct"
- },
- "tracegroupgap": 0
- },
- "margin": {
- "t": 60
- },
- "template": {
- "data": {
- "bar": [
- {
- "error_x": {
- "color": "#2a3f5f"
- },
- "error_y": {
- "color": "#2a3f5f"
- },
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- },
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "bar"
- }
- ],
- "barpolar": [
- {
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- },
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "barpolar"
- }
- ],
- "carpet": [
- {
- "aaxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "baxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "type": "carpet"
- }
- ],
- "choropleth": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "choropleth"
- }
- ],
- "contour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "contour"
- }
- ],
- "contourcarpet": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "contourcarpet"
- }
- ],
- "heatmap": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmap"
- }
- ],
- "histogram": [
- {
- "marker": {
- "pattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- }
- },
- "type": "histogram"
- }
- ],
- "histogram2d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2d"
- }
- ],
- "histogram2dcontour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2dcontour"
- }
- ],
- "mesh3d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "mesh3d"
- }
- ],
- "parcoords": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "parcoords"
- }
- ],
- "pie": [
- {
- "automargin": true,
- "type": "pie"
- }
- ],
- "scatter": [
- {
- "fillpattern": {
- "fillmode": "overlay",
- "size": 10,
- "solidity": 0.2
- },
- "type": "scatter"
- }
- ],
- "scatter3d": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatter3d"
- }
- ],
- "scattercarpet": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattercarpet"
- }
- ],
- "scattergeo": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergeo"
- }
- ],
- "scattergl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergl"
- }
- ],
- "scattermap": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattermap"
- }
- ],
- "scattermapbox": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattermapbox"
- }
- ],
- "scatterpolar": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolar"
- }
- ],
- "scatterpolargl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolargl"
- }
- ],
- "scatterternary": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterternary"
- }
- ],
- "surface": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "surface"
- }
- ],
- "table": [
- {
- "cells": {
- "fill": {
- "color": "#EBF0F8"
- },
- "line": {
- "color": "white"
- }
- },
- "header": {
- "fill": {
- "color": "#C8D4E3"
- },
- "line": {
- "color": "white"
- }
- },
- "type": "table"
- }
- ]
- },
- "layout": {
- "annotationdefaults": {
- "arrowcolor": "#2a3f5f",
- "arrowhead": 0,
- "arrowwidth": 1
- },
- "autotypenumbers": "strict",
- "coloraxis": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "colorscale": {
- "diverging": [
- [
- 0,
- "#8e0152"
- ],
- [
- 0.1,
- "#c51b7d"
- ],
- [
- 0.2,
- "#de77ae"
- ],
- [
- 0.3,
- "#f1b6da"
- ],
- [
- 0.4,
- "#fde0ef"
- ],
- [
- 0.5,
- "#f7f7f7"
- ],
- [
- 0.6,
- "#e6f5d0"
- ],
- [
- 0.7,
- "#b8e186"
- ],
- [
- 0.8,
- "#7fbc41"
- ],
- [
- 0.9,
- "#4d9221"
- ],
- [
- 1,
- "#276419"
- ]
- ],
- "sequential": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "sequentialminus": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ]
- },
- "colorway": [
- "#636efa",
- "#EF553B",
- "#00cc96",
- "#ab63fa",
- "#FFA15A",
- "#19d3f3",
- "#FF6692",
- "#B6E880",
- "#FF97FF",
- "#FECB52"
- ],
- "font": {
- "color": "#2a3f5f"
- },
- "geo": {
- "bgcolor": "white",
- "lakecolor": "white",
- "landcolor": "#E5ECF6",
- "showlakes": true,
- "showland": true,
- "subunitcolor": "white"
- },
- "hoverlabel": {
- "align": "left"
- },
- "hovermode": "closest",
- "mapbox": {
- "style": "light"
- },
- "paper_bgcolor": "white",
- "plot_bgcolor": "#E5ECF6",
- "polar": {
- "angularaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "radialaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "scene": {
- "xaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "yaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "zaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- }
- },
- "shapedefaults": {
- "line": {
- "color": "#2a3f5f"
- }
- },
- "ternary": {
- "aaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "baxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "caxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "title": {
- "x": 0.05
- },
- "xaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- },
- "yaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- }
- }
- },
- "width": 800,
- "xaxis": {
- "anchor": "y",
- "domain": [
- 0,
- 1
- ],
- "title": {
- "text": "variable"
- }
- },
- "yaxis": {
- "anchor": "x",
- "domain": [
- 0,
- 1
- ],
- "title": {
- "text": "Average count"
- }
- }
- }
- }
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"import plotly.express as px\n",
"\n",
@@ -11841,153 +359,9 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
is_correct
\n",
- "
count_steps
\n",
- "
question
\n",
- "
\n",
- "
\n",
- "
attachment_type
\n",
- "
\n",
- "
\n",
- "
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
None
\n",
- "
0.423799
\n",
- "
4.959725
\n",
- "
2185
\n",
- "
\n",
- "
\n",
- "
csv
\n",
- "
0.000000
\n",
- "
7.750000
\n",
- "
16
\n",
- "
\n",
- "
\n",
- "
docx
\n",
- "
0.571429
\n",
- "
4.904762
\n",
- "
21
\n",
- "
\n",
- "
\n",
- "
jpg
\n",
- "
0.142857
\n",
- "
5.750000
\n",
- "
28
\n",
- "
\n",
- "
\n",
- "
jsonld
\n",
- "
0.000000
\n",
- "
6.600000
\n",
- "
15
\n",
- "
\n",
- "
\n",
- "
mp3
\n",
- "
0.480000
\n",
- "
4.500000
\n",
- "
50
\n",
- "
\n",
- "
\n",
- "
pdb
\n",
- "
0.000000
\n",
- "
4.444444
\n",
- "
18
\n",
- "
\n",
- "
\n",
- "
pdf
\n",
- "
0.588235
\n",
- "
4.137255
\n",
- "
51
\n",
- "
\n",
- "
\n",
- "
png
\n",
- "
0.216783
\n",
- "
4.412587
\n",
- "
143
\n",
- "
\n",
- "
\n",
- "
pptx
\n",
- "
0.882353
\n",
- "
4.058824
\n",
- "
17
\n",
- "
\n",
- "
\n",
- "
py
\n",
- "
1.000000
\n",
- "
4.266667
\n",
- "
15
\n",
- "
\n",
- "
\n",
- "
txt
\n",
- "
0.705882
\n",
- "
4.764706
\n",
- "
17
\n",
- "
\n",
- "
\n",
- "
xlsx
\n",
- "
0.612745
\n",
- "
4.823529
\n",
- "
204
\n",
- "
\n",
- "
\n",
- "
zip
\n",
- "
0.448276
\n",
- "
5.344828
\n",
- "
29
\n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " is_correct count_steps question\n",
- "attachment_type \n",
- "None 0.423799 4.959725 2185\n",
- "csv 0.000000 7.750000 16\n",
- "docx 0.571429 4.904762 21\n",
- "jpg 0.142857 5.750000 28\n",
- "jsonld 0.000000 6.600000 15\n",
- "mp3 0.480000 4.500000 50\n",
- "pdb 0.000000 4.444444 18\n",
- "pdf 0.588235 4.137255 51\n",
- "png 0.216783 4.412587 143\n",
- "pptx 0.882353 4.058824 17\n",
- "py 1.000000 4.266667 15\n",
- "txt 0.705882 4.764706 17\n",
- "xlsx 0.612745 4.823529 204\n",
- "zip 0.448276 5.344828 29"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"display(\n",
" result_df.groupby([\"attachment_type\"])[[\"is_correct\", \"count_steps\", \"question\"]].agg(\n",
@@ -12005,7 +379,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -12015,52 +389,9 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "agent_name\n",
- "code_gpt4o_03_february_goodoldtext-unbroken 38.36\n",
- "code_gpt4o_03_february_magenticbrowser 35.22\n",
- "code_gpt4o_03_february_magenticbrowser2 36.54\n",
- "code_gpt4o_03_february_text 37.58\n",
- "code_o1_01_february_text 49.09\n",
- "code_o1_03_february_ablation-toolcalling-manager 32.73\n",
- "code_o1_03_february_fix-print-outputs 51.83\n",
- "code_o1_03_february_fix-print-outputs2 55.77\n",
- "code_o1_03_february_goodoldtext-unbroken 53.42\n",
- "code_o1_03_february_remove-navigational 53.66\n",
- "code_o1_03_february_text_high-reasoning-effort 48.48\n",
- "code_o1_04_february_submission 49.38\n",
- "code_o1_04_february_submission5 55.15\n",
- "code_o3-mini_03_february_remove-navigational 29.09\n",
- "Name: is_correct, dtype: float64"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Majority score: 58.18\n",
- "Oracle score: 72.73\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/3287428472.py:20: DeprecationWarning:\n",
- "\n",
- "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"def majority_vote(df):\n",
" df = df[(df[\"prediction\"] != \"Unable to determine\") & (~df[\"prediction\"].isna()) & (df[\"prediction\"] != \"None\")]\n",
@@ -12100,7 +431,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -12112,7 +443,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -12129,9 +460,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "gaia",
+ "display_name": "test",
"language": "python",
- "name": "python3"
+ "name": "test"
},
"language_info": {
"codemirror_mode": {
diff --git a/examples/open_deep_research/app.py b/examples/open_deep_research/app.py
new file mode 100644
index 000000000..a7f884faa
--- /dev/null
+++ b/examples/open_deep_research/app.py
@@ -0,0 +1,11 @@
+from run import create_agent
+
+from smolagents.gradio_ui import GradioUI
+
+
+agent = create_agent()
+
+demo = GradioUI(agent)
+
+if __name__ == "__main__":
+ demo.launch()
diff --git a/examples/open_deep_research/requirements.txt b/examples/open_deep_research/requirements.txt
index a18936ae4..4fe0e0e2a 100644
--- a/examples/open_deep_research/requirements.txt
+++ b/examples/open_deep_research/requirements.txt
@@ -1,4 +1,5 @@
anthropic>=0.37.1
+audioop-lts<1.0; python_version >= "3.13" # required to use pydub in Python >=3.13; LTS port of the removed Python builtin module audioop
beautifulsoup4>=4.12.3
datasets>=2.21.0
google_search_results>=2.4.2
diff --git a/examples/open_deep_research/run.py b/examples/open_deep_research/run.py
index 2dcddab4f..be1ad38a5 100644
--- a/examples/open_deep_research/run.py
+++ b/examples/open_deep_research/run.py
@@ -11,7 +11,6 @@
FindNextTool,
PageDownTool,
PageUpTool,
- SearchInformationTool,
SimpleTextBrowser,
VisitTool,
)
@@ -19,38 +18,13 @@
from smolagents import (
CodeAgent,
- # HfApiModel,
+ GoogleSearchTool,
+ # InferenceClientModel,
LiteLLMModel,
ToolCallingAgent,
)
-AUTHORIZED_IMPORTS = [
- "requests",
- "zipfile",
- "os",
- "pandas",
- "numpy",
- "sympy",
- "json",
- "bs4",
- "pubchempy",
- "xml",
- "yahoo_finance",
- "Bio",
- "sklearn",
- "scipy",
- "pydub",
- "io",
- "PIL",
- "chess",
- "PyPDF2",
- "pptx",
- "torch",
- "datetime",
- "fractions",
- "csv",
-]
load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))
@@ -83,22 +57,20 @@ def parse_args():
os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)
-def main():
- args = parse_args()
- text_limit = 100000
-
- model = LiteLLMModel(
- args.model_id,
- custom_role_conversions=custom_role_conversions,
- max_completion_tokens=8192,
- reasoning_effort="high",
- )
- document_inspection_tool = TextInspectorTool(model, text_limit)
+def create_agent(model_id="o1"):
+ model_params = {
+ "model_id": model_id,
+ "custom_role_conversions": custom_role_conversions,
+ "max_completion_tokens": 8192,
+ }
+ if model_id == "o1":
+ model_params["reasoning_effort"] = "high"
+ model = LiteLLMModel(**model_params)
+ text_limit = 100000
browser = SimpleTextBrowser(**BROWSER_CONFIG)
-
WEB_TOOLS = [
- SearchInformationTool(browser),
+ GoogleSearchTool(provider="serper"),
VisitTool(browser),
PageUpTool(browser),
PageDownTool(browser),
@@ -107,7 +79,6 @@ def main():
ArchiveSearchTool(browser),
TextInspectorTool(model, text_limit),
]
-
text_webbrowser_agent = ToolCallingAgent(
model=model,
tools=WEB_TOOLS,
@@ -129,15 +100,23 @@ def main():
manager_agent = CodeAgent(
model=model,
- tools=[visualizer, document_inspection_tool],
+ tools=[visualizer, TextInspectorTool(model, text_limit)],
max_steps=12,
verbosity_level=2,
- additional_authorized_imports=AUTHORIZED_IMPORTS,
+ additional_authorized_imports=["*"],
planning_interval=4,
managed_agents=[text_webbrowser_agent],
)
- answer = manager_agent.run(args.question)
+ return manager_agent
+
+
+def main():
+ args = parse_args()
+
+ agent = create_agent(model_id=args.model_id)
+
+ answer = agent.run(args.question)
print(f"Got this answer: {answer}")
diff --git a/examples/open_deep_research/run_gaia.py b/examples/open_deep_research/run_gaia.py
index fa59fc03e..192081787 100644
--- a/examples/open_deep_research/run_gaia.py
+++ b/examples/open_deep_research/run_gaia.py
@@ -1,3 +1,4 @@
+# EXAMPLE COMMAND: python examples/open_deep_research/run_gaia.py --concurrency 32 --run-name generate-traces-03-apr-noplanning --model-id gpt-4o
import argparse
import json
import os
@@ -5,7 +6,6 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
-from typing import List
import datasets
import pandas as pd
@@ -23,7 +23,6 @@
FindNextTool,
PageDownTool,
PageUpTool,
- SearchInformationTool,
SimpleTextBrowser,
VisitTool,
)
@@ -32,39 +31,13 @@
from smolagents import (
CodeAgent,
- # HfApiModel,
+ GoogleSearchTool,
LiteLLMModel,
Model,
ToolCallingAgent,
)
-AUTHORIZED_IMPORTS = [
- "requests",
- "zipfile",
- "os",
- "pandas",
- "numpy",
- "sympy",
- "json",
- "bs4",
- "pubchempy",
- "xml",
- "yahoo_finance",
- "Bio",
- "sklearn",
- "scipy",
- "pydub",
- "io",
- "PIL",
- "chess",
- "PyPDF2",
- "pptx",
- "torch",
- "datetime",
- "fractions",
- "csv",
-]
load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))
@@ -121,14 +94,14 @@ def preprocess_file_paths(row):
os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)
-def create_agent_hierarchy(model: Model):
+def create_agent_team(model: Model):
text_limit = 100000
ti_tool = TextInspectorTool(model, text_limit)
browser = SimpleTextBrowser(**BROWSER_CONFIG)
WEB_TOOLS = [
- SearchInformationTool(browser),
+ GoogleSearchTool(provider="serper"),
VisitTool(browser),
PageUpTool(browser),
PageDownTool(browser),
@@ -137,6 +110,7 @@ def create_agent_hierarchy(model: Model):
ArchiveSearchTool(browser),
TextInspectorTool(model, text_limit),
]
+
text_webbrowser_agent = ToolCallingAgent(
model=model,
tools=WEB_TOOLS,
@@ -161,7 +135,7 @@ def create_agent_hierarchy(model: Model):
tools=[visualizer, ti_tool],
max_steps=12,
verbosity_level=2,
- additional_authorized_imports=AUTHORIZED_IMPORTS,
+ additional_authorized_imports=["*"],
planning_interval=4,
managed_agents=[text_webbrowser_agent],
)
@@ -178,21 +152,20 @@ def append_answer(entry: dict, jsonl_file: str) -> None:
def answer_single_question(example, model_id, answers_file, visual_inspection_tool):
- model = LiteLLMModel(
- model_id,
- custom_role_conversions=custom_role_conversions,
- max_completion_tokens=8192,
- reasoning_effort="high",
- )
- # model = HfApiModel("Qwen/Qwen2.5-72B-Instruct", provider="together")
- # "https://lnxyuvj02bpe6mam.us-east-1.aws.endpoints.huggingface.cloud",
- # custom_role_conversions=custom_role_conversions,
- # # provider="sambanova",
- # max_tokens=8096,
- # )
+ model_params = {
+ "model_id": model_id,
+ "custom_role_conversions": custom_role_conversions,
+ }
+ if model_id == "o1":
+ model_params["reasoning_effort"] = "high"
+ model_params["max_completion_tokens"] = 8192
+ else:
+ model_params["max_tokens"] = 4096
+ model = LiteLLMModel(**model_params)
+ # model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="together", max_tokens=4096)
document_inspection_tool = TextInspectorTool(model, 100000)
- agent = create_agent_hierarchy(model)
+ agent = create_agent_team(model)
augmented_question = """You have one question to answer. It is paramount that you provide a correct answer.
Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded.
@@ -218,14 +191,14 @@ def answer_single_question(example, model_id, answers_file, visual_inspection_to
# Run agent ๐
final_result = agent.run(augmented_question)
- agent_memory = agent.write_memory_to_messages(summary_mode=True)
+ agent_memory = agent.write_memory_to_messages()
final_result = prepare_response(augmented_question, agent_memory, reformulation_model=model)
output = str(final_result)
for memory_step in agent.memory.steps:
memory_step.model_input_messages = None
- intermediate_steps = [str(step) for step in agent.memory.steps]
+ intermediate_steps = agent_memory
# Check for parsing errors which indicate the LLM failed to follow the required format
parsing_error = True if any(["AgentParsingError" in step for step in intermediate_steps]) else False
@@ -243,6 +216,12 @@ def answer_single_question(example, model_id, answers_file, visual_inspection_to
exception = e
raised_exception = True
end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ token_counts_manager = agent.monitor.get_total_token_counts()
+ token_counts_web = list(agent.managed_agents.values())[0].monitor.get_total_token_counts()
+ total_token_counts = {
+ "input": token_counts_manager["input"] + token_counts_web["input"],
+ "output": token_counts_manager["output"] + token_counts_web["output"],
+ }
annotated_example = {
"agent_name": model.model_id,
"question": example["question"],
@@ -252,16 +231,17 @@ def answer_single_question(example, model_id, answers_file, visual_inspection_to
"parsing_error": parsing_error,
"iteration_limit_exceeded": iteration_limit_exceeded,
"agent_error": str(exception) if raised_exception else None,
- "start_time": start_time,
- "end_time": end_time,
"task": example["task"],
"task_id": example["task_id"],
"true_answer": example["true_answer"],
+ "start_time": start_time,
+ "end_time": end_time,
+ "token_counts": total_token_counts,
}
append_answer(annotated_example, answers_file)
-def get_examples_to_answer(answers_file, eval_ds) -> List[dict]:
+def get_examples_to_answer(answers_file, eval_ds) -> list[dict]:
print(f"Loading answers from {answers_file}...")
try:
done_questions = pd.read_json(answers_file, lines=True)["question"].tolist()
diff --git a/examples/open_deep_research/scripts/mdconvert.py b/examples/open_deep_research/scripts/mdconvert.py
index 68f13a28b..939cd121a 100644
--- a/examples/open_deep_research/scripts/mdconvert.py
+++ b/examples/open_deep_research/scripts/mdconvert.py
@@ -14,7 +14,7 @@
import tempfile
import traceback
import zipfile
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
import mammoth
@@ -112,22 +112,22 @@ def convert_soup(self, soup: Any) -> str:
class DocumentConverterResult:
"""The result of converting a document to text."""
- def __init__(self, title: Union[str, None] = None, text_content: str = ""):
- self.title: Union[str, None] = title
+ def __init__(self, title: str | None = None, text_content: str = ""):
+ self.title: str | None = title
self.text_content: str = text_content
class DocumentConverter:
"""Abstract superclass of all DocumentConverters."""
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
raise NotImplementedError()
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
# Guess the content type from any file extension that might be around
content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
@@ -149,7 +149,7 @@ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConvert
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
# Bail if not html
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
@@ -161,7 +161,7 @@ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConvert
return result
- def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
+ def _convert(self, html_content: str) -> None | DocumentConverterResult:
"""Helper function that converts and HTML string."""
# Parse the string
@@ -189,7 +189,7 @@ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
# Bail if not Wikipedia
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
@@ -234,7 +234,7 @@ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConvert
class YouTubeConverter(DocumentConverter):
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
# Bail if not YouTube
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
@@ -250,7 +250,7 @@ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConvert
# Read the meta tags
assert soup.title is not None and soup.title.string is not None
- metadata: Dict[str, str] = {"title": soup.title.string}
+ metadata: dict[str, str] = {"title": soup.title.string}
for meta in soup(["meta"]):
for a in meta.attrs:
if a in ["itemprop", "property", "name"]:
@@ -328,13 +328,13 @@ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConvert
text_content=webpage_text,
)
- def _get(self, metadata: Dict[str, str], keys: List[str], default: Union[str, None] = None) -> Union[str, None]:
+ def _get(self, metadata: dict[str, str], keys: list[str], default: str | None = None) -> str | None:
for k in keys:
if k in metadata:
return metadata[k]
return default
- def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type
+ def _findKey(self, json: Any, key: str) -> str | None: # TODO: Fix json type
if isinstance(json, list):
for elm in json:
ret = self._findKey(elm, key)
@@ -356,7 +356,7 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
# Bail if not a PDF
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf":
@@ -373,7 +373,7 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
"""
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx":
@@ -393,7 +393,7 @@ class XlsxConverter(HtmlConverter):
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".xlsx", ".xls"]:
@@ -417,7 +417,7 @@ class PptxConverter(HtmlConverter):
Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
"""
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
# Bail if not a PPTX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pptx":
@@ -520,7 +520,7 @@ class WavConverter(MediaConverter):
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
"""
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".wav":
@@ -570,7 +570,7 @@ class Mp3Converter(WavConverter):
Converts MP3 and M4A files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
"""
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
# Bail if not a MP3
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".mp3", ".m4a"]:
@@ -644,7 +644,7 @@ def __init__(self, extract_dir: str = "downloads"):
# Create the extraction directory if it doesn't exist
os.makedirs(self.extract_dir, exist_ok=True)
- def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
# Bail if not a ZIP file
extension = kwargs.get("file_extension", "")
if extension.lower() != ".zip":
@@ -681,7 +681,7 @@ class ImageConverter(MediaConverter):
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
"""
- def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+ def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
# Bail if not a XLSX
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
@@ -771,9 +771,9 @@ class MarkdownConverter:
def __init__(
self,
- requests_session: Optional[requests.Session] = None,
- mlm_client: Optional[Any] = None,
- mlm_model: Optional[Any] = None,
+ requests_session: requests.Session | None = None,
+ mlm_client: Any | None = None,
+ mlm_model: Any | None = None,
):
if requests_session is None:
self._requests_session = requests.Session()
@@ -783,7 +783,7 @@ def __init__(
self._mlm_client = mlm_client
self._mlm_model = mlm_model
- self._page_converters: List[DocumentConverter] = []
+ self._page_converters: list[DocumentConverter] = []
# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
@@ -802,7 +802,7 @@ def __init__(
self.register_page_converter(PdfConverter())
def convert(
- self, source: Union[str, requests.Response], **kwargs: Any
+ self, source: str | requests.Response, **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs
"""
Args:
@@ -924,7 +924,7 @@ def convert_response(
return result
- def _convert(self, local_path: str, extensions: List[Union[str, None]], **kwargs) -> DocumentConverterResult:
+ def _convert(self, local_path: str, extensions: list[str | None], **kwargs) -> DocumentConverterResult:
error_trace = ""
for ext in extensions + [None]: # Try last with no extension
for converter in self._page_converters:
diff --git a/examples/open_deep_research/scripts/text_inspector_tool.py b/examples/open_deep_research/scripts/text_inspector_tool.py
index 056168cee..2b1e18ae5 100644
--- a/examples/open_deep_research/scripts/text_inspector_tool.py
+++ b/examples/open_deep_research/scripts/text_inspector_tool.py
@@ -1,9 +1,5 @@
-from typing import Optional
-
from smolagents import Tool
-from smolagents.models import MessageRole, Model
-
-from .mdconvert import MarkdownConverter
+from smolagents.models import Model
class TextInspectorTool(Tool):
@@ -24,14 +20,18 @@ class TextInspectorTool(Tool):
},
}
output_type = "string"
- md_converter = MarkdownConverter()
- def __init__(self, model: Model, text_limit: int):
+ def __init__(self, model: Model = None, text_limit: int = 100000):
super().__init__()
self.model = model
self.text_limit = text_limit
+ from .mdconvert import MarkdownConverter
+
+ self.md_converter = MarkdownConverter()
def forward_initial_exam_mode(self, file_path, question):
+ from smolagents.models import MessageRole
+
result = self.md_converter.convert(file_path)
if file_path[-4:] in [".png", ".jpg"]:
@@ -73,7 +73,9 @@ def forward_initial_exam_mode(self, file_path, question):
]
return self.model(messages).content
- def forward(self, file_path, question: Optional[str] = None) -> str:
+ def forward(self, file_path, question: str | None = None) -> str:
+ from smolagents.models import MessageRole
+
result = self.md_converter.convert(file_path)
if file_path[-4:] in [".png", ".jpg"]:
diff --git a/examples/open_deep_research/scripts/text_web_browser.py b/examples/open_deep_research/scripts/text_web_browser.py
index ef40f8551..044128edb 100644
--- a/examples/open_deep_research/scripts/text_web_browser.py
+++ b/examples/open_deep_research/scripts/text_web_browser.py
@@ -6,7 +6,7 @@
import re
import time
import uuid
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any
from urllib.parse import unquote, urljoin, urlparse
import pathvalidate
@@ -24,19 +24,19 @@ class SimpleTextBrowser:
def __init__(
self,
- start_page: Optional[str] = None,
- viewport_size: Optional[int] = 1024 * 8,
- downloads_folder: Optional[Union[str, None]] = None,
- serpapi_key: Optional[Union[str, None]] = None,
- request_kwargs: Optional[Union[Dict[str, Any], None]] = None,
+ start_page: str | None = None,
+ viewport_size: int | None = 1024 * 8,
+ downloads_folder: str | None | None = None,
+ serpapi_key: str | None | None = None,
+ request_kwargs: dict[str, Any] | None | None = None,
):
self.start_page: str = start_page if start_page else "about:blank"
self.viewport_size = viewport_size # Applies only to the standard uri types
self.downloads_folder = downloads_folder
- self.history: List[Tuple[str, float]] = list()
- self.page_title: Optional[str] = None
+ self.history: list[tuple[str, float]] = list()
+ self.page_title: str | None = None
self.viewport_current_page = 0
- self.viewport_pages: List[Tuple[int, int]] = list()
+ self.viewport_pages: list[tuple[int, int]] = list()
self.set_address(self.start_page)
self.serpapi_key = serpapi_key
self.request_kwargs = request_kwargs
@@ -44,15 +44,15 @@ def __init__(
self._mdconvert = MarkdownConverter()
self._page_content: str = ""
- self._find_on_page_query: Union[str, None] = None
- self._find_on_page_last_result: Union[int, None] = None # Location of the last result
+ self._find_on_page_query: str | None = None
+ self._find_on_page_last_result: int | None = None # Location of the last result
@property
def address(self) -> str:
"""Return the address of the current page."""
return self.history[-1][0]
- def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None:
+ def set_address(self, uri_or_path: str, filter_year: int | None = None) -> None:
# TODO: Handle anchors
self.history.append((uri_or_path, time.time()))
@@ -102,7 +102,7 @@ def page_down(self) -> None:
def page_up(self) -> None:
self.viewport_current_page = max(self.viewport_current_page - 1, 0)
- def find_on_page(self, query: str) -> Union[str, None]:
+ def find_on_page(self, query: str) -> str | None:
"""Searches for the query from the current viewport forward, looping back to the start if necessary."""
# Did we get here via a previous find_on_page search with the same query?
@@ -121,7 +121,7 @@ def find_on_page(self, query: str) -> Union[str, None]:
self._find_on_page_last_result = viewport_match
return self.viewport
- def find_next(self) -> Union[str, None]:
+ def find_next(self) -> str | None:
"""Scroll to the next viewport that matches the query"""
if self._find_on_page_query is None:
@@ -144,7 +144,7 @@ def find_next(self) -> Union[str, None]:
self._find_on_page_last_result = viewport_match
return self.viewport
- def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
+ def _find_next_viewport(self, query: str, starting_viewport: int) -> int | None:
"""Search for matches between the starting viewport looping when reaching the end."""
if query is None:
@@ -174,7 +174,7 @@ def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int,
return None
- def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str:
+ def visit_page(self, path_or_uri: str, filter_year: int | None = None) -> str:
"""Update the address, visit the page, and return the content of the viewport."""
self.set_address(path_or_uri, filter_year=filter_year)
return self.viewport
@@ -201,7 +201,7 @@ def _split_pages(self) -> None:
self.viewport_pages.append((start_idx, end_idx))
start_idx = end_idx
- def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None:
+ def _serpapi_search(self, query: str, filter_year: int | None = None) -> None:
if self.serpapi_key is None:
raise ValueError("Missing SerpAPI key.")
@@ -231,7 +231,7 @@ def _prev_visit(url):
return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
return ""
- web_snippets: List[str] = list()
+ web_snippets: list[str] = list()
idx = 0
if "organic_results" in results:
for page in results["organic_results"]:
@@ -352,7 +352,7 @@ def _fetch_page(self, url: str) -> None:
self.page_title = "Error"
self._set_page_content(f"## Error\n\n{str(request_exception)}")
- def _state(self) -> Tuple[str, str]:
+ def _state(self) -> tuple[str, str]:
header = f"Address: {self.address}\n"
if self.page_title is not None:
header += f"Title: {self.page_title}\n"
@@ -385,7 +385,7 @@ def __init__(self, browser):
super().__init__()
self.browser = browser
- def forward(self, query: str, filter_year: Optional[int] = None) -> str:
+ def forward(self, query: str, filter_year: int | None = None) -> str:
self.browser.visit_page(f"google: {query}", filter_year=filter_year)
header, content = self.browser._state()
return header.strip() + "\n=======================\n" + content
@@ -397,7 +397,7 @@ class VisitTool(Tool):
inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}}
output_type = "string"
- def __init__(self, browser):
+ def __init__(self, browser=None):
super().__init__()
self.browser = browser
@@ -421,6 +421,8 @@ def __init__(self, browser):
self.browser = browser
def forward(self, url: str) -> str:
+ import requests
+
if "arxiv" in url:
url = url.replace("abs", "pdf")
response = requests.get(url)
@@ -452,11 +454,13 @@ class ArchiveSearchTool(Tool):
}
output_type = "string"
- def __init__(self, browser):
+ def __init__(self, browser=None):
super().__init__()
self.browser = browser
def forward(self, url, date) -> str:
+ import requests
+
no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
archive_url = no_timestamp_url + f"&timestamp={date}"
response = requests.get(archive_url).json()
@@ -487,7 +491,7 @@ class PageUpTool(Tool):
inputs = {}
output_type = "string"
- def __init__(self, browser):
+ def __init__(self, browser=None):
super().__init__()
self.browser = browser
@@ -505,7 +509,7 @@ class PageDownTool(Tool):
inputs = {}
output_type = "string"
- def __init__(self, browser):
+ def __init__(self, browser=None):
super().__init__()
self.browser = browser
@@ -526,7 +530,7 @@ class FinderTool(Tool):
}
output_type = "string"
- def __init__(self, browser):
+ def __init__(self, browser=None):
super().__init__()
self.browser = browser
@@ -549,7 +553,7 @@ class FindNextTool(Tool):
inputs = {}
output_type = "string"
- def __init__(self, browser):
+ def __init__(self, browser=None):
super().__init__()
self.browser = browser
diff --git a/examples/open_deep_research/scripts/visual_qa.py b/examples/open_deep_research/scripts/visual_qa.py
index 84d240b66..01d60b30a 100644
--- a/examples/open_deep_research/scripts/visual_qa.py
+++ b/examples/open_deep_research/scripts/visual_qa.py
@@ -4,23 +4,21 @@
import os
import uuid
from io import BytesIO
-from typing import Optional
+import PIL.Image
import requests
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
-from PIL import Image
-from transformers import AutoProcessor
from smolagents import Tool, tool
load_dotenv(override=True)
-idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
-
def process_images_and_text(image_path, query, client):
+ from transformers import AutoProcessor
+
messages = [
{
"role": "user",
@@ -30,7 +28,7 @@ def process_images_and_text(image_path, query, client):
],
},
]
-
+ idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)
# load images from local directory
@@ -38,7 +36,7 @@ def process_images_and_text(image_path, query, client):
# encode images to strings which can be sent to the endpoint
def encode_local_image(image_path):
# load image
- image = Image.open(image_path).convert("RGB")
+ image = PIL.Image.open(image_path).convert("RGB")
# Convert the image to a base64 string
buffer = BytesIO()
@@ -95,11 +93,8 @@ def encode_image(image_path):
return base64.b64encode(image_file.read()).decode("utf-8")
-headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}
-
-
def resize_image(image_path):
- img = Image.open(image_path)
+ img = PIL.Image.open(image_path)
width, height = img.size
img = img.resize((int(width / 2), int(height / 2)))
new_image_path = f"resized_{image_path}"
@@ -121,7 +116,7 @@ class VisualQATool(Tool):
client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
- def forward(self, image_path: str, question: Optional[str] = None) -> str:
+ def forward(self, image_path: str, question: str | None = None) -> str:
output = ""
add_note = False
if not question:
@@ -144,13 +139,19 @@ def forward(self, image_path: str, question: Optional[str] = None) -> str:
@tool
-def visualizer(image_path: str, question: Optional[str] = None) -> str:
+def visualizer(image_path: str, question: str | None = None) -> str:
"""A tool that can answer questions about attached images.
Args:
image_path: The path to the image on which to answer the question. This should be a local path to downloaded image.
question: The question to answer.
"""
+ import mimetypes
+ import os
+
+ import requests
+
+ from .visual_qa import encode_image
add_note = False
if not question:
@@ -175,6 +176,7 @@ def visualizer(image_path: str, question: Optional[str] = None) -> str:
],
"max_tokens": 1000,
}
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
try:
output = response.json()["choices"][0]["message"]["content"]
diff --git a/examples/open_deep_research/visual_vs_text_browser.ipynb b/examples/open_deep_research/visual_vs_text_browser.ipynb
index 9bb4ee8dc..4a85a465a 100644
--- a/examples/open_deep_research/visual_vs_text_browser.ipynb
+++ b/examples/open_deep_research/visual_vs_text_browser.ipynb
@@ -102,7 +102,7 @@
"from smolagents import CodeAgent, LiteLLMModel\n",
"\n",
"\n",
- "proprietary_model = LiteLLMModel(\"gpt-4o\")"
+ "proprietary_model = LiteLLMModel(model_id=\"gpt-4o\")"
]
},
{
@@ -178,7 +178,7 @@
")\n",
"\n",
"\n",
- "proprietary_model = LiteLLMModel(\"gpt-4o\")\n",
+ "proprietary_model = LiteLLMModel(model_id=\"gpt-4o\")\n",
"vision_browser_agent = initialize_agent(proprietary_model)\n",
"### BUILD AGENTS & TOOLS\n",
"\n",
diff --git a/examples/rag.py b/examples/rag.py
index f5a2e2cb1..3ff572fb3 100644
--- a/examples/rag.py
+++ b/examples/rag.py
@@ -28,11 +28,11 @@
class RetrieverTool(Tool):
name = "retriever"
- description = "Uses semantic search to retrieve the parts of transformers documentation that could be most relevant to answer your query."
+ description = "Uses lexical search to retrieve the parts of transformers documentation that could be most relevant to answer your query."
inputs = {
"query": {
"type": "string",
- "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
+ "description": "The query to perform. This should be lexically close to your target documents. Use the affirmative form rather than a question.",
}
}
output_type = "string"
@@ -52,13 +52,13 @@ def forward(self, query: str) -> str:
)
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
retriever_tool = RetrieverTool(docs_processed)
agent = CodeAgent(
tools=[retriever_tool],
- model=HfApiModel("meta-llama/Llama-3.3-70B-Instruct"),
+ model=InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct"),
max_steps=4,
verbosity_level=2,
)
diff --git a/examples/rag_using_chromadb.py b/examples/rag_using_chromadb.py
index 864bfc848..fa2764355 100644
--- a/examples/rag_using_chromadb.py
+++ b/examples/rag_using_chromadb.py
@@ -97,8 +97,8 @@ def forward(self, query: str) -> str:
# Choose which LLM engine to use!
-# from smolagents import HfApiModel
-# model = HfApiModel(model_id="meta-llama/Llama-3.3-70B-Instruct")
+# from smolagents import InferenceClientModel
+# model = InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct")
# from smolagents import TransformersModel
# model = TransformersModel(model_id="meta-llama/Llama-3.2-2B-Instruct")
diff --git a/examples/sandboxed_execution.py b/examples/sandboxed_execution.py
new file mode 100644
index 000000000..25e4fb771
--- /dev/null
+++ b/examples/sandboxed_execution.py
@@ -0,0 +1,12 @@
+from smolagents import CodeAgent, DuckDuckGoSearchTool, InferenceClientModel
+
+
+model = InferenceClientModel()
+
+agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model, executor_type="docker")
+output = agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?")
+print("Docker executor result:", output)
+
+agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model, executor_type="e2b")
+output = agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?")
+print("E2B executor result:", output)
diff --git a/examples/smolagents_benchmark/run.py b/examples/smolagents_benchmark/run.py
new file mode 100644
index 000000000..f2b60eb58
--- /dev/null
+++ b/examples/smolagents_benchmark/run.py
@@ -0,0 +1,254 @@
+import argparse
+import datetime
+import json
+import os
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+import datasets
+import pandas as pd
+from dotenv import load_dotenv
+from tqdm import tqdm
+
+from smolagents import (
+ AgentError,
+ CodeAgent,
+ GoogleSearchTool,
+ InferenceClientModel,
+ LiteLLMModel,
+ PythonInterpreterTool,
+ ToolCallingAgent,
+ VisitWebpageTool,
+)
+from smolagents.agents import ActionStep
+
+
+load_dotenv()
+os.makedirs("output", exist_ok=True)
+
+APPEND_ANSWER_LOCK = threading.Lock()
+
+
+def parse_arguments():
+ parser = argparse.ArgumentParser(description="Runs an agent powered by the given model on smolagent benchmark.")
+ parser.add_argument(
+ "--date",
+ type=str,
+ default=None,
+ help="The date for the evaluation.",
+ )
+ parser.add_argument(
+ "--eval-dataset",
+ type=str,
+ default="smolagents/benchmark-v1",
+ )
+ # The eval dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents-benchmark/benchmark-v1
+ parser.add_argument(
+ "--model-type",
+ type=str,
+ default="InferenceClientModel",
+ choices=["LiteLLMModel", "InferenceClientModel"],
+ help="The model type to use (LiteLLMModel or InferenceClientModel)",
+ )
+ parser.add_argument(
+ "--model-id",
+ type=str,
+ required=True,
+ help="The model ID to use for the specified model type",
+ )
+ parser.add_argument(
+ "--provider",
+ type=str,
+ default="hf-inference",
+ help="The provider for InferenceClientModel - will not be used for LiteLLMModel",
+ )
+ parser.add_argument(
+ "--agent-action-type",
+ type=str,
+ default="code",
+ choices=["code", "tool-calling", "vanilla"],
+ help="The agent action type: 'code', 'tool-calling', or 'vanilla' to use the vanilla llm",
+ )
+ parser.add_argument(
+ "--parallel-workers",
+ type=int,
+ default=8,
+ help="The number of processes to run in parallel",
+ )
+ parser.add_argument(
+ "--push-answers-to-hub",
+ action="store_true",
+ default=False,
+ help="Push the answers to the hub",
+ )
+ parser.add_argument(
+ "--answers-dataset",
+ type=str,
+ default="smolagents/answers",
+ )
+ return parser.parse_args()
+
+
+def load_eval_dataset(eval_dataset):
+ # Choose the tasks to evaluate on:
+ # tasks = ["gaia"]
+ # or evaluate on all tasks: ["gaia", "math", "simpleqa"]
+ tasks = datasets.get_dataset_config_names(eval_dataset)
+ print(tasks)
+
+ eval_ds = {task: datasets.load_dataset(eval_dataset, task, split="test") for task in tasks}
+ print(pd.DataFrame(eval_ds["simpleqa"]).head())
+ return eval_ds
+
+
+def serialize_agent_error(obj):
+ if isinstance(obj, AgentError):
+ return {"error_type": obj.__class__.__name__, "message": obj.message}
+ else:
+ return str(obj)
+
+
+def append_answer(entry: dict, jsonl_file: str) -> None:
+ jsonl_file = Path(jsonl_file)
+ jsonl_file.parent.mkdir(parents=True, exist_ok=True)
+ with APPEND_ANSWER_LOCK, open(jsonl_file, "a", encoding="utf-8") as fp:
+ fp.write(json.dumps(entry) + "\n")
+ assert os.path.exists(jsonl_file), "File not found!"
+
+
+def answer_single_question(example, model, answers_file, action_type):
+ if action_type == "vanilla":
+ agent = model
+ elif action_type == "code":
+ agent = CodeAgent(
+ tools=[GoogleSearchTool(provider="serper"), VisitWebpageTool()],
+ model=model,
+ additional_authorized_imports=["numpy", "sympy"],
+ max_steps=10,
+ )
+ elif action_type == "tool-calling":
+ agent = ToolCallingAgent(
+ tools=[GoogleSearchTool(provider="serper"), VisitWebpageTool(), PythonInterpreterTool()],
+ model=model,
+ additional_authorized_imports=["numpy", "sympy"],
+ max_steps=10,
+ )
+
+ augmented_question = example["question"]
+ if example["source"] == "SimpleQA":
+ augmented_question += " Answer with only the final number."
+ if example["source"] == "MATH":
+ augmented_question += " Write code, not latex."
+
+ start_time = time.time()
+
+ try:
+ if action_type == "vanilla":
+ answer = agent([{"role": "user", "content": augmented_question}]).content
+ token_counts = agent.monitor.get_total_token_counts()
+ intermediate_steps = answer
+ else:
+            # Run agent 🚀
+ answer = str(agent.run(augmented_question))
+ token_counts = agent.monitor.get_total_token_counts()
+ # Remove memory from logs to make them more compact.
+ for step in agent.memory.steps:
+ if isinstance(step, ActionStep):
+ step.agent_memory = None
+ intermediate_steps = str(agent.memory.steps)
+
+ end_time = time.time()
+ except Exception as e:
+ print("Error on ", augmented_question, e)
+ intermediate_steps = []
+ token_counts = {"input": 0, "output": 0}
+ answer = str(e)
+ end_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ annotated_example = {
+ "model_id": model.model_id,
+ "agent_action_type": action_type,
+ "question": augmented_question,
+ "original_question": example["question"],
+ "answer": answer,
+ "true_answer": example["true_answer"],
+ "source": example["source"],
+ "intermediate_steps": intermediate_steps,
+ "start_time": start_time,
+ "end_time": end_time,
+ "token_counts": token_counts,
+ }
+ append_answer(annotated_example, answers_file)
+
+
+def answer_questions(
+ eval_ds,
+ model,
+ date,
+ action_type: str = "code",
+ output_dir: str = "output",
+ answers_dataset: str = None,
+ push_answers_to_hub: bool = False,
+ parallel_workers: int = 32,
+):
+ date = date or datetime.date.today().isoformat()
+ model_id = model.model_id
+
+ for task in eval_ds:
+ file_name = f"{output_dir}/{model_id.replace('/', '__')}__{action_type}__{task}__{date}.jsonl"
+ print(f"Starting processing and writing output to '{file_name}'")
+ answered_questions = []
+ if os.path.exists(file_name):
+ with open(file_name, "r") as f:
+ for line in f:
+ answered_questions.append(json.loads(line)["original_question"])
+
+ examples_todo = [example for example in eval_ds[task] if example["question"] not in answered_questions]
+ print(f"Launching {parallel_workers} parallel workers.")
+
+ with ThreadPoolExecutor(max_workers=parallel_workers) as exe:
+ futures = [
+ exe.submit(answer_single_question, example, model, file_name, action_type) for example in examples_todo
+ ]
+ for f in tqdm(as_completed(futures), total=len(examples_todo), desc="Processing tasks"):
+ f.result()
+
+ print("All tasks processed.")
+
+ if push_answers_to_hub and answers_dataset:
+ print("Pushing answers to hub...")
+ ds = datasets.Dataset.from_pandas(pd.read_json(file_name, lines=True), split="test", preserve_index=False)
+ config = f"{model_id.replace('/', '__')}__{action_type}__{task}"
+ data_dir = f"{model_id}/{action_type}/{task}/{date}"
+ ds.push_to_hub(
+ answers_dataset,
+ config_name=config,
+ data_dir=data_dir,
+ split="test",
+ commit_message=f"Upload {config}",
+ )
+
+
+if __name__ == "__main__":
+ args = parse_arguments()
+
+ eval_ds = load_eval_dataset(args.eval_dataset)
+
+ if args.model_type == "LiteLLMModel":
+ model = LiteLLMModel(
+ model_id=args.model_id,
+ max_completion_tokens=8192,
+ )
+ else:
+ model = InferenceClientModel(model_id=args.model_id, provider=args.provider, max_tokens=8192)
+
+ answer_questions(
+ eval_ds,
+ model,
+ args.date,
+ action_type=args.agent_action_type,
+ answers_dataset=args.answers_dataset,
+ push_answers_to_hub=args.push_answers_to_hub,
+ parallel_workers=args.parallel_workers,
+ )
diff --git a/examples/smolagents_benchmark/score.ipynb b/examples/smolagents_benchmark/score.ipynb
new file mode 100644
index 000000000..b624d802c
--- /dev/null
+++ b/examples/smolagents_benchmark/score.ipynb
@@ -0,0 +1,392 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -e .. datasets sympy numpy matplotlib seaborn -q # Install dev version of smolagents + some packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Benchmark date\n",
+ "# - set a concrete date:\n",
+ "DATE = \"2024-12-26\"\n",
+ "# - or use default: today\n",
+ "# DATE = None\n",
+ "\n",
+ "# Evaluation dataset\n",
+ "# - the dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents-benchmark/benchmark-v1\n",
+ "EVAL_DATASET = \"smolagents-benchmark/benchmark-v1\"\n",
+ "\n",
+ "# Answers dataset: it must be a gated dataset; required to score the answers\n",
+ "ANSWERS_DATASET = \"smolagents-benchmark/answers\"\n",
+ "# Whether to push the answers dataset to the Hub\n",
+ "PUSH_ANSWERS_DATASET_TO_HUB = True\n",
+ "\n",
+ "# Results dataset\n",
+ "RESULTS_DATASET = \"smolagents-benchmark/results\"\n",
+ "# Whether to push the results dataset to the Hub\n",
+ "PUSH_RESULTS_DATASET_TO_HUB = True"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Constants and utilities/tools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "import string\n",
+ "import warnings\n",
+ "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
+ "from datetime import datetime\n",
+ "\n",
+ "import numpy as np\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "\n",
+ "def normalize_number_str(number_str: str) -> float:\n",
+ " # we replace these common units and commas to allow\n",
+ " # conversion to float\n",
+ " for char in [\"$\", \"%\", \",\"]:\n",
+ " number_str = number_str.replace(char, \"\")\n",
+ " try:\n",
+ " return float(number_str)\n",
+ " except ValueError:\n",
+ " return float(\"inf\")\n",
+ "\n",
+ "\n",
+ "def split_string(\n",
+ " s: str,\n",
+ " char_list: list[str] = [\",\", \";\"],\n",
+ ") -> list[str]:\n",
+ " pattern = f\"[{''.join(char_list)}]\"\n",
+ " return re.split(pattern, s)\n",
+ "\n",
+ "\n",
+ "def is_float(element: object) -> bool: # True when float(element) succeeds\n",
+ " try:\n",
+ " float(element)\n",
+ " return True\n",
+ " except (ValueError, TypeError): # float(None) and similar raise TypeError, not ValueError\n",
+ " return False\n",
+ "\n",
+ "\n",
+ "def normalize_str(input_str, remove_punct=True) -> str:\n",
+ " \"\"\"\n",
+ " Normalize a string by:\n",
+ " - Removing all white spaces\n",
+ " - Optionally removing punctuation (if remove_punct is True)\n",
+ " - Converting to lowercase\n",
+ " Parameters:\n",
+ " - input_str: str, the string to normalize\n",
+ " - remove_punct: bool, whether to remove punctuation (default: True)\n",
+ " Returns:\n",
+ " - str, the normalized string\n",
+ " \"\"\"\n",
+ " # Remove all white spaces. Required e.g for seagull vs. sea gull\n",
+ " no_spaces = re.sub(r\"\\s\", \"\", input_str)\n",
+ "\n",
+ " # Remove punctuation, if specified.\n",
+ " if remove_punct:\n",
+ " translator = str.maketrans(\"\", \"\", string.punctuation)\n",
+ " return no_spaces.lower().translate(translator)\n",
+ " else:\n",
+ " return no_spaces.lower()\n",
+ "\n",
+ "\n",
+ "def extract_numbers(text: str) -> list[str]:\n",
+ " \"\"\"This pattern matches:\n",
+ " - Optional negative sign\n",
+ " - Numbers with optional comma thousand separators\n",
+ " - Optional decimal points with decimal numbers\n",
+ " \"\"\"\n",
+ " pattern = r\"-?(?:\\d{1,3}(?:,\\d{3})+|\\d+)(?:\\.\\d+)?\"\n",
+ "\n",
+ " return [el.replace(\",\", \"\") for el in re.findall(pattern, text)]\n",
+ "\n",
+ "\n",
+ "def get_question_score_gaia(\n",
+ " model_answer: str,\n",
+ " ground_truth: str,\n",
+ ") -> bool:\n",
+ " \"\"\"Scoring function used to score answers to questions from the GAIA benchmark\"\"\"\n",
+ " if is_float(ground_truth):\n",
+ " normalized_answer = normalize_number_str(str(model_answer))\n",
+ " return normalized_answer == float(ground_truth)\n",
+ "\n",
+ " elif any(char in ground_truth for char in [\",\", \";\"]): # if gt is a list\n",
+ " # question with the fish: normalization removes punct\n",
+ " gt_elems = split_string(ground_truth)\n",
+ " ma_elems = split_string(model_answer)\n",
+ "\n",
+ " if len(gt_elems) != len(ma_elems): # check length is the same\n",
+ " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n",
+ " return False\n",
+ "\n",
+ " comparisons = []\n",
+ " for ma_elem, gt_elem in zip(ma_elems, gt_elems): # compare each element as float or str\n",
+ " if is_float(gt_elem):\n",
+ " normalized_ma_elem = normalize_number_str(ma_elem)\n",
+ " comparisons.append(normalized_ma_elem == float(gt_elem))\n",
+ " else:\n",
+ " # we do not remove punct since comparisons can include punct\n",
+ " comparisons.append(\n",
+ " normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)\n",
+ " )\n",
+ " return all(comparisons)\n",
+ "\n",
+ " else: # if gt is a str\n",
+ " return normalize_str(model_answer) == normalize_str(ground_truth)\n",
+ "\n",
+ "\n",
+ "def get_correct(row):\n",
+ " if row[\"source\"] == \"MATH\": # Checks the last number in answer\n",
+ " numbers_answer = extract_numbers(str(row[\"answer\"]))\n",
+ " if len(numbers_answer) == 0:\n",
+ " return False\n",
+ " return np.isclose(float(numbers_answer[-1]), float(row[\"true_answer\"]), rtol=1e-5, atol=1e-7)\n",
+ " else:\n",
+ " return get_question_score_gaia(str(row[\"answer\"]), str(row[\"true_answer\"]))\n",
+ "\n",
+ "\n",
+ "def score_answers_subset(answers_dataset, answers_subset):\n",
+ " try:\n",
+ " print(answers_dataset, answers_subset)\n",
+ " *model_id, action_type, task = answers_subset.split(\"__\")\n",
+ " model_id = \"/\".join(model_id)\n",
+ " ds = datasets.load_dataset(answers_dataset, answers_subset, split=\"test\")\n",
+ " df = ds.to_pandas()\n",
+ " df[\"correct\"] = df.apply(get_correct, axis=1)\n",
+ " assert df[\"correct\"].notnull().sum() > 30, \"Missing answers\"\n",
+ " acc = df[\"correct\"].mean().item()\n",
+ " result = df.loc[0, [\"model_id\", \"agent_action_type\", \"source\"]].to_dict()\n",
+ " result[\"acc\"] = acc\n",
+ " return result\n",
+ " except Exception as e:\n",
+ " print(f\"Error with {answers_subset}: {e}\")\n",
+ " return None\n",
+ "\n",
+ "\n",
+ "def score_answers(\n",
+ " answers_subsets,\n",
+ " answers_dataset=ANSWERS_DATASET,\n",
+ " date=DATE,\n",
+ " push_to_hub_dataset=RESULTS_DATASET if PUSH_RESULTS_DATASET_TO_HUB else None,\n",
+ " set_default=True,\n",
+ "):\n",
+ " if not answers_dataset:\n",
+ " raise ValueError(\"Pass 'answers_dataset' to load the answers from it\")\n",
+ " date = date or datetime.now().date().isoformat() # `datetime` is the class imported above, not the module\n",
+ " results = []\n",
+ " with ThreadPoolExecutor(max_workers=16) as exe:\n",
+ " futures = [\n",
+ " exe.submit(score_answers_subset, answers_dataset, answers_subset) for answers_subset in answers_subsets\n",
+ " ]\n",
+ " for f in tqdm(as_completed(futures), total=len(answers_subsets), desc=\"Processing tasks\"):\n",
+ " result = f.result()\n",
+ " if result:\n",
+ " results.append(result)\n",
+ " df = pd.DataFrame(results)\n",
+ "\n",
+ " if push_to_hub_dataset:\n",
+ " ds = datasets.Dataset.from_pandas(df)\n",
+ " config = date\n",
+ " # The benchmark date is used as the config name of the pushed results\n",
+ " ds.push_to_hub(\n",
+ " push_to_hub_dataset, config_name=config, set_default=set_default, commit_message=f\"Upload {config} results\"\n",
+ " )\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Score answers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import datasets\n",
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "# Choose the answers subsets to score:\n",
+ "# answers_subsets = [\"meta-llama__Llama-3.1-8B-Instruct__code__gaia\"]\n",
+ "# or get all the answers subsets present in the ANSWERS_DATASET\n",
+ "answers_subsets = datasets.get_dataset_config_names(ANSWERS_DATASET)\n",
+ "print(\"Number of answers_subsets\", len(answers_subsets))\n",
+ "print(\"Example of answers_subset\", answers_subsets[0])\n",
+ "\n",
+ "result_df = score_answers(answers_subsets)\n",
+ "result_df[\"acc\"] = (result_df[\"acc\"] * 100).round(2)\n",
+ "result_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pivot_df = result_df.pivot_table(\n",
+ " index=[\"model_id\", \"source\"],\n",
+ " columns=[\"agent_action_type\"],\n",
+ " values=\"acc\",\n",
+ " fill_value=float(\"nan\"),\n",
+ ").reset_index()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Display results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display(pivot_df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "from matplotlib.legend_handler import HandlerTuple # Added import\n",
+ "\n",
+ "\n",
+ "# Assuming pivot_df is your original dataframe\n",
+ "models = pivot_df[\"model_id\"].unique()\n",
+ "sources = pivot_df[\"source\"].unique()\n",
+ "\n",
+ "# Create figure and axis\n",
+ "plt.style.use(\"seaborn-v0_8-white\")\n",
+ "fig, ax = plt.subplots(figsize=(15, 6))\n",
+ "\n",
+ "# Set the width of each bar group and positions of the bars\n",
+ "width = 0.15 # width of each bar\n",
+ "spacing = 0.02 # space between bars within a group\n",
+ "group_spacing = 0.2 # space between model groups\n",
+ "\n",
+ "# Calculate positions for the bars\n",
+ "num_sources = len(sources)\n",
+ "total_width_per_group = (width + spacing) * num_sources * 2 # *2 for agent and vanilla\n",
+ "x = np.arange(len(models)) * (total_width_per_group + group_spacing)\n",
+ "\n",
+ "# Plot bars for each source\n",
+ "for i, source in enumerate(sources):\n",
+ " source_data = pivot_df[pivot_df[\"source\"] == source]\n",
+ " agent_scores = [\n",
+ " source_data[source_data[\"model_id\"] == model][\"code\"].values[0]\n",
+ " if len(source_data[source_data[\"model_id\"] == model]) > 0\n",
+ " else np.nan\n",
+ " for model in models\n",
+ " ]\n",
+ " vanilla_scores = [\n",
+ " source_data[source_data[\"model_id\"] == model][\"vanilla\"].values[0]\n",
+ " if len(source_data[source_data[\"model_id\"] == model]) > 0\n",
+ " else np.nan\n",
+ " for model in models\n",
+ " ]\n",
+ "\n",
+ " # Position calculation for each pair of bars\n",
+ " pos = x + i * (width * 2 + spacing)\n",
+ "\n",
+ " agent_bars = ax.bar(pos, agent_scores, width, label=f\"{source} (Agent)\", alpha=0.8)\n",
+ " vanilla_bars = ax.bar(\n",
+ " pos + width * 0.6,\n",
+ " vanilla_scores,\n",
+ " width,\n",
+ " hatch=\"////\",\n",
+ " alpha=0.5,\n",
+ " hatch_linewidth=2,\n",
+ " label=f\"{source} (Vanilla)\",\n",
+ " color=\"white\",\n",
+ " edgecolor=agent_bars[0].get_facecolor(),\n",
+ " )\n",
+ "\n",
+ "# Customize the plot\n",
+ "ax.set_ylabel(\"Score\")\n",
+ "ax.set_title(\"Model Performance Comparison\")\n",
+ "\n",
+ "# Set x-axis ticks in the middle of each group\n",
+ "group_centers = x + (total_width_per_group - spacing) / 2\n",
+ "ax.set_xticks(group_centers)\n",
+ "\n",
+ "# Wrap long model names to prevent overlap\n",
+ "wrapped_labels = [\"\\n\".join(model.split(\"/\")) for model in models]\n",
+ "ax.set_xticklabels(wrapped_labels, rotation=0, ha=\"center\")\n",
+ "\n",
+ "# Modify legend to combine agent and vanilla entries\n",
+ "handles, labels = ax.get_legend_handles_labels()\n",
+ "unique_sources = sources\n",
+ "legend_elements = [\n",
+ " (handles[i * 2], handles[i * 2 + 1], labels[i * 2].replace(\" (Agent)\", \"\")) for i in range(len(unique_sources))\n",
+ "]\n",
+ "custom_legend = ax.legend(\n",
+ " [(agent_handle, vanilla_handle) for agent_handle, vanilla_handle, _ in legend_elements],\n",
+ " [label for _, _, label in legend_elements],\n",
+ " handler_map={tuple: HandlerTuple(ndivide=None)},\n",
+ " bbox_to_anchor=(1.05, 1),\n",
+ " loc=\"upper left\",\n",
+ ")\n",
+ "\n",
+ "ax.yaxis.grid(True, linestyle=\"--\", alpha=0.3)\n",
+ "ax.set_ylim(bottom=0)\n",
+ "plt.tight_layout()\n",
+ "ax.spines[\"top\"].set_visible(False)\n",
+ "ax.spines[\"right\"].set_visible(False)\n",
+ "\n",
+ "plt.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "test",
+ "language": "python",
+ "name": "test"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/text_to_sql.py b/examples/text_to_sql.py
index c25f0caa0..1b5bd3d6d 100644
--- a/examples/text_to_sql.py
+++ b/examples/text_to_sql.py
@@ -69,11 +69,11 @@ def sql_engine(query: str) -> str:
return output
-from smolagents import CodeAgent, HfApiModel
+from smolagents import CodeAgent, InferenceClientModel
agent = CodeAgent(
tools=[sql_engine],
- model=HfApiModel("meta-llama/Meta-Llama-3.1-8B-Instruct"),
+ model=InferenceClientModel(model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"),
)
agent.run("Can you give me the name of the client who got the most expensive receipt?")
diff --git a/pyproject.toml b/pyproject.toml
index ab323f8a1..0db6ab2b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,10 +4,10 @@ build-backend = "setuptools.build_meta"
[project]
name = "smolagents"
-version = "1.10.0.dev0"
+version = "1.15.0.dev0"
description = "๐ค smolagents: a barebones library for agents. Agents write python code to call tools or orchestrate other agents."
authors = [
- { name="Aymeric Roucher", email="aymeric@hf.co" }, { name="Thomas Wolf"},
+ { name="Aymeric Roucher", email="aymeric@hf.co" },
]
readme = "README.md"
requires-python = ">=3.10"
@@ -15,7 +15,6 @@ dependencies = [
"huggingface-hub>=0.28.0",
"requests>=2.32.3",
"rich>=13.9.4",
- "pandas>=2.2.3",
"jinja2>=3.1.4",
"pillow>=11.0.0",
"markdownify>=0.14.1",
@@ -24,14 +23,22 @@ dependencies = [
]
[project.optional-dependencies]
+bedrock = [
+ "boto3>=1.36.18"
+]
torch = [
"torch",
"torchvision",
+ "numpy>=1.21.2",
]
audio = [
"soundfile",
"smolagents[torch]",
]
+docker = [
+ "docker>=7.1.0",
+ "websocket-client",
+]
e2b = [
"e2b-code-interpreter>=1.0.3",
"python-dotenv>=1.0.1",
@@ -43,7 +50,7 @@ litellm = [
"litellm>=1.60.2",
]
mcp = [
- "mcpadapt>=0.0.6",
+ "mcpadapt>=0.0.19", # Security fix
"mcp",
]
mlx-lm = [
@@ -60,21 +67,32 @@ telemetry = [
]
transformers = [
"accelerate",
- "transformers>=4.0.0,<4.49.0",
+ "transformers>=4.0.0",
"smolagents[torch]",
]
+vision = [
+ "helium",
+ "selenium",
+]
+vllm = [
+ "vllm",
+ "torch"
+]
all = [
- "smolagents[audio,e2b,gradio,litellm,mcp,openai,telemetry,transformers]",
+ "smolagents[audio,docker,e2b,gradio,litellm,mcp,mlx-lm,openai,telemetry,transformers,vision,bedrock]",
]
quality = [
"ruff>=0.9.0",
]
test = [
"ipython>=8.31.0", # for interactive environment tests
+ "pandas>=2.2.3",
"pytest>=8.1.0",
+ "pytest-datadir",
"python-dotenv>=1.0.1", # For test_all_docs
"smolagents[all]",
"rank-bm25", # For test_all_docs
+ "Wikipedia-API>=0.8.1",
]
dev = [
"smolagents[quality,test]",
@@ -107,4 +125,4 @@ lines-after-imports = 2
[project.scripts]
smolagent = "smolagents.cli:main"
-webagent = "smolagents.vision_web_browser:main"
\ No newline at end of file
+webagent = "smolagents.vision_web_browser:main"
diff --git a/src/smolagents/__init__.py b/src/smolagents/__init__.py
index a1321eb1b..be4c3c19e 100644
--- a/src/smolagents/__init__.py
+++ b/src/smolagents/__init__.py
@@ -14,17 +14,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = "1.10.0.dev0"
+__version__ = "1.15.0.dev0"
from .agent_types import * # noqa: I001
from .agents import * # Above noqa avoids a circular dependency due to cli.py
from .default_tools import *
-from .e2b_executor import *
from .gradio_ui import *
from .local_python_executor import *
+from .mcp_client import *
from .memory import *
from .models import *
from .monitoring import *
+from .remote_executors import *
from .tools import *
from .utils import *
from .cli import *
diff --git a/src/smolagents/_function_type_hints_utils.py b/src/smolagents/_function_type_hints_utils.py
index dddd90d0c..e5a367c08 100644
--- a/src/smolagents/_function_type_hints_utils.py
+++ b/src/smolagents/_function_type_hints_utils.py
@@ -26,26 +26,19 @@
import json
import re
import types
+from collections.abc import Callable
from copy import copy
from typing import (
Any,
- Callable,
- Dict,
- List,
- Optional,
- Tuple,
+ Literal,
Union,
get_args,
get_origin,
get_type_hints,
)
-from huggingface_hub.utils import is_torch_available
-from .utils import _is_pillow_available
-
-
-def get_imports(code: str) -> List[str]:
+def get_imports(code: str) -> list[str]:
"""
Extracts all the libraries (not relative imports) that are imported in a code.
@@ -83,7 +76,7 @@ class DocstringParsingException(Exception):
"""Exception raised for errors in parsing docstrings to generate JSON schemas"""
-def get_json_schema(func: Callable) -> Dict:
+def get_json_schema(func: Callable) -> dict:
"""
This function generates a JSON schema for a given function, based on its docstring and type hints. This is
mostly used for passing lists of tools to a chat template. The JSON schema contains the name and description of
@@ -221,26 +214,30 @@ def get_json_schema(func: Callable) -> Dict:
# Extracts the initial segment of the docstring, containing the function description
-description_re = re.compile(r"^(.*?)[\n\s]*(Args:|Returns:|Raises:|\Z)", re.DOTALL)
+description_re = re.compile(r"^(.*?)(?=\n\s*(Args:|Returns:|Raises:)|\Z)", re.DOTALL)
# Extracts the Args: block from the docstring
args_re = re.compile(r"\n\s*Args:\n\s*(.*?)[\n\s]*(Returns:|Raises:|\Z)", re.DOTALL)
# Splits the Args: block into individual arguments
args_split_re = re.compile(
- r"""
-(?:^|\n) # Match the start of the args block, or a newline
-\s*(\w+)\s*(?:\([^)]*\))?:\s* # Capture the argument name (ignore the type) and strip spacing
-(.*?)\s* # Capture the argument description, which can span multiple lines, and strip trailing spacing
-(?=\n\s*\w+:|\Z) # Stop when you hit the next argument or the end of the block
-""",
+ r"(?:^|\n)" # Match the start of the args block, or a newline
+ r"\s*(\w+)\s*(?:\([^)]*?\))?:\s*" # Capture the argument name (ignore the type) and strip spacing
+ r"(.*?)\s*" # Capture the argument description, which can span multiple lines, and strip trailing spacing
+ r"(?=\n\s*\w+\s*(?:\([^)]*?\))?:|\Z)", # Stop when you hit the next argument (with or without type) or the end of the block
re.DOTALL | re.VERBOSE,
)
# Extracts the Returns: block from the docstring, if present. Note that most chat templates ignore the return type/doc!
-returns_re = re.compile(r"\n\s*Returns:\n\s*(.*?)[\n\s]*(Raises:|\Z)", re.DOTALL)
+returns_re = re.compile(
+ r"\n\s*Returns:\n\s*"
+ r"(?:[^)]*?:\s*)?" # Ignore the return type if present
+ r"(.*?)" # Capture the return description
+ r"[\n\s]*(Raises:|\Z)",
+ re.DOTALL,
+)
def _parse_google_format_docstring(
docstring: str,
-) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
+) -> tuple[str | None, dict | None, str | None]:
"""
Parses a Google-style docstring to extract the function description,
argument descriptions, and return description.
@@ -273,7 +270,7 @@ def _parse_google_format_docstring(
return description, args_dict, returns
-def _convert_type_hints_to_json_schema(func: Callable, error_on_missing_type_hints: bool = True) -> Dict:
+def _convert_type_hints_to_json_schema(func: Callable, error_on_missing_type_hints: bool = True) -> dict:
type_hints = get_type_hints(func)
signature = inspect.signature(func)
@@ -300,7 +297,7 @@ def _convert_type_hints_to_json_schema(func: Callable, error_on_missing_type_hin
return schema
-def _parse_type_hint(hint: str) -> Dict:
+def _parse_type_hint(hint: str) -> dict:
origin = get_origin(hint)
args = get_args(hint)
@@ -314,20 +311,7 @@ def _parse_type_hint(hint: str) -> Dict:
)
elif origin is Union or (hasattr(types, "UnionType") and origin is types.UnionType):
- # Recurse into each of the subtypes in the Union, except None, which is handled separately at the end
- subtypes = [_parse_type_hint(t) for t in args if t is not type(None)]
- if len(subtypes) == 1:
- # A single non-null type can be expressed directly
- return_dict = subtypes[0]
- elif all(isinstance(subtype["type"], str) for subtype in subtypes):
- # A union of basic types can be expressed as a list in the schema
- return_dict = {"type": sorted([subtype["type"] for subtype in subtypes])}
- else:
- # A union of more complex types requires "anyOf"
- return_dict = {"anyOf": subtypes}
- if type(None) in args:
- return_dict["nullable"] = True
- return return_dict
+ return _parse_union_type(args)
elif origin is list:
if not args:
@@ -363,9 +347,33 @@ def _parse_type_hint(hint: str) -> Dict:
out["additionalProperties"] = _parse_type_hint(args[1])
return out
+ elif origin is Literal:
+ literal_types = set(type(arg) for arg in args)
+ final_type = _parse_union_type(literal_types)
+
+ # None literal value is represented by 'nullable' field set by _parse_union_type
+ final_type.update({"enum": [arg for arg in args if arg is not None]})
+ return final_type
+
raise TypeHintParsingException("Couldn't parse this type hint, likely due to a custom class or object: ", hint)
+def _parse_union_type(args: tuple[Any, ...]) -> dict:
+ subtypes = [_parse_type_hint(t) for t in args if t is not type(None)]
+ if len(subtypes) == 1:
+ # A single non-null type can be expressed directly
+ return_dict = subtypes[0]
+ elif all(isinstance(subtype["type"], str) for subtype in subtypes):
+ # A union of basic types can be expressed as a list in the schema
+ return_dict = {"type": sorted([subtype["type"] for subtype in subtypes])}
+ else:
+ # A union of more complex types requires "anyOf"
+ return_dict = {"anyOf": subtypes}
+ if type(None) in args:
+ return_dict["nullable"] = True
+ return return_dict
+
+
_BASE_TYPE_MAPPING = {
int: {"type": "integer"},
float: {"type": "number"},
@@ -376,17 +384,20 @@ def _parse_type_hint(hint: str) -> Dict:
}
-def _get_json_schema_type(param_type: str) -> Dict[str, str]:
+def _get_json_schema_type(param_type: str) -> dict[str, str]:
if param_type in _BASE_TYPE_MAPPING:
return copy(_BASE_TYPE_MAPPING[param_type])
- if str(param_type) == "Image" and _is_pillow_available():
+ if str(param_type) == "Image":
from PIL.Image import Image
if param_type == Image:
return {"type": "image"}
- if str(param_type) == "Tensor" and is_torch_available():
- from torch import Tensor
+ if str(param_type) == "Tensor":
+ try:
+ from torch import Tensor
- if param_type == Tensor:
- return {"type": "audio"}
+ if param_type == Tensor:
+ return {"type": "audio"}
+ except ModuleNotFoundError:
+ pass
return {"type": "object"}
diff --git a/src/smolagents/agent_types.py b/src/smolagents/agent_types.py
index b0d4ee1d1..73772292e 100644
--- a/src/smolagents/agent_types.py
+++ b/src/smolagents/agent_types.py
@@ -19,11 +19,8 @@
import uuid
from io import BytesIO
-import numpy as np
+import PIL.Image
import requests
-from huggingface_hub.utils import is_torch_available
-from PIL import Image
-from PIL.Image import Image as ImageType
from .utils import _is_package_available
@@ -37,7 +34,7 @@ class AgentType:
These objects serve three purposes:
- - They behave as they were the type they're meant to be, e.g., a string for text, a PIL.Image for images
+ - They behave as if they were the type they're meant to be, e.g., a string for text, a PIL.Image.Image for images
- They can be stringified: str(object) in order to return a string defining the object
- They should be displayed correctly in ipython notebooks/colab/jupyter
"""
@@ -73,14 +70,14 @@ def to_string(self):
return str(self._value)
-class AgentImage(AgentType, ImageType):
+class AgentImage(AgentType, PIL.Image.Image):
"""
- Image type returned by the agent. Behaves as a PIL.Image.
+ Image type returned by the agent. Behaves as a PIL.Image.Image.
"""
def __init__(self, value):
AgentType.__init__(self, value)
- ImageType.__init__(self)
+ PIL.Image.Image.__init__(self)
self._path = None
self._raw = None
@@ -88,19 +85,24 @@ def __init__(self, value):
if isinstance(value, AgentImage):
self._raw, self._path, self._tensor = value._raw, value._path, value._tensor
- elif isinstance(value, ImageType):
+ elif isinstance(value, PIL.Image.Image):
self._raw = value
elif isinstance(value, bytes):
- self._raw = Image.open(BytesIO(value))
+ self._raw = PIL.Image.open(BytesIO(value))
elif isinstance(value, (str, pathlib.Path)):
self._path = value
- elif is_torch_available():
- import torch
+ else:
+ try:
+ import torch
+
+ if isinstance(value, torch.Tensor):
+ self._tensor = value
+ import numpy as np
- if isinstance(value, torch.Tensor):
- self._tensor = value
- if isinstance(value, np.ndarray):
- self._tensor = torch.from_numpy(value)
+ if isinstance(value, np.ndarray):
+ self._tensor = torch.from_numpy(value)
+ except ModuleNotFoundError:
+ pass
if self._path is None and self._raw is None and self._tensor is None:
raise TypeError(f"Unsupported type for {self.__class__.__name__}: {type(value)}")
@@ -115,18 +117,20 @@ def _ipython_display_(self, include=None, exclude=None):
def to_raw(self):
"""
- Returns the "raw" version of that object. In the case of an AgentImage, it is a PIL.Image.
+ Returns the "raw" version of that object. In the case of an AgentImage, it is a PIL.Image.Image.
"""
if self._raw is not None:
return self._raw
if self._path is not None:
- self._raw = Image.open(self._path)
+ self._raw = PIL.Image.open(self._path)
return self._raw
if self._tensor is not None:
+ import numpy as np
+
array = self._tensor.cpu().detach().numpy()
- return Image.fromarray((255 - array * 255).astype(np.uint8))
+ return PIL.Image.fromarray((255 - array * 255).astype(np.uint8))
def to_string(self):
"""
@@ -143,10 +147,12 @@ def to_string(self):
return self._path
if self._tensor is not None:
+ import numpy as np
+
array = self._tensor.cpu().detach().numpy()
# There is likely simpler than load into image into save
- img = Image.fromarray((255 - array * 255).astype(np.uint8))
+ img = PIL.Image.fromarray((255 - array * 255).astype(np.uint8))
directory = tempfile.mkdtemp()
self._path = os.path.join(directory, str(uuid.uuid4()) + ".png")
@@ -172,10 +178,11 @@ class AgentAudio(AgentType, str):
"""
def __init__(self, value, samplerate=16_000):
- if not _is_package_available("soundfile") or not is_torch_available():
+ if not _is_package_available("soundfile") or not _is_package_available("torch"):
raise ModuleNotFoundError(
"Please install 'audio' extra to use AgentAudio: `pip install 'smolagents[audio]'`"
)
+ import numpy as np
import torch
super().__init__(value)
@@ -186,7 +193,7 @@ def __init__(self, value, samplerate=16_000):
self.samplerate = samplerate
if isinstance(value, (str, pathlib.Path)):
self._path = value
- elif is_torch_available() and isinstance(value, torch.Tensor):
+ elif isinstance(value, torch.Tensor):
self._tensor = value
elif isinstance(value, tuple):
self.samplerate = value[0]
@@ -261,13 +268,15 @@ def handle_agent_output_types(output, output_type=None):
# If the class does not have defined output, then we map according to the type
if isinstance(output, str):
return AgentText(output)
- if isinstance(output, ImageType):
+ if isinstance(output, PIL.Image.Image):
return AgentImage(output)
- if is_torch_available():
+ try:
import torch
if isinstance(output, torch.Tensor):
return AgentAudio(output)
+ except ModuleNotFoundError:
+ pass
return output
diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py
index a4d1b08f8..8ff8eb230 100644
--- a/src/smolagents/agents.py
+++ b/src/smolagents/agents.py
@@ -22,40 +22,49 @@
import tempfile
import textwrap
import time
+from abc import ABC, abstractmethod
from collections import deque
+from collections.abc import Callable, Generator
from logging import getLogger
from pathlib import Path
-from typing import Any, Callable, Dict, Generator, List, Optional, Set, Tuple, TypedDict, Union
+from typing import TYPE_CHECKING, Any, TypedDict
import jinja2
import yaml
from huggingface_hub import create_repo, metadata_update, snapshot_download, upload_folder
from jinja2 import StrictUndefined, Template
from rich.console import Group
+from rich.live import Live
+from rich.markdown import Markdown
from rich.panel import Panel
from rich.rule import Rule
from rich.text import Text
-from .agent_types import AgentAudio, AgentImage, AgentType, handle_agent_output_types
+
+if TYPE_CHECKING:
+ import PIL.Image
+
+from .agent_types import AgentAudio, AgentImage, handle_agent_output_types
from .default_tools import TOOL_MAPPING, FinalAnswerTool
-from .e2b_executor import E2BExecutor
-from .local_python_executor import (
- BASE_BUILTIN_MODULES,
- LocalPythonInterpreter,
- fix_final_answer_code,
-)
-from .memory import ActionStep, AgentMemory, PlanningStep, SystemPromptStep, TaskStep, ToolCall
-from .models import (
- ChatMessage,
- MessageRole,
- Model,
+from .local_python_executor import BASE_BUILTIN_MODULES, LocalPythonExecutor, PythonExecutor, fix_final_answer_code
+from .memory import (
+ ActionStep,
+ AgentMemory,
+ FinalAnswerStep,
+ Message,
+ PlanningStep,
+ SystemPromptStep,
+ TaskStep,
+ ToolCall,
)
+from .models import ChatMessage, MessageRole, Model, parse_json_if_needed
from .monitoring import (
YELLOW_HEX,
AgentLogger,
LogLevel,
Monitor,
)
+from .remote_executors import DockerExecutor, E2BExecutor
from .tools import Tool
from .utils import (
AgentError,
@@ -63,9 +72,11 @@
AgentGenerationError,
AgentMaxStepsError,
AgentParsingError,
+ AgentToolCallError,
+ AgentToolExecutionError,
+ is_valid_name,
make_init_file,
parse_code_blobs,
- parse_json_tool_call,
truncate_content,
)
@@ -73,12 +84,12 @@
logger = getLogger(__name__)
-def get_variable_names(self, template: str) -> Set[str]:
+def get_variable_names(self, template: str) -> set[str]:
pattern = re.compile(r"\{\{([^{}]+)\}\}")
return {match.group(1).strip() for match in pattern.finditer(template)}
-def populate_template(template: str, variables: Dict[str, Any]) -> str:
+def populate_template(template: str, variables: dict[str, Any]) -> str:
compiled_template = Template(template, undefined=StrictUndefined)
try:
return compiled_template.render(**variables)
@@ -91,18 +102,12 @@ class PlanningPromptTemplate(TypedDict):
Prompt templates for the planning step.
Args:
- initial_facts (`str`): Initial facts prompt.
- initial_plan (`str`): Initial plan prompt.
- update_facts_pre_messages (`str`): Update facts pre-messages prompt.
- update_facts_post_messages (`str`): Update facts post-messages prompt.
+ initial_plan (`str`): Initial plan prompt.
update_plan_pre_messages (`str`): Update plan pre-messages prompt.
update_plan_post_messages (`str`): Update plan post-messages prompt.
"""
- initial_facts: str
initial_plan: str
- update_facts_pre_messages: str
- update_facts_post_messages: str
update_plan_pre_messages: str
update_plan_post_messages: str
@@ -153,10 +158,7 @@ class PromptTemplates(TypedDict):
EMPTY_PROMPT_TEMPLATES = PromptTemplates(
system_prompt="",
planning=PlanningPromptTemplate(
- initial_facts="",
initial_plan="",
- update_facts_pre_messages="",
- update_facts_post_messages="",
update_plan_pre_messages="",
update_plan_post_messages="",
),
@@ -165,7 +167,7 @@ class PromptTemplates(TypedDict):
)
-class MultiStepAgent:
+class MultiStepAgent(ABC):
"""
Agent class that solves the given task step by step, using the ReAct framework:
While the objective is not reached, the agent will perform a cycle of action (given by the LLM) and observation (obtained from the environment).
@@ -174,7 +176,7 @@ class MultiStepAgent:
tools (`list[Tool]`): [`Tool`]s that the agent can use.
model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions.
prompt_templates ([`~agents.PromptTemplates`], *optional*): Prompt templates.
- max_steps (`int`, default `6`): Maximum number of steps the agent can take to solve the task.
+ max_steps (`int`, default `20`): Maximum number of steps the agent can take to solve the task.
tool_parser (`Callable`, *optional*): Function used to parse the tool calls from the LLM output.
add_base_tools (`bool`, default `False`): Whether to add the base tools to the agent's tools.
verbosity_level (`LogLevel`, default `LogLevel.INFO`): Level of verbosity of the agent's logs.
@@ -190,32 +192,43 @@ class MultiStepAgent:
def __init__(
self,
- tools: List[Tool],
- model: Callable[[List[Dict[str, str]]], ChatMessage],
- prompt_templates: Optional[PromptTemplates] = None,
- max_steps: int = 6,
- tool_parser: Optional[Callable] = None,
+ tools: list[Tool],
+ model: Model,
+ prompt_templates: PromptTemplates | None = None,
+ max_steps: int = 20,
add_base_tools: bool = False,
verbosity_level: LogLevel = LogLevel.INFO,
- grammar: Optional[Dict[str, str]] = None,
- managed_agents: Optional[List] = None,
- step_callbacks: Optional[List[Callable]] = None,
- planning_interval: Optional[int] = None,
- name: Optional[str] = None,
- description: Optional[str] = None,
+ grammar: dict[str, str] | None = None,
+ managed_agents: list | None = None,
+ step_callbacks: list[Callable] | None = None,
+ planning_interval: int | None = None,
+ name: str | None = None,
+ description: str | None = None,
provide_run_summary: bool = False,
- final_answer_checks: Optional[List[Callable]] = None,
+ final_answer_checks: list[Callable] | None = None,
+ logger: AgentLogger | None = None,
):
self.agent_name = self.__class__.__name__
self.model = model
self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES
+ if prompt_templates is not None:
+ missing_keys = set(EMPTY_PROMPT_TEMPLATES.keys()) - set(prompt_templates.keys())
+ assert not missing_keys, (
+ f"Some prompt templates are missing from your custom `prompt_templates`: {missing_keys}"
+ )
+ for key, value in EMPTY_PROMPT_TEMPLATES.items():
+ if isinstance(value, dict):
+ for subkey in value.keys():
+ assert key in prompt_templates.keys() and (subkey in prompt_templates[key].keys()), (
+ f"Some prompt templates are missing from your custom `prompt_templates`: {subkey} under {key}"
+ )
+
self.max_steps = max_steps
self.step_number = 0
- self.tool_parser = tool_parser or parse_json_tool_call
self.grammar = grammar
self.planning_interval = planning_interval
- self.state = {}
- self.name = name
+ self.state: dict[str, Any] = {}
+ self.name = self._validate_name(name)
self.description = description
self.provide_run_summary = provide_run_summary
self.final_answer_checks = final_answer_checks
@@ -225,15 +238,25 @@ def __init__(
self._validate_tools_and_managed_agents(tools, managed_agents)
self.system_prompt = self.initialize_system_prompt()
- self.input_messages = None
- self.task = None
+ self.task: str | None = None
self.memory = AgentMemory(self.system_prompt)
- self.logger = AgentLogger(level=verbosity_level)
+
+ if logger is None:
+ self.logger = AgentLogger(level=verbosity_level)
+ else:
+ self.logger = logger
+
self.monitor = Monitor(self.model, self.logger)
self.step_callbacks = step_callbacks if step_callbacks is not None else []
self.step_callbacks.append(self.monitor.update_metrics)
- def _setup_managed_agents(self, managed_agents):
+ def _validate_name(self, name: str | None) -> str | None:
+ if name is not None and not is_valid_name(name):
+ raise ValueError(f"Agent name '{name}' must be a valid Python identifier and not a reserved keyword.")
+ return name
+
+ def _setup_managed_agents(self, managed_agents: list | None = None) -> None:
+ """Setup managed agents with proper logging."""
self.managed_agents = {}
if managed_agents:
assert all(agent.name and agent.description for agent in managed_agents), (
@@ -252,16 +275,14 @@ def _setup_tools(self, tools, add_base_tools):
if name != "python_interpreter" or self.__class__.__name__ == "ToolCallingAgent"
}
)
- self.tools["final_answer"] = FinalAnswerTool()
+ self.tools.setdefault("final_answer", FinalAnswerTool())
def _validate_tools_and_managed_agents(self, tools, managed_agents):
tool_and_managed_agent_names = [tool.name for tool in tools]
if managed_agents is not None:
- for agent in managed_agents:
- tool_and_managed_agent_names.append(agent.name)
- for tool in agent.tools.values():
- if tool.name != "final_answer":
- tool_and_managed_agent_names.append(tool.name)
+ tool_and_managed_agent_names += [agent.name for agent in managed_agents]
+ if self.name:
+ tool_and_managed_agent_names.append(self.name)
if len(tool_and_managed_agent_names) != len(set(tool_and_managed_agent_names)):
raise ValueError(
"Each tool or managed_agent should have a unique name! You passed these duplicate names: "
@@ -273,18 +294,22 @@ def run(
task: str,
stream: bool = False,
reset: bool = True,
- images: Optional[List[str]] = None,
- additional_args: Optional[Dict] = None,
+ images: list["PIL.Image.Image"] | None = None,
+ additional_args: dict | None = None,
+ max_steps: int | None = None,
):
"""
Run the agent for the given task.
Args:
task (`str`): Task to perform.
- stream (`bool`): Whether to run in a streaming way.
+ stream (`bool`): Whether to run in streaming mode.
+ If `True`, returns a generator that yields each step as it is executed. You must iterate over this generator to process the individual steps (e.g., using a for loop or `next()`).
+ If `False`, executes all steps internally and returns only the final answer after completion.
reset (`bool`): Whether to reset the conversation or keep it going from previous run.
- images (`list[str]`, *optional*): Paths to image(s).
- additional_args (`dict`): Any other variables that you want to pass to the agent run, for instance images or dataframes. Give them clear names!
+ images (`list[PIL.Image.Image]`, *optional*): Image object(s).
+ additional_args (`dict`, *optional*): Any other variables that you want to pass to the agent run, for instance images or dataframes. Give them clear names!
+ max_steps (`int`, *optional*): Maximum number of steps the agent can take to solve the task. If not provided, the agent's default value is used.
Example:
```py
@@ -293,8 +318,9 @@ def run(
agent.run("What is the result of 2 power 3.7384?")
```
"""
-
+ max_steps = max_steps or self.max_steps
self.task = task
+ self.interrupt_switch = False
if additional_args is not None:
self.state.update(additional_args)
self.task += f"""
@@ -313,41 +339,58 @@ def run(
level=LogLevel.INFO,
title=self.name if hasattr(self, "name") else None,
)
-
self.memory.steps.append(TaskStep(task=self.task, task_images=images))
+ if getattr(self, "python_executor", None):
+ self.python_executor.send_variables(variables=self.state)
+ self.python_executor.send_tools({**self.tools, **self.managed_agents})
+
if stream:
# The steps are returned as they are executed through a generator to iterate on.
- return self._run(task=self.task, images=images)
- # Outputs are returned only at the end as a string. We only look at the last step
- return deque(self._run(task=self.task, images=images), maxlen=1)[0]
+ return self._run(task=self.task, max_steps=max_steps, images=images)
+ # Outputs are returned only at the end. We only look at the last step.
+ return deque(self._run(task=self.task, max_steps=max_steps, images=images), maxlen=1)[0].final_answer
- def _run(self, task: str, images: List[str] | None = None) -> Generator[ActionStep | AgentType, None, None]:
+ def _run(
+ self, task: str, max_steps: int, images: list["PIL.Image.Image"] | None = None
+ ) -> Generator[ActionStep | PlanningStep | FinalAnswerStep]:
final_answer = None
self.step_number = 1
- while final_answer is None and self.step_number <= self.max_steps:
+ while final_answer is None and self.step_number <= max_steps:
+ if self.interrupt_switch:
+ raise AgentError("Agent interrupted.", self.logger)
step_start_time = time.time()
- memory_step = self._create_memory_step(step_start_time, images)
+ if self.planning_interval is not None and (
+ self.step_number == 1 or (self.step_number - 1) % self.planning_interval == 0
+ ):
+ planning_step = self._generate_planning_step(
+ task, is_first_step=(self.step_number == 1), step=self.step_number
+ )
+ self.memory.steps.append(planning_step)
+ yield planning_step
+ action_step = ActionStep(
+ step_number=self.step_number, start_time=step_start_time, observations_images=images
+ )
try:
- final_answer = self._execute_step(task, memory_step)
+ final_answer = self._execute_step(task, action_step)
+ except AgentGenerationError as e:
+ # Agent generation errors are caused not by a Model error but by an implementation error, so we should raise them and exit.
+ raise e
except AgentError as e:
- memory_step.error = e
+ # Other AgentError types are caused by the Model, so we should log them and iterate.
+ action_step.error = e
finally:
- self._finalize_step(memory_step, step_start_time)
- yield memory_step
+ self._finalize_step(action_step, step_start_time)
+ self.memory.steps.append(action_step)
+ yield action_step
self.step_number += 1
- if final_answer is None and self.step_number == self.max_steps + 1:
+ if final_answer is None and self.step_number == max_steps + 1:
final_answer = self._handle_max_steps_reached(task, images, step_start_time)
- yield memory_step
- yield handle_agent_output_types(final_answer)
-
- def _create_memory_step(self, step_start_time: float, images: List[str] | None) -> ActionStep:
- return ActionStep(step_number=self.step_number, start_time=step_start_time, observations_images=images)
+ yield action_step
+ yield FinalAnswerStep(handle_agent_output_types(final_answer))
- def _execute_step(self, task: str, memory_step: ActionStep) -> Union[None, Any]:
- if self.planning_interval is not None and self.step_number % self.planning_interval == 1:
- self.planning_step(task, is_first_step=(self.step_number == 1), step=self.step_number)
+ def _execute_step(self, task: str, memory_step: ActionStep) -> None | Any:
self.logger.log_rule(f"Step {self.step_number}", level=LogLevel.INFO)
final_answer = self.step(memory_step)
if final_answer is not None and self.final_answer_checks:
@@ -364,14 +407,13 @@ def _validate_final_answer(self, final_answer: Any):
def _finalize_step(self, memory_step: ActionStep, step_start_time: float):
memory_step.end_time = time.time()
memory_step.duration = memory_step.end_time - step_start_time
- self.memory.steps.append(memory_step)
for callback in self.step_callbacks:
# For compatibility with old callbacks that don't take the agent as an argument
callback(memory_step) if len(inspect.signature(callback).parameters) == 1 else callback(
memory_step, agent=self
)
- def _handle_max_steps_reached(self, task: str, images: List[str], step_start_time: float) -> Any:
+ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"], step_start_time: float) -> Any:
final_answer = self.provide_final_answer(task, images)
final_memory_step = ActionStep(
step_number=self.step_number, error=AgentMaxStepsError("Reached max steps.", self.logger)
@@ -386,124 +428,70 @@ def _handle_max_steps_reached(self, task: str, images: List[str], step_start_tim
)
return final_answer
- def planning_step(self, task, is_first_step: bool, step: int) -> None:
- input_messages, facts_message, plan_message = (
- self._generate_initial_plan(task) if is_first_step else self._generate_updated_plan(task, step)
- )
- self._record_planning_step(input_messages, facts_message, plan_message, is_first_step)
-
- def _generate_initial_plan(self, task: str) -> Tuple[ChatMessage, ChatMessage]:
- input_messages = [
- {
+ def _generate_planning_step(self, task, is_first_step: bool, step: int) -> PlanningStep:
+ if is_first_step:
+ input_messages = [
+ {
+ "role": MessageRole.USER,
+ "content": [
+ {
+ "type": "text",
+ "text": populate_template(
+ self.prompt_templates["planning"]["initial_plan"],
+ variables={"task": task, "tools": self.tools, "managed_agents": self.managed_agents},
+ ),
+ }
+ ],
+ }
+ ]
+ plan_message = self.model(input_messages, stop_sequences=[""])
+ plan = textwrap.dedent(
+ f"""Here are the facts I know and the plan of action that I will follow to solve the task:\n```\n{plan_message.content}\n```"""
+ )
+ else:
+ # Summary mode removes the system prompt and previous planning messages output by the model.
+ # Removing previous planning messages avoids influencing the new plan too much.
+ memory_messages = self.write_memory_to_messages(summary_mode=True)
+ plan_update_pre = {
+ "role": MessageRole.SYSTEM,
+ "content": [
+ {
+ "type": "text",
+ "text": populate_template(
+ self.prompt_templates["planning"]["update_plan_pre_messages"], variables={"task": task}
+ ),
+ }
+ ],
+ }
+ plan_update_post = {
"role": MessageRole.USER,
"content": [
{
"type": "text",
"text": populate_template(
- self.prompt_templates["planning"]["initial_facts"], variables={"task": task}
+ self.prompt_templates["planning"]["update_plan_post_messages"],
+ variables={
+ "task": task,
+ "tools": self.tools,
+ "managed_agents": self.managed_agents,
+ "remaining_steps": (self.max_steps - step),
+ },
),
}
],
- },
- ]
- facts_message = self.model(input_messages)
-
- message_prompt_plan = {
- "role": MessageRole.USER,
- "content": [
- {
- "type": "text",
- "text": populate_template(
- self.prompt_templates["planning"]["initial_plan"],
- variables={
- "task": task,
- "tools": self.tools,
- "managed_agents": self.managed_agents,
- "answer_facts": facts_message.content,
- },
- ),
- }
- ],
- }
- plan_message = self.model([message_prompt_plan], stop_sequences=[""])
- return input_messages, facts_message, plan_message
-
- def _generate_updated_plan(self, task: str, step: int) -> Tuple[ChatMessage, ChatMessage]:
- # Do not take the system prompt message from the memory
- # summary_mode=False: Do not take previous plan steps to avoid influencing the new plan
- memory_messages = self.write_memory_to_messages()[1:]
- facts_update_pre = {
- "role": MessageRole.SYSTEM,
- "content": [{"type": "text", "text": self.prompt_templates["planning"]["update_facts_pre_messages"]}],
- }
- facts_update_post = {
- "role": MessageRole.USER,
- "content": [{"type": "text", "text": self.prompt_templates["planning"]["update_facts_post_messages"]}],
- }
- input_messages = [facts_update_pre] + memory_messages + [facts_update_post]
- facts_message = self.model(input_messages)
-
- update_plan_pre = {
- "role": MessageRole.SYSTEM,
- "content": [
- {
- "type": "text",
- "text": populate_template(
- self.prompt_templates["planning"]["update_plan_pre_messages"], variables={"task": task}
- ),
- }
- ],
- }
- update_plan_post = {
- "role": MessageRole.USER,
- "content": [
- {
- "type": "text",
- "text": populate_template(
- self.prompt_templates["planning"]["update_plan_post_messages"],
- variables={
- "task": task,
- "tools": self.tools,
- "managed_agents": self.managed_agents,
- "facts_update": facts_message.content,
- "remaining_steps": (self.max_steps - step),
- },
- ),
- }
- ],
- }
- plan_message = self.model(
- [update_plan_pre] + memory_messages + [update_plan_post], stop_sequences=[""]
- )
- return input_messages, facts_message, plan_message
-
- def _record_planning_step(
- self, input_messages: list, facts_message: ChatMessage, plan_message: ChatMessage, is_first_step: bool
- ) -> None:
- if is_first_step:
- facts = textwrap.dedent(f"""Here are the facts that I know so far:\n```\n{facts_message.content}\n```""")
- plan = textwrap.dedent(
- f"""Here is the plan of action that I will follow to solve the task:\n```\n{plan_message.content}\n```"""
- )
- log_message = "Initial plan"
- else:
- facts = textwrap.dedent(
- f"""Here is the updated list of the facts that I know:\n```\n{facts_message.content}\n```"""
- )
+ }
+ input_messages = [plan_update_pre] + memory_messages + [plan_update_post]
+ plan_message = self.model(input_messages, stop_sequences=[""])
plan = textwrap.dedent(
- f"""I still need to solve the task I was given:\n```\n{self.task}\n```\n\nHere is my new/updated plan of action to solve the task:\n```\n{plan_message.content}\n```"""
- )
- log_message = "Updated plan"
- self.memory.steps.append(
- PlanningStep(
- model_input_messages=input_messages,
- facts=facts,
- plan=plan,
- model_output_message_plan=plan_message,
- model_output_message_facts=facts_message,
+ f"""I still need to solve the task I was given:\n```\n{self.task}\n```\n\nHere are the facts I know and my new/updated plan of action to solve the task:\n```\n{plan_message.content}\n```"""
)
+ log_headline = "Initial plan" if is_first_step else "Updated plan"
+ self.logger.log(Rule(f"[bold]{log_headline}", style="orange"), Text(plan), level=LogLevel.INFO)
+ return PlanningStep(
+ model_input_messages=input_messages,
+ plan=plan,
+ model_output_message=plan_message,
)
- self.logger.log(Rule(f"[bold]{log_message}", style="orange"), Text(plan), level=LogLevel.INFO)
@property
def logs(self):
@@ -512,14 +500,19 @@ def logs(self):
)
return [self.memory.system_prompt] + self.memory.steps
- def initialize_system_prompt(self):
+ @abstractmethod
+ def initialize_system_prompt(self) -> str:
"""To be implemented in child classes"""
- pass
+ ...
+
+ def interrupt(self):
+ """Interrupts the agent execution."""
+ self.interrupt_switch = True
def write_memory_to_messages(
self,
- summary_mode: Optional[bool] = False,
- ) -> List[Dict[str, str]]:
+ summary_mode: bool | None = False,
+ ) -> list[Message]:
"""
Reads past llm_outputs, actions, and observations or errors from the memory into a series of messages
that can be used as input to the LLM. Adds a number of keywords (such as PLAN, error, etc) to help
@@ -534,7 +527,7 @@ def visualize(self):
"""Creates a rich tree visualization of the agent's structure."""
self.logger.visualize_agent_tree(self)
- def extract_action(self, model_output: str, split_token: str) -> Tuple[str, str]:
+ def extract_action(self, model_output: str, split_token: str) -> tuple[str, str]:
"""
Parse action from the LLM output
@@ -555,13 +548,13 @@ def extract_action(self, model_output: str, split_token: str) -> Tuple[str, str]
)
return rationale.strip(), action.strip()
- def provide_final_answer(self, task: str, images: Optional[list[str]]) -> str:
+ def provide_final_answer(self, task: str, images: list["PIL.Image.Image"] | None = None) -> str:
"""
Provide the final answer to the task, based on the logs of the agent's interactions.
Args:
task (`str`): Task to perform.
- images (`list[str]`, *optional*): Paths to image(s).
+ images (`list[PIL.Image.Image]`, *optional*): Image object(s).
Returns:
`str`: Final answer to the task.
@@ -599,54 +592,8 @@ def provide_final_answer(self, task: str, images: Optional[list[str]]) -> str:
except Exception as e:
return f"Error in generating final LLM output:\n{e}"
- def execute_tool_call(self, tool_name: str, arguments: Union[Dict[str, str], str]) -> Any:
- """
- Execute tool with the provided input and returns the result.
- This method replaces arguments with the actual values from the state if they refer to state variables.
-
- Args:
- tool_name (`str`): Name of the Tool to execute (should be one from self.tools).
- arguments (Dict[str, str]): Arguments passed to the Tool.
- """
- available_tools = {**self.tools, **self.managed_agents}
- if tool_name not in available_tools:
- error_msg = f"Unknown tool {tool_name}, should be instead one of {list(available_tools.keys())}."
- raise AgentExecutionError(error_msg, self.logger)
-
- try:
- if isinstance(arguments, str):
- if tool_name in self.managed_agents:
- observation = available_tools[tool_name].__call__(arguments)
- else:
- observation = available_tools[tool_name].__call__(arguments, sanitize_inputs_outputs=True)
- elif isinstance(arguments, dict):
- for key, value in arguments.items():
- if isinstance(value, str) and value in self.state:
- arguments[key] = self.state[value]
- if tool_name in self.managed_agents:
- observation = available_tools[tool_name].__call__(**arguments)
- else:
- observation = available_tools[tool_name].__call__(**arguments, sanitize_inputs_outputs=True)
- else:
- error_msg = f"Arguments passed to tool should be a dict or string: got a {type(arguments)}."
- raise AgentExecutionError(error_msg, self.logger)
- return observation
- except Exception as e:
- if tool_name in self.tools:
- tool = self.tools[tool_name]
- error_msg = (
- f"Error when executing tool {tool_name} with arguments {arguments}: {type(e).__name__}: {e}\nYou should only use this tool with a correct input.\n"
- f"As a reminder, this tool's description is the following: '{tool.description}'.\nIt takes inputs: {tool.inputs} and returns output type {tool.output_type}"
- )
- raise AgentExecutionError(error_msg, self.logger)
- elif tool_name in self.managed_agents:
- error_msg = (
- f"Error in calling team member: {e}\nYou should only ask this team member with a correct request.\n"
- f"As a reminder, this team member's description is the following:\n{available_tools[tool_name]}"
- )
- raise AgentExecutionError(error_msg, self.logger)
-
- def step(self, memory_step: ActionStep) -> Union[None, Any]:
+ @abstractmethod
+ def step(self, memory_step: ActionStep) -> None | Any:
"""To be implemented in children classes. Should return either None if the step is not final."""
pass
@@ -661,7 +608,6 @@ def replay(self, detailed: bool = False):
def __call__(self, task: str, **kwargs):
"""Adds additional prompting for the managed agent, runs it, and wraps the output.
-
This method is called only by a managed agent.
"""
full_task = populate_template(
@@ -680,7 +626,7 @@ def __call__(self, task: str, **kwargs):
answer += "\n"
return answer
- def save(self, output_dir: str, relative_path: Optional[str] = None):
+ def save(self, output_dir: str | Path, relative_path: str | None = None):
"""
Saves the relevant code files for your agent. This will copy the code of your agent in `output_dir` as well as autogenerate:
@@ -693,7 +639,7 @@ def save(self, output_dir: str, relative_path: Optional[str] = None):
code)
Args:
- output_dir (`str`): The folder in which you want to save your tool.
+ output_dir (`str` or `Path`): The folder in which you want to save your agent.
"""
make_init_file(output_dir)
@@ -730,6 +676,7 @@ def save(self, output_dir: str, relative_path: Optional[str] = None):
# Save agent dictionary to json
agent_dict = self.to_dict()
agent_dict["tools"] = [tool.name for tool in self.tools.values()]
+ agent_dict["managed_agents"] = {agent.name: agent.__class__.__name__ for agent in self.managed_agents.values()}
with open(os.path.join(output_dir, "agent.json"), "w", encoding="utf-8") as f:
json.dump(agent_dict, f, indent=4)
@@ -798,8 +745,12 @@ def save(self, output_dir: str, relative_path: Optional[str] = None):
with open(os.path.join(output_dir, "app.py"), "w", encoding="utf-8") as f:
f.write(app_text + "\n") # Append newline at the end
- def to_dict(self) -> Dict[str, Any]:
- """Converts agent into a dictionary."""
+ def to_dict(self) -> dict[str, Any]:
+ """Convert the agent to a dictionary representation.
+
+ Returns:
+ `dict`: Dictionary representation of the agent.
+ """
# TODO: handle serializing step_callbacks and final_answer_checks
for attr in ["final_answer_checks", "step_callbacks"]:
if getattr(self, attr, None):
@@ -817,14 +768,13 @@ def to_dict(self) -> Dict[str, Any]:
)
agent_dict = {
+ "class": self.__class__.__name__,
"tools": tool_dicts,
"model": {
"class": self.model.__class__.__name__,
"data": self.model.to_dict(),
},
- "managed_agents": {
- managed_agent.name: managed_agent.__class__.__name__ for managed_agent in self.managed_agents.values()
- },
+ "managed_agents": [managed_agent.to_dict() for managed_agent in self.managed_agents.values()],
"prompt_templates": self.prompt_templates,
"max_steps": self.max_steps,
"verbosity_level": int(self.logger.level),
@@ -832,21 +782,58 @@ def to_dict(self) -> Dict[str, Any]:
"planning_interval": self.planning_interval,
"name": self.name,
"description": self.description,
- "requirements": list(requirements),
+ "requirements": sorted(requirements),
}
- if hasattr(self, "authorized_imports"):
- agent_dict["authorized_imports"] = self.authorized_imports
- if hasattr(self, "use_e2b_executor"):
- agent_dict["use_e2b_executor"] = self.use_e2b_executor
- if hasattr(self, "max_print_outputs_length"):
- agent_dict["max_print_outputs_length"] = self.max_print_outputs_length
return agent_dict
+ @classmethod
+ def from_dict(cls, agent_dict: dict[str, Any], **kwargs) -> "MultiStepAgent":
+ """Create agent from a dictionary representation.
+
+ Args:
+ agent_dict (`dict[str, Any]`): Dictionary representation of the agent.
+ **kwargs: Additional keyword arguments that will override agent_dict values.
+
+ Returns:
+ `MultiStepAgent`: Instance of the agent class.
+ """
+ # Load model
+ model_info = agent_dict["model"]
+ model_class = getattr(importlib.import_module("smolagents.models"), model_info["class"])
+ model = model_class.from_dict(model_info["data"])
+ # Load tools
+ tools = []
+ for tool_info in agent_dict["tools"]:
+ tools.append(Tool.from_code(tool_info["code"]))
+ # Load managed agents
+ managed_agents = []
+ for managed_agent_name, managed_agent_class_name in agent_dict["managed_agents"].items():
+ managed_agent_class = getattr(importlib.import_module("smolagents.agents"), managed_agent_class_name)
+ managed_agents.append(managed_agent_class.from_dict(agent_dict["managed_agents"][managed_agent_name]))
+ # Extract base agent parameters
+ agent_args = {
+ "model": model,
+ "tools": tools,
+ "prompt_templates": agent_dict.get("prompt_templates"),
+ "max_steps": agent_dict.get("max_steps"),
+ "verbosity_level": agent_dict.get("verbosity_level"),
+ "grammar": agent_dict.get("grammar"),
+ "planning_interval": agent_dict.get("planning_interval"),
+ "name": agent_dict.get("name"),
+ "description": agent_dict.get("description"),
+ }
+ # Filter out None values to use defaults from __init__
+ agent_args = {k: v for k, v in agent_args.items() if v is not None}
+ # Update with any additional kwargs
+ agent_args.update(kwargs)
+ # Create agent instance
+ return cls(**agent_args)
+
@classmethod
def from_hub(
cls,
repo_id: str,
- token: Optional[str] = None,
+ token: str | None = None,
trust_remote_code: bool = False,
**kwargs,
):
@@ -897,54 +884,43 @@ def from_hub(
return cls.from_folder(download_folder, **kwargs)
@classmethod
- def from_folder(cls, folder: Union[str, Path], **kwargs):
+ def from_folder(cls, folder: str | Path, **kwargs):
"""Loads an agent from a local folder.
Args:
folder (`str` or `Path`): The folder where the agent is saved.
**kwargs: Additional keyword arguments that will be passed to the agent's init.
"""
+ # Load agent.json
folder = Path(folder)
agent_dict = json.loads((folder / "agent.json").read_text())
- # Recursively get managed agents
+ # Load managed agents from their respective folders, recursively
managed_agents = []
- for managed_agent_name, managed_agent_class in agent_dict["managed_agents"].items():
- agent_cls = getattr(importlib.import_module("smolagents.agents"), managed_agent_class)
+ for managed_agent_name, managed_agent_class_name in agent_dict["managed_agents"].items():
+ agent_cls = getattr(importlib.import_module("smolagents.agents"), managed_agent_class_name)
managed_agents.append(agent_cls.from_folder(folder / "managed_agents" / managed_agent_name))
+ agent_dict["managed_agents"] = {}
+ # Load tools
tools = []
for tool_name in agent_dict["tools"]:
tool_code = (folder / "tools" / f"{tool_name}.py").read_text()
- tools.append(Tool.from_code(tool_code))
+ tools.append({"name": tool_name, "code": tool_code})
+ agent_dict["tools"] = tools
- model_class: Model = getattr(importlib.import_module("smolagents.models"), agent_dict["model"]["class"])
- model = model_class.from_dict(agent_dict["model"]["data"])
+ # Add managed agents to kwargs to override the empty list in from_dict
+ if managed_agents:
+ kwargs["managed_agents"] = managed_agents
- args = dict(
- model=model,
- tools=tools,
- managed_agents=managed_agents,
- name=agent_dict["name"],
- description=agent_dict["description"],
- max_steps=agent_dict["max_steps"],
- planning_interval=agent_dict["planning_interval"],
- grammar=agent_dict["grammar"],
- verbosity_level=agent_dict["verbosity_level"],
- )
- if cls.__name__ == "CodeAgent":
- args["additional_authorized_imports"] = agent_dict["authorized_imports"]
- args["use_e2b_executor"] = agent_dict["use_e2b_executor"]
- args["max_print_outputs_length"] = agent_dict["max_print_outputs_length"]
- args.update(kwargs)
- return cls(**args)
+ return cls.from_dict(agent_dict, **kwargs)
def push_to_hub(
self,
repo_id: str,
commit_message: str = "Upload agent",
- private: Optional[bool] = None,
- token: Optional[Union[bool, str]] = None,
+ private: bool | None = None,
+ token: bool | str | None = None,
create_pr: bool = False,
) -> str:
"""
@@ -1008,10 +984,10 @@ class ToolCallingAgent(MultiStepAgent):
def __init__(
self,
- tools: List[Tool],
- model: Callable[[List[Dict[str, str]]], ChatMessage],
- prompt_templates: Optional[PromptTemplates] = None,
- planning_interval: Optional[int] = None,
+ tools: list[Tool],
+ model: Callable[[list[dict[str, str]]], ChatMessage],
+ prompt_templates: PromptTemplates | None = None,
+ planning_interval: int | None = None,
**kwargs,
):
prompt_templates = prompt_templates or yaml.safe_load(
@@ -1032,34 +1008,49 @@ def initialize_system_prompt(self) -> str:
)
return system_prompt
- def step(self, memory_step: ActionStep) -> Union[None, Any]:
+ def step(self, memory_step: ActionStep) -> None | Any:
"""
Perform one step in the ReAct framework: the agent thinks, acts, and observes the result.
Returns None if the step is not final.
"""
memory_messages = self.write_memory_to_messages()
- self.input_messages = memory_messages
+ input_messages = memory_messages.copy()
# Add new step in logs
- memory_step.model_input_messages = memory_messages.copy()
+ memory_step.model_input_messages = input_messages
try:
- model_message: ChatMessage = self.model(
- memory_messages,
+ chat_message: ChatMessage = self.model(
+ input_messages,
+ stop_sequences=["Observation:", "Calling tools:"],
tools_to_call_from=list(self.tools.values()),
- stop_sequences=["Observation:"],
)
- memory_step.model_output_message = model_message
- if model_message.tool_calls is None or len(model_message.tool_calls) == 0:
- raise Exception("Model did not call any tools. Call `final_answer` tool to return a final answer.")
- tool_call = model_message.tool_calls[0]
- tool_name, tool_call_id = tool_call.function.name, tool_call.id
- tool_arguments = tool_call.function.arguments
+ memory_step.model_output_message = chat_message
+ model_output = chat_message.content
+ self.logger.log_markdown(
+ content=model_output if model_output else str(chat_message.raw),
+ title="Output message of the LLM:",
+ level=LogLevel.DEBUG,
+ )
+ memory_step.model_output_message.content = model_output
+ memory_step.model_output = model_output
except Exception as e:
- raise AgentGenerationError(f"Error in generating tool call with model:\n{e}", self.logger) from e
+ raise AgentGenerationError(f"Error while generating output:\n{e}", self.logger) from e
+ if chat_message.tool_calls is None or len(chat_message.tool_calls) == 0:
+ try:
+ chat_message = self.model.parse_tool_calls(chat_message)
+ except Exception as e:
+ raise AgentParsingError(f"Error while parsing tool call from model output: {e}", self.logger)
+ else:
+ for tool_call in chat_message.tool_calls:
+ tool_call.function.arguments = parse_json_if_needed(tool_call.function.arguments)
+ tool_call = chat_message.tool_calls[0] # type: ignore
+ tool_name, tool_call_id = tool_call.function.name, tool_call.id
+ tool_arguments = tool_call.function.arguments
+ memory_step.model_output = str(f"Called Tool: '{tool_name}' with arguments: {tool_arguments}")
memory_step.tool_calls = [ToolCall(name=tool_name, arguments=tool_arguments, id=tool_call_id)]
# Execute
@@ -1115,6 +1106,79 @@ def step(self, memory_step: ActionStep) -> Union[None, Any]:
memory_step.observations = updated_information
return None
+ def _substitute_state_variables(self, arguments: dict[str, str] | str) -> dict[str, Any] | str:
+ """Replace string values in arguments with their corresponding state values if they exist."""
+ if isinstance(arguments, dict):
+ return {
+ key: self.state.get(value, value) if isinstance(value, str) else value
+ for key, value in arguments.items()
+ }
+ return arguments
+
+ def execute_tool_call(self, tool_name: str, arguments: dict[str, str] | str) -> Any:
+ """
+ Execute a tool or managed agent with the provided arguments.
+
+ The arguments are replaced with the actual values from the state if they refer to state variables.
+
+ Args:
+ tool_name (`str`): Name of the tool or managed agent to execute.
+ arguments (dict[str, str] | str): Arguments passed to the tool call.
+ """
+ # Check if the tool exists
+ available_tools = {**self.tools, **self.managed_agents}
+ if tool_name not in available_tools:
+ raise AgentToolExecutionError(
+ f"Unknown tool {tool_name}, should be one of: {', '.join(available_tools)}.", self.logger
+ )
+
+ # Get the tool and substitute state variables in arguments
+ tool = available_tools[tool_name]
+ arguments = self._substitute_state_variables(arguments)
+ is_managed_agent = tool_name in self.managed_agents
+
+ try:
+ # Call tool with appropriate arguments
+ if isinstance(arguments, dict):
+ return tool(**arguments) if is_managed_agent else tool(**arguments, sanitize_inputs_outputs=True)
+ elif isinstance(arguments, str):
+ return tool(arguments) if is_managed_agent else tool(arguments, sanitize_inputs_outputs=True)
+ else:
+ raise TypeError(f"Unsupported arguments type: {type(arguments)}")
+
+ except TypeError as e:
+ # Handle invalid arguments
+ description = getattr(tool, "description", "No description")
+ if is_managed_agent:
+ error_msg = (
+ f"Invalid request to team member '{tool_name}' with arguments {json.dumps(arguments)}: {e}\n"
+ "You should call this team member with a valid request.\n"
+ f"Team member description: {description}"
+ )
+ else:
+ error_msg = (
+ f"Invalid call to tool '{tool_name}' with arguments {json.dumps(arguments)}: {e}\n"
+ "You should call this tool with correct input arguments.\n"
+ f"Expected inputs: {json.dumps(tool.inputs)}\n"
+ f"Returns output type: {tool.output_type}\n"
+ f"Tool description: '{description}'"
+ )
+ raise AgentToolCallError(error_msg, self.logger) from e
+
+ except Exception as e:
+ # Handle execution errors
+ if is_managed_agent:
+ error_msg = (
+ f"Error executing request to team member '{tool_name}' with arguments {json.dumps(arguments)}: {e}\n"
+ "Please try again or request to another team member"
+ )
+ else:
+ error_msg = (
+ f"Error executing tool '{tool_name}' with arguments {json.dumps(arguments)}: {type(e).__name__}: {e}\n"
+ "Please try again or use another tool"
+ )
+ raise AgentToolExecutionError(error_msg, self.logger) from e
+
class CodeAgent(MultiStepAgent):
"""
@@ -1122,32 +1186,34 @@ class CodeAgent(MultiStepAgent):
Args:
tools (`list[Tool]`): [`Tool`]s that the agent can use.
- model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions.
+ model (`Model`): Model that will generate the agent's actions.
prompt_templates ([`~agents.PromptTemplates`], *optional*): Prompt templates.
grammar (`dict[str, str]`, *optional*): Grammar used to parse the LLM output.
additional_authorized_imports (`list[str]`, *optional*): Additional authorized imports for the agent.
planning_interval (`int`, *optional*): Interval at which the agent will run a planning step.
- use_e2b_executor (`bool`, default `False`): Whether to use the E2B executor for remote code execution.
+ executor_type (`str`, default `"local"`): Which executor type to use between `"local"`, `"e2b"`, or `"docker"`.
+ executor_kwargs (`dict`, *optional*): Additional arguments to pass to initialize the executor.
max_print_outputs_length (`int`, *optional*): Maximum length of the print outputs.
+ stream_outputs (`bool`, *optional*, default `False`): Whether to stream outputs during execution.
**kwargs: Additional keyword arguments.
-
"""
def __init__(
self,
- tools: List[Tool],
- model: Callable[[List[Dict[str, str]]], ChatMessage],
- prompt_templates: Optional[PromptTemplates] = None,
- grammar: Optional[Dict[str, str]] = None,
- additional_authorized_imports: Optional[List[str]] = None,
- planning_interval: Optional[int] = None,
- use_e2b_executor: bool = False,
- max_print_outputs_length: Optional[int] = None,
+ tools: list[Tool],
+ model: Model,
+ prompt_templates: PromptTemplates | None = None,
+ grammar: dict[str, str] | None = None,
+ additional_authorized_imports: list[str] | None = None,
+ planning_interval: int | None = None,
+ executor_type: str | None = "local",
+ executor_kwargs: dict[str, Any] | None = None,
+ max_print_outputs_length: int | None = None,
+ stream_outputs: bool = False,
**kwargs,
):
self.additional_authorized_imports = additional_authorized_imports if additional_authorized_imports else []
- self.authorized_imports = list(set(BASE_BUILTIN_MODULES) | set(self.additional_authorized_imports))
- self.use_e2b_executor = use_e2b_executor
+ self.authorized_imports = sorted(set(BASE_BUILTIN_MODULES) | set(self.additional_authorized_imports))
self.max_print_outputs_length = max_print_outputs_length
prompt_templates = prompt_templates or yaml.safe_load(
importlib.resources.files("smolagents.prompts").joinpath("code_agent.yaml").read_text()
@@ -1160,30 +1226,36 @@ def __init__(
planning_interval=planning_interval,
**kwargs,
)
+ self.stream_outputs = stream_outputs
+ if self.stream_outputs and not hasattr(self.model, "generate_stream"):
+ raise ValueError(
+ "`stream_outputs` is set to True, but the model class implements no `generate_stream` method."
+ )
if "*" in self.additional_authorized_imports:
self.logger.log(
"Caution: you set an authorization for all imports, meaning your agent can decide to import any package it deems necessary. This might raise issues if the package is not installed in your environment.",
- 0,
- )
-
- if use_e2b_executor and len(self.managed_agents) > 0:
- raise Exception(
- f"You passed both {use_e2b_executor=} and some managed agents. Managed agents is not yet supported with remote code execution."
- )
-
- all_tools = {**self.tools, **self.managed_agents}
- if use_e2b_executor:
- self.python_executor = E2BExecutor(
- self.additional_authorized_imports,
- list(all_tools.values()),
- self.logger,
- )
- else:
- self.python_executor = LocalPythonInterpreter(
- self.additional_authorized_imports,
- all_tools,
- max_print_outputs_length=max_print_outputs_length,
+ level=LogLevel.INFO,
)
+ self.executor_type = executor_type or "local"
+ self.executor_kwargs = executor_kwargs or {}
+ self.python_executor = self.create_python_executor()
+
+ def create_python_executor(self) -> PythonExecutor:
+ match self.executor_type:
+ case "e2b" | "docker":
+ if self.managed_agents:
+ raise Exception("Managed agents are not yet supported with remote code execution.")
+ if self.executor_type == "e2b":
+ return E2BExecutor(self.additional_authorized_imports, self.logger, **self.executor_kwargs)
+ else:
+ return DockerExecutor(self.additional_authorized_imports, self.logger, **self.executor_kwargs)
+ case "local":
+ return LocalPythonExecutor(
+ self.additional_authorized_imports,
+ max_print_outputs_length=self.max_print_outputs_length,
+ )
+ case _: # if applicable
+ raise ValueError(f"Unsupported executor type: {self.executor_type}")
def initialize_system_prompt(self) -> str:
system_prompt = populate_template(
@@ -1200,37 +1272,60 @@ def initialize_system_prompt(self) -> str:
)
return system_prompt
- def step(self, memory_step: ActionStep) -> Union[None, Any]:
+ def step(self, memory_step: ActionStep) -> None | Any:
"""
Perform one step in the ReAct framework: the agent thinks, acts, and observes the result.
Returns None if the step is not final.
"""
memory_messages = self.write_memory_to_messages()
- self.input_messages = memory_messages.copy()
-
- # Add new step in logs
- memory_step.model_input_messages = memory_messages.copy()
+ input_messages = memory_messages.copy()
+ ### Generate model output ###
+ memory_step.model_input_messages = input_messages
try:
additional_args = {"grammar": self.grammar} if self.grammar is not None else {}
- chat_message: ChatMessage = self.model(
- self.input_messages,
- stop_sequences=["", "Observation:"],
- **additional_args,
- )
- memory_step.model_output_message = chat_message
- model_output = chat_message.content
+ if self.stream_outputs:
+ output_stream = self.model.generate_stream(
+ input_messages,
+ stop_sequences=["<end_code>", "Observation:", "Calling tools:"],
+ **additional_args,
+ )
+ output_text = ""
+ with Live("", console=self.logger.console, vertical_overflow="visible") as live:
+ for event in output_stream:
+ if event.content is not None:
+ output_text += event.content
+ live.update(Markdown(output_text))
+
+ model_output = output_text
+ chat_message = ChatMessage(role="assistant", content=model_output)
+ memory_step.model_output_message = chat_message
+ model_output = chat_message.content
+ else:
+ chat_message: ChatMessage = self.model(
+ input_messages,
+ stop_sequences=["<end_code>", "Observation:", "Calling tools:"],
+ **additional_args,
+ )
+ memory_step.model_output_message = chat_message
+ model_output = chat_message.content
+ self.logger.log_markdown(
+ content=model_output,
+ title="Output message of the LLM:",
+ level=LogLevel.DEBUG,
+ )
+
+ # This adds the <end_code> sequence to the history.
+ # This will nudge subsequent LLM calls to finish with <end_code>, thus efficiently stopping generation.
+ if model_output and model_output.strip().endswith("```"):
+ model_output += "<end_code>"
+ memory_step.model_output_message.content = model_output
+
memory_step.model_output = model_output
except Exception as e:
raise AgentGenerationError(f"Error in generating model output:\n{e}", self.logger) from e
- self.logger.log_markdown(
- content=model_output,
- title="Output message of the LLM:",
- level=LogLevel.DEBUG,
- )
-
- # Parse
+ ### Parse output ###
try:
code_action = fix_final_answer_code(parse_code_blobs(model_output))
except Exception as e:
@@ -1245,14 +1340,11 @@ def step(self, memory_step: ActionStep) -> Union[None, Any]:
)
]
- # Execute
+ ### Execute action ###
self.logger.log_code(title="Executing parsed code:", content=code_action, level=LogLevel.INFO)
is_final_answer = False
try:
- output, execution_logs, is_final_answer = self.python_executor(
- code_action,
- self.state,
- )
+ output, execution_logs, is_final_answer = self.python_executor(code_action)
execution_outputs_console = []
if len(execution_logs) > 0:
execution_outputs_console += [
@@ -1291,3 +1383,41 @@ def step(self, memory_step: ActionStep) -> Union[None, Any]:
self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO)
memory_step.action_output = output
return output if is_final_answer else None
+
+ def to_dict(self) -> dict[str, Any]:
+ """Convert the agent to a dictionary representation.
+
+ Returns:
+ `dict`: Dictionary representation of the agent.
+ """
+ agent_dict = super().to_dict()
+ agent_dict["authorized_imports"] = self.authorized_imports
+ agent_dict["executor_type"] = self.executor_type
+ agent_dict["executor_kwargs"] = self.executor_kwargs
+ agent_dict["max_print_outputs_length"] = self.max_print_outputs_length
+ return agent_dict
+
+ @classmethod
+ def from_dict(cls, agent_dict: dict[str, Any], **kwargs) -> "CodeAgent":
+ """Create CodeAgent from a dictionary representation.
+
+ Args:
+ agent_dict (`dict[str, Any]`): Dictionary representation of the agent.
+ **kwargs: Additional keyword arguments that will override agent_dict values.
+
+ Returns:
+ `CodeAgent`: Instance of the CodeAgent class.
+ """
+ # Add CodeAgent-specific parameters to kwargs
+ code_agent_kwargs = {
+ "additional_authorized_imports": agent_dict.get("authorized_imports"),
+ "executor_type": agent_dict.get("executor_type"),
+ "executor_kwargs": agent_dict.get("executor_kwargs"),
+ "max_print_outputs_length": agent_dict.get("max_print_outputs_length"),
+ }
+ # Filter out None values
+ code_agent_kwargs = {k: v for k, v in code_agent_kwargs.items() if v is not None}
+ # Update with any additional kwargs
+ code_agent_kwargs.update(kwargs)
+ # Call the parent class's from_dict method
+ return super().from_dict(agent_dict, **code_agent_kwargs)
diff --git a/src/smolagents/cli.py b/src/smolagents/cli.py
index bcf984532..ccb8295ef 100644
--- a/src/smolagents/cli.py
+++ b/src/smolagents/cli.py
@@ -19,15 +19,15 @@
from dotenv import load_dotenv
-from smolagents import CodeAgent, HfApiModel, LiteLLMModel, Model, OpenAIServerModel, Tool, TransformersModel
+from smolagents import CodeAgent, InferenceClientModel, LiteLLMModel, Model, OpenAIServerModel, Tool, TransformersModel
from smolagents.default_tools import TOOL_MAPPING
leopard_prompt = "How many seconds would it take for a leopard at full speed to run through Pont des Arts?"
-def parse_arguments(description):
- parser = argparse.ArgumentParser(description=description)
+def parse_arguments():
+ parser = argparse.ArgumentParser(description="Run a CodeAgent with all specified parameters")
parser.add_argument(
"prompt",
type=str,
@@ -38,8 +38,8 @@ def parse_arguments(description):
parser.add_argument(
"--model-type",
type=str,
- default="HfApiModel",
- help="The model type to use (e.g., HfApiModel, OpenAIServerModel, LiteLLMModel, TransformersModel)",
+ default="InferenceClientModel",
+ help="The model type to use (e.g., InferenceClientModel, OpenAIServerModel, LiteLLMModel, TransformersModel)",
)
parser.add_argument(
"--model-id",
@@ -66,6 +66,12 @@ def parse_arguments(description):
help="The verbosity level, as an int in [0, 1, 2].",
)
group = parser.add_argument_group("api options", "Options for API-based model types")
+ group.add_argument(
+ "--provider",
+ type=str,
+ default=None,
+ help="The inference provider to use for the model",
+ )
group.add_argument(
"--api-base",
type=str,
@@ -79,7 +85,13 @@ def parse_arguments(description):
return parser.parse_args()
-def load_model(model_type: str, model_id: str, api_base: str | None, api_key: str | None) -> Model:
+def load_model(
+ model_type: str,
+ model_id: str,
+ api_base: str | None = None,
+ api_key: str | None = None,
+ provider: str | None = None,
+) -> Model:
if model_type == "OpenAIServerModel":
return OpenAIServerModel(
api_key=api_key or os.getenv("FIREWORKS_API_KEY"),
@@ -89,29 +101,37 @@ def load_model(model_type: str, model_id: str, api_base: str | None, api_key: st
elif model_type == "LiteLLMModel":
return LiteLLMModel(
model_id=model_id,
- api_key=api_key or os.getenv("OPENAI_API_KEY"),
+ api_key=api_key,
api_base=api_base,
)
elif model_type == "TransformersModel":
- return TransformersModel(model_id=model_id, device_map="auto", flatten_messages_as_text=False)
- elif model_type == "HfApiModel":
- return HfApiModel(
- token=api_key or os.getenv("HF_API_KEY"),
+ return TransformersModel(model_id=model_id, device_map="auto")
+ elif model_type == "InferenceClientModel":
+ return InferenceClientModel(
model_id=model_id,
+ token=api_key or os.getenv("HF_API_KEY"),
+ provider=provider,
)
else:
raise ValueError(f"Unsupported model type: {model_type}")
-def main():
+def run_smolagent(
+ prompt: str,
+ tools: list[str],
+ model_type: str,
+ model_id: str,
+ api_base: str | None = None,
+ api_key: str | None = None,
+ imports: list[str] | None = None,
+ provider: str | None = None,
+) -> None:
load_dotenv()
- args = parse_arguments(description="Run a CodeAgent with all specified parameters")
-
- model = load_model(args.model_type, args.model_id, args.api_base, args.api_key)
+ model = load_model(model_type, model_id, api_base=api_base, api_key=api_key, provider=provider)
available_tools = []
- for tool_name in args.tools:
+ for tool_name in tools:
if "/" in tool_name:
available_tools.append(Tool.from_space(tool_name))
else:
@@ -120,10 +140,24 @@ def main():
else:
raise ValueError(f"Tool {tool_name} is not recognized either as a default tool or a Space.")
- print(f"Running agent with these tools: {args.tools}")
- agent = CodeAgent(tools=available_tools, model=model, additional_authorized_imports=args.imports)
+ print(f"Running agent with these tools: {tools}")
+ agent = CodeAgent(tools=available_tools, model=model, additional_authorized_imports=imports)
+
+ agent.run(prompt)
- agent.run(args.prompt)
+
+def main() -> None:
+ args = parse_arguments()
+ run_smolagent(
+ args.prompt,
+ args.tools,
+ args.model_type,
+ args.model_id,
+ provider=args.provider,
+ api_base=args.api_base,
+ api_key=args.api_key,
+ imports=args.imports,
+ )
if __name__ == "__main__":
diff --git a/src/smolagents/default_tools.py b/src/smolagents/default_tools.py
index 2ea7834f6..d12a38d5a 100644
--- a/src/smolagents/default_tools.py
+++ b/src/smolagents/default_tools.py
@@ -14,9 +14,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import re
from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from typing import Any
from .local_python_executor import (
BASE_BUILTIN_MODULES,
@@ -29,7 +28,7 @@
@dataclass
class PreTool:
name: str
- inputs: Dict[str, str]
+ inputs: dict[str, str]
output_type: type
task: str
description: str
@@ -57,7 +56,7 @@ def __init__(self, *args, authorized_imports=None, **kwargs):
"type": "string",
"description": (
"The code snippet to evaluate. All variables used in this snippet must be defined in this same snippet, "
- f"else you will get an error. This code can only import the following python libraries: {authorized_imports}."
+ f"else you will get an error. This code can only import the following python libraries: {self.authorized_imports}."
),
}
}
@@ -138,7 +137,7 @@ class GoogleSearchTool(Tool):
output_type = "string"
def __init__(self, provider: str = "serpapi"):
- super().__init__(self)
+ super().__init__()
import os
self.provider = provider
@@ -152,7 +151,7 @@ def __init__(self, provider: str = "serpapi"):
if self.api_key is None:
raise ValueError(f"Missing API key. Make sure you have '{api_key_env_name}' in your env variables.")
- def forward(self, query: str, filter_year: Optional[int] = None) -> str:
+ def forward(self, query: str, filter_year: int | None = None) -> str:
import requests
if self.provider == "serpapi":
@@ -224,8 +223,14 @@ class VisitWebpageTool(Tool):
}
output_type = "string"
+ def __init__(self, max_output_length: int = 40000):
+ super().__init__()
+ self.max_output_length = max_output_length
+
def forward(self, url: str) -> str:
try:
+ import re
+
import requests
from markdownify import markdownify
from requests.exceptions import RequestException
@@ -246,7 +251,7 @@ def forward(self, url: str) -> str:
# Remove multiple line breaks
markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
- return truncate_content(markdown_content, 10000)
+ return truncate_content(markdown_content, self.max_output_length)
except requests.exceptions.Timeout:
return "The request timed out. Please try again later or check the URL."
@@ -256,6 +261,102 @@ def forward(self, url: str) -> str:
return f"An unexpected error occurred: {str(e)}"
+class WikipediaSearchTool(Tool):
+ """
+ WikipediaSearchTool searches Wikipedia and returns a summary or full text of the given topic, along with the page URL.
+
+ Attributes:
+ user_agent (str): A custom user-agent string to identify the project. This is required as per Wikipedia API policies, read more here: http://github.com/martin-majlis/Wikipedia-API/blob/master/README.rst
+ language (str): The language in which to retrieve Wikipedia articles.
+ http://meta.wikimedia.org/wiki/List_of_Wikipedias
+ content_type (str): Defines the content to fetch. Can be "summary" for a short summary or "text" for the full article.
+ extract_format (str): Defines the output format. Can be `"WIKI"` or `"HTML"`.
+
+ Example:
+ >>> from smolagents import CodeAgent, InferenceClientModel, WikipediaSearchTool
+ >>> agent = CodeAgent(
+ >>> tools=[
+ >>> WikipediaSearchTool(
+ >>> user_agent="MyResearchBot (myemail@example.com)",
+ >>> language="en",
+ >>> content_type="summary", # or "text"
+ >>> extract_format="WIKI",
+ >>> )
+ >>> ],
+ >>> model=InferenceClientModel(),
+ >>> )
+ >>> agent.run("Python_(programming_language)")
+ """
+
+ name = "wikipedia_search"
+ description = "Searches Wikipedia and returns a summary or full text of the given topic, along with the page URL."
+ inputs = {
+ "query": {
+ "type": "string",
+ "description": "The topic to search on Wikipedia.",
+ }
+ }
+ output_type = "string"
+
+ def __init__(
+ self,
+ user_agent: str = "Smolagents (myemail@example.com)",
+ language: str = "en",
+ content_type: str = "text",
+ extract_format: str = "WIKI",
+ ):
+ super().__init__()
+ try:
+ import wikipediaapi
+ except ImportError as e:
+ raise ImportError(
+ "You must install `wikipedia-api` to run this tool: for instance run `pip install wikipedia-api`"
+ ) from e
+ if not user_agent:
+ raise ValueError("User-agent is required. Provide a meaningful identifier for your project.")
+
+ self.user_agent = user_agent
+ self.language = language
+ self.content_type = content_type
+
+ # Map string format to wikipediaapi.ExtractFormat
+ extract_format_map = {
+ "WIKI": wikipediaapi.ExtractFormat.WIKI,
+ "HTML": wikipediaapi.ExtractFormat.HTML,
+ }
+
+ if extract_format not in extract_format_map:
+ raise ValueError("Invalid extract_format. Choose between 'WIKI' or 'HTML'.")
+
+ self.extract_format = extract_format_map[extract_format]
+
+ self.wiki = wikipediaapi.Wikipedia(
+ user_agent=self.user_agent, language=self.language, extract_format=self.extract_format
+ )
+
+ def forward(self, query: str) -> str:
+ try:
+ page = self.wiki.page(query)
+
+ if not page.exists():
+ return f"No Wikipedia page found for '{query}'. Try a different query."
+
+ title = page.title
+ url = page.fullurl
+
+ if self.content_type == "summary":
+ text = page.summary
+ elif self.content_type == "text":
+ text = page.text
+ else:
+ return "⚠️ Invalid `content_type`. Use either 'summary' or 'text'."
+
+ return f"✅ **Wikipedia Page:** {title}\n\n**Content:** {text}\n\n🔗 **Read more:** {url}"
+
+ except Exception as e:
+ return f"Error fetching Wikipedia summary: {str(e)}"
+
+
class SpeechToTextTool(PipelineTool):
default_checkpoint = "openai/whisper-large-v3-turbo"
description = "This is a tool that transcribes an audio into text. It returns the transcribed text."
@@ -307,5 +408,6 @@ def decode(self, outputs):
"DuckDuckGoSearchTool",
"GoogleSearchTool",
"VisitWebpageTool",
+ "WikipediaSearchTool",
"SpeechToTextTool",
]
diff --git a/src/smolagents/e2b_executor.py b/src/smolagents/e2b_executor.py
deleted file mode 100644
index 10b0170ee..000000000
--- a/src/smolagents/e2b_executor.py
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import base64
-import pickle
-import re
-import textwrap
-from io import BytesIO
-from typing import Any, List, Tuple
-
-from PIL import Image
-
-from .tool_validation import validate_tool_attributes
-from .tools import Tool
-from .utils import BASE_BUILTIN_MODULES, instance_to_source
-
-
-try:
- from dotenv import load_dotenv
-
- load_dotenv()
-except ModuleNotFoundError:
- pass
-
-
-class E2BExecutor:
- def __init__(self, additional_imports: List[str], tools: List[Tool], logger):
- self.logger = logger
- try:
- from e2b_code_interpreter import Sandbox
- except ModuleNotFoundError:
- raise ModuleNotFoundError(
- """Please install 'e2b' extra to use E2BExecutor: `pip install "smolagents[e2b]"`"""
- )
- self.logger = logger
- self.logger.log("Initializing E2B executor, hold on...")
-
- self.custom_tools = {}
- self.final_answer = False
- self.final_answer_pattern = re.compile(r"final_answer\((.*?)\)")
- self.sbx = Sandbox() # "qywp2ctmu2q7jzprcf4j")
- # TODO: validate installing agents package or not
- # print("Installing agents package on remote executor...")
- # self.sbx.commands.run(
- # "pip install git+https://github.com/huggingface/smolagents.git",
- # timeout=300
- # )
- # print("Installation of agents package finished.")
- additional_imports = additional_imports + ["smolagents"]
- if len(additional_imports) > 0:
- execution = self.sbx.commands.run("pip install " + " ".join(additional_imports))
- if execution.error:
- raise Exception(f"Error installing dependencies: {execution.error}")
- else:
- logger.log(f"Installation of {additional_imports} succeeded!", 0)
-
- tool_codes = []
- for tool in tools:
- validate_tool_attributes(tool.__class__, check_imports=False)
- tool_code = instance_to_source(tool, base_cls=Tool)
- tool_code = tool_code.replace("from smolagents.tools import Tool", "")
- tool_code += f"\n{tool.name} = {tool.__class__.__name__}()\n"
- tool_codes.append(tool_code)
-
- tool_definition_code = "\n".join([f"import {module}" for module in BASE_BUILTIN_MODULES])
- tool_definition_code += textwrap.dedent(
- """
- class Tool:
- def __call__(self, *args, **kwargs):
- return self.forward(*args, **kwargs)
-
- def forward(self, *args, **kwargs):
- pass # to be implemented in child class
- """
- )
- tool_definition_code += "\n\n".join(tool_codes)
-
- tool_definition_execution = self.run_code_raise_errors(tool_definition_code)
- self.logger.log(tool_definition_execution.logs)
-
- def run_code_raise_errors(self, code: str):
- if self.final_answer_pattern.search(code) is not None:
- self.final_answer = True
- execution = self.sbx.run_code(
- code,
- )
- if execution.error:
- execution_logs = "\n".join([str(log) for log in execution.logs.stdout])
- logs = execution_logs
- logs += "Executing code yielded an error:"
- logs += execution.error.name
- logs += execution.error.value
- logs += execution.error.traceback
- raise ValueError(logs)
- return execution
-
- def __call__(self, code_action: str, additional_args: dict) -> Tuple[Any, Any]:
- if len(additional_args) > 0:
- # Pickle additional_args to server
- import tempfile
-
- with tempfile.NamedTemporaryFile() as f:
- pickle.dump(additional_args, f)
- f.flush()
- with open(f.name, "rb") as file:
- self.sbx.files.write("/home/state.pkl", file)
- remote_unloading_code = """import pickle
-import os
-print("File path", os.path.getsize('/home/state.pkl'))
-with open('/home/state.pkl', 'rb') as f:
- pickle_dict = pickle.load(f)
-locals().update({key: value for key, value in pickle_dict.items()})
-"""
- execution = self.run_code_raise_errors(remote_unloading_code)
- execution_logs = "\n".join([str(log) for log in execution.logs.stdout])
- self.logger.log(execution_logs, 1)
-
- execution = self.run_code_raise_errors(code_action)
- execution_logs = "\n".join([str(log) for log in execution.logs.stdout])
- if not execution.results:
- return None, execution_logs, self.final_answer
- else:
- for result in execution.results:
- if result.is_main_result:
- for attribute_name in ["jpeg", "png"]:
- if getattr(result, attribute_name) is not None:
- image_output = getattr(result, attribute_name)
- decoded_bytes = base64.b64decode(image_output.encode("utf-8"))
- return Image.open(BytesIO(decoded_bytes)), execution_logs, self.final_answer
- for attribute_name in [
- "chart",
- "data",
- "html",
- "javascript",
- "json",
- "latex",
- "markdown",
- "pdf",
- "svg",
- "text",
- ]:
- if getattr(result, attribute_name) is not None:
- return getattr(result, attribute_name), execution_logs, self.final_answer
- if self.final_answer:
- raise ValueError("No main result returned by executor!")
- return None, execution_logs, False
-
-
-__all__ = ["E2BExecutor"]
diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py
index 11094a52c..83fbaff3d 100644
--- a/src/smolagents/gradio_ui.py
+++ b/src/smolagents/gradio_ui.py
@@ -16,23 +16,39 @@
import os
import re
import shutil
-from typing import Optional
-from smolagents.agent_types import AgentAudio, AgentImage, AgentText, handle_agent_output_types
-from smolagents.agents import ActionStep, MultiStepAgent
-from smolagents.memory import MemoryStep
+from smolagents.agent_types import AgentAudio, AgentImage, AgentText
+from smolagents.agents import MultiStepAgent, PlanningStep
+from smolagents.memory import ActionStep, FinalAnswerStep, MemoryStep
from smolagents.utils import _is_package_available
+def get_step_footnote_content(step_log: MemoryStep, step_name: str) -> str:
+ """Get a footnote string for a step log with duration and token information"""
+ step_footnote = f"**{step_name}**"
+ if hasattr(step_log, "input_token_count") and hasattr(step_log, "output_token_count"):
+ token_str = f" | Input tokens:{step_log.input_token_count:,} | Output tokens: {step_log.output_token_count:,}"
+ step_footnote += token_str
+ if hasattr(step_log, "duration"):
+ step_duration = f" | Duration: {round(float(step_log.duration), 2)}" if step_log.duration else None
+ step_footnote += step_duration
+ step_footnote_content = f"""{step_footnote} """
+ return step_footnote_content
+
+
def pull_messages_from_step(
step_log: MemoryStep,
):
"""Extract ChatMessage objects from agent steps with proper nesting"""
+ if not _is_package_available("gradio"):
+ raise ModuleNotFoundError(
+ "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`"
+ )
import gradio as gr
if isinstance(step_log, ActionStep):
# Output the step number
- step_number = f"Step {step_log.step_number}" if step_log.step_number is not None else ""
+ step_number = f"Step {step_log.step_number}" if step_log.step_number is not None else "Step"
yield gr.ChatMessage(role="assistant", content=f"**{step_number}**")
# First yield the thought/reasoning from the LLM
@@ -74,76 +90,98 @@ def pull_messages_from_step(
metadata={
"title": f"๐ ๏ธ Used tool {first_tool_call.name}",
"id": parent_id,
- "status": "pending",
+ "status": "done",
},
)
yield parent_message_tool
- # Nesting execution logs under the tool call if they exist
- if hasattr(step_log, "observations") and (
- step_log.observations is not None and step_log.observations.strip()
- ): # Only yield execution logs if there's actual content
- log_content = step_log.observations.strip()
- if log_content:
- log_content = re.sub(r"^Execution logs:\s*", "", log_content)
- yield gr.ChatMessage(
- role="assistant",
- content=f"{log_content}",
- metadata={"title": "๐ Execution Logs", "parent_id": parent_id, "status": "done"},
- )
-
- # Nesting any errors under the tool call
- if hasattr(step_log, "error") and step_log.error is not None:
+ # Display execution logs if they exist
+ if hasattr(step_log, "observations") and (
+ step_log.observations is not None and step_log.observations.strip()
+ ): # Only yield execution logs if there's actual content
+ log_content = step_log.observations.strip()
+ if log_content:
+ log_content = re.sub(r"^Execution logs:\s*", "", log_content)
yield gr.ChatMessage(
role="assistant",
- content=str(step_log.error),
- metadata={"title": "๐ฅ Error", "parent_id": parent_id, "status": "done"},
+ content=f"```bash\n{log_content}\n",
+ metadata={"title": "๐ Execution Logs", "status": "done"},
)
- # Update parent message metadata to done status without yielding a new message
- parent_message_tool.metadata["status"] = "done"
+ # Display any errors
+ if hasattr(step_log, "error") and step_log.error is not None:
+ yield gr.ChatMessage(
+ role="assistant",
+ content=str(step_log.error),
+ metadata={"title": "๐ฅ Error", "status": "done"},
+ )
+
+ # Update parent message metadata to done status without yielding a new message
+ if getattr(step_log, "observations_images", []):
+ for image in step_log.observations_images:
+ path_image = AgentImage(image).to_string()
+ yield gr.ChatMessage(
+ role="assistant",
+ content={"path": path_image, "mime_type": f"image/{path_image.split('.')[-1]}"},
+ metadata={"title": "๐ผ๏ธ Output Image", "status": "done"},
+ )
# Handle standalone errors but not from tool calls
- elif hasattr(step_log, "error") and step_log.error is not None:
+ if hasattr(step_log, "error") and step_log.error is not None:
yield gr.ChatMessage(role="assistant", content=str(step_log.error), metadata={"title": "๐ฅ Error"})
- # Calculate duration and token information
- step_footnote = f"{step_number}"
- if hasattr(step_log, "input_token_count") and hasattr(step_log, "output_token_count"):
- token_str = (
- f" | Input-tokens:{step_log.input_token_count:,} | Output-tokens:{step_log.output_token_count:,}"
+ yield gr.ChatMessage(role="assistant", content=get_step_footnote_content(step_log, step_number))
+ yield gr.ChatMessage(role="assistant", content="-----", metadata={"status": "done"})
+
+ elif isinstance(step_log, PlanningStep):
+ yield gr.ChatMessage(role="assistant", content="**Planning step**")
+ yield gr.ChatMessage(role="assistant", content=step_log.plan)
+ yield gr.ChatMessage(role="assistant", content=get_step_footnote_content(step_log, "Planning step"))
+ yield gr.ChatMessage(role="assistant", content="-----", metadata={"status": "done"})
+
+ elif isinstance(step_log, FinalAnswerStep):
+ final_answer = step_log.final_answer
+ if isinstance(final_answer, AgentText):
+ yield gr.ChatMessage(
+ role="assistant",
+ content=f"**Final answer:**\n{final_answer.to_string()}\n",
+ )
+ elif isinstance(final_answer, AgentImage):
+ yield gr.ChatMessage(
+ role="assistant",
+ content={"path": final_answer.to_string(), "mime_type": "image/png"},
)
- step_footnote += token_str
- if hasattr(step_log, "duration"):
- step_duration = f" | Duration: {round(float(step_log.duration), 2)}" if step_log.duration else None
- step_footnote += step_duration
- step_footnote = f"""{step_footnote} """
- yield gr.ChatMessage(role="assistant", content=f"{step_footnote}")
- yield gr.ChatMessage(role="assistant", content="-----")
+ elif isinstance(final_answer, AgentAudio):
+ yield gr.ChatMessage(
+ role="assistant",
+ content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
+ )
+ else:
+ yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}")
+
+ else:
+ raise ValueError(f"Unsupported step type: {type(step_log)}")
def stream_to_gradio(
agent,
task: str,
+ task_images: list | None = None,
reset_agent_memory: bool = False,
- additional_args: Optional[dict] = None,
+ additional_args: dict | None = None,
):
"""Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages."""
- if not _is_package_available("gradio"):
- raise ModuleNotFoundError(
- "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`"
- )
- import gradio as gr
-
total_input_tokens = 0
total_output_tokens = 0
- for step_log in agent.run(task, stream=True, reset=reset_agent_memory, additional_args=additional_args):
+ for step_log in agent.run(
+ task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args
+ ):
# Track tokens if model provides them
if getattr(agent.model, "last_input_token_count", None) is not None:
total_input_tokens += agent.model.last_input_token_count
total_output_tokens += agent.model.last_output_token_count
- if isinstance(step_log, ActionStep):
+ if isinstance(step_log, (ActionStep, PlanningStep)):
step_log.input_token_count = agent.model.last_input_token_count
step_log.output_token_count = agent.model.last_output_token_count
@@ -152,27 +190,6 @@ def stream_to_gradio(
):
yield message
- final_answer = step_log # Last log is the run's final_answer
- final_answer = handle_agent_output_types(final_answer)
-
- if isinstance(final_answer, AgentText):
- yield gr.ChatMessage(
- role="assistant",
- content=f"**Final answer:**\n{final_answer.to_string()}\n",
- )
- elif isinstance(final_answer, AgentImage):
- yield gr.ChatMessage(
- role="assistant",
- content={"path": final_answer.to_string(), "mime_type": "image/png"},
- )
- elif isinstance(final_answer, AgentAudio):
- yield gr.ChatMessage(
- role="assistant",
- content={"path": final_answer.to_string(), "mime_type": "audio/wav"},
- )
- else:
- yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}")
-
class GradioUI:
"""A one-line interface to launch your agent in Gradio"""
@@ -184,19 +201,32 @@ def __init__(self, agent: MultiStepAgent, file_upload_folder: str | None = None)
)
self.agent = agent
self.file_upload_folder = file_upload_folder
+ self.name = getattr(agent, "name") or "Agent interface"
+ self.description = getattr(agent, "description", None)
if self.file_upload_folder is not None:
if not os.path.exists(file_upload_folder):
os.mkdir(file_upload_folder)
- def interact_with_agent(self, prompt, messages):
+ def interact_with_agent(self, prompt, messages, session_state):
import gradio as gr
- messages.append(gr.ChatMessage(role="user", content=prompt))
- yield messages
- for msg in stream_to_gradio(self.agent, task=prompt, reset_agent_memory=False):
- messages.append(msg)
+ # Get the agent type from the template agent
+ if "agent" not in session_state:
+ session_state["agent"] = self.agent
+
+ try:
+ messages.append(gr.ChatMessage(role="user", content=prompt))
+ yield messages
+
+ for msg in stream_to_gradio(session_state["agent"], task=prompt, reset_agent_memory=False):
+ messages.append(msg)
+ yield messages
+
+ yield messages
+ except Exception as e:
+ print(f"Error in interaction: {str(e)}")
+ messages.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}"))
yield messages
- yield messages
def upload_file(self, file, file_uploads_log, allowed_file_types=None):
"""
@@ -227,6 +257,8 @@ def upload_file(self, file, file_uploads_log, allowed_file_types=None):
return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path]
def log_user_message(self, text_input, file_uploads_log):
+ import gradio as gr
+
return (
text_input
+ (
@@ -235,14 +267,56 @@ def log_user_message(self, text_input, file_uploads_log):
else ""
),
"",
+ gr.Button(interactive=False),
)
- def launch(self, share: bool = False, **kwargs):
+ def launch(self, share: bool = True, **kwargs):
+ self.create_app().launch(debug=True, share=share, **kwargs)
+
+ def create_app(self):
import gradio as gr
- with gr.Blocks(fill_height=True) as demo:
+ with gr.Blocks(theme="ocean", fill_height=True) as demo:
+ # Add session state to store session-specific data
+ session_state = gr.State({})
stored_messages = gr.State([])
file_uploads_log = gr.State([])
+
+ with gr.Sidebar():
+ gr.Markdown(
+ f"# {self.name.replace('_', ' ').capitalize()}"
+ "\n> This web ui allows you to interact with a `smolagents` agent that can use tools and execute steps to complete tasks."
+ + (f"\n\n**Agent description:**\n{self.description}" if self.description else "")
+ )
+
+ with gr.Group():
+ gr.Markdown("**Your request**", container=True)
+ text_input = gr.Textbox(
+ lines=3,
+ label="Chat Message",
+ container=False,
+ placeholder="Enter your prompt here and press Shift+Enter or press the button",
+ )
+ submit_btn = gr.Button("Submit", variant="primary")
+
+ # If an upload folder is provided, enable the upload feature
+ if self.file_upload_folder is not None:
+ upload_file = gr.File(label="Upload a file")
+ upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False)
+ upload_file.change(
+ self.upload_file,
+ [upload_file, file_uploads_log],
+ [upload_status, file_uploads_log],
+ )
+
+ gr.HTML("