diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..158f0acc0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,26 @@ +--- +name: Bug report +about: The clearer your bug report, the faster it will be fixed! +title: "[BUG]" +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**Code to reproduce the error** +The simplest code snippet that produces your bug. + +**Error logs (if any)** +Provide error logs if there are any. + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Packages version:** +Run `pip freeze | grep smolagents` and paste it here. + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md new file mode 100644 index 000000000..48d5f81fa --- /dev/null +++ b/.github/ISSUE_TEMPLATE/custom.md @@ -0,0 +1,10 @@ +--- +name: Custom issue template +about: Describe this issue template's purpose here. +title: '' +labels: '' +assignees: '' + +--- + + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..7a6374252 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,23 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Is this not possible with the current options.** +Make sure to consider if what you're requesting can be done with current abstractions. 
+ +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index ab96066c3..de6d3174b 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -20,6 +20,7 @@ jobs: commit_sha: ${{ github.sha }} package: smolagents languages: en + notebook_folder: smolagents_doc # additional_args: --not_python_module # use this arg if repository is documentation only secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b379dee63..c16a90a72 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -34,6 +34,11 @@ jobs: # Run all tests separately for individual feedback # Use 'if success() || failure()' so that all tests are run even if one failed # See https://stackoverflow.com/a/62112985 + - name: Import tests + run: | + uv run pytest ./tests/test_import.py + if: ${{ success() || failure() }} + - name: Agent tests run: | uv run pytest ./tests/test_agents.py @@ -58,14 +63,24 @@ jobs: uv run pytest ./tests/test_models.py if: ${{ success() || failure() }} + - name: Memory tests + run: | + uv run pytest ./tests/test_memory.py + if: ${{ success() || failure() }} + - name: Monitoring tests run: | uv run pytest ./tests/test_monitoring.py if: ${{ success() || failure() }} - - name: Python interpreter tests + - name: Local Python executor tests run: | - uv run pytest ./tests/test_python_interpreter.py + uv run pytest ./tests/test_local_python_executor.py + if: ${{ success() || failure() }} + + - name: E2B executor tests + run: | + uv run pytest ./tests/test_e2b_executor.py if: ${{ success() || failure() }} - name: Search tests @@ -78,6 +93,11 @@ jobs: uv run pytest 
./tests/test_tools.py if: ${{ success() || failure() }} + - name: Tool validation tests + run: | + uv run pytest ./tests/test_tool_validation.py + if: ${{ success() || failure() }} + - name: Types tests run: | uv run pytest ./tests/test_types.py @@ -88,6 +108,11 @@ jobs: uv run pytest ./tests/test_utils.py if: ${{ success() || failure() }} + - name: Gradio UI tests + run: | + uv run pytest ./tests/test_gradio_ui.py + if: ${{ success() || failure() }} + - name: Function type hints utils tests run: | uv run pytest ./tests/test_function_type_hints_utils.py diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0e346b751..f63be4fcc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,19 +33,17 @@ However you choose to contribute, please be mindful and respect our There are several ways you can contribute to smolagents. -* Fix outstanding issues with the existing code. * Submit issues related to bugs or desired new features. * Contribute to the examples or to the documentation. +* Fix outstanding issues with the existing code. > All contributions are equally valuable to the community. 🥰 -## Fixing outstanding issues - -If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) and open -a Pull Request! - ## Submitting a bug-related issue or feature request +At any moment, feel welcome to open an issue, citing your exact error traces and package versions if it's a bug. +It's often even better to open a PR with your proposed fixes/changes! + Do your best to follow these guidelines when submitting a bug-related issue or a feature request. It will make it easier for us to come back to you quickly and with good feedback. 
@@ -89,10 +87,41 @@ We're always looking for improvements to the documentation that make it more cle how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be happy to make the changes or help you make a contribution if you're interested! +## Fixing outstanding issues + +If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) and open +a Pull Request! + +### Making code changes + +To install dev dependencies, run: +``` +pip install -e ".[dev]" +``` + +When making changes to the codebase, please check that it follows the repo's code quality requirements by running: +To check code quality of the source code: +``` +make quality +``` + +If the checks fail, you can run the formatter with: +``` +make style +``` + +And commit the changes. + +To run tests locally, run this command: +```bash +make test +``` + + ## I want to become a maintainer of the project. How do I get there? smolagents is a project led and managed by Hugging Face. We are more than happy to have motivated individuals from other organizations join us as maintainers with the goal of helping smolagents make a dent in the world of Agents. -If you are such an individual (or organization), please reach out to us and let's collaborate. \ No newline at end of file +If you are such an individual (or organization), please reach out to us and let's collaborate. diff --git a/README.md b/README.md index 6da9d4d23..fb853b06e 100644 --- a/README.md +++ b/README.md @@ -25,21 +25,24 @@ limitations under the License.

- Hugging Face mascot as James Bond -

smolagents - a smol library to build great agents!

+ Hugging Face mascot as James Bond +

A smol library to build great agents!

`smolagents` is a library that enables you to run powerful agents in a few lines of code. It offers: -✨ **Simplicity**: the logic for agents fits in ~thousand lines of code (see [agents.py](https://github.com/huggingface/smolagents/blob/main/src/smolagents/agents.py)). We kept abstractions to their minimal shape above raw code! +✨ **Simplicity**: the logic for agents fits in ~1,000 lines of code (see [agents.py](https://github.com/huggingface/smolagents/blob/main/src/smolagents/agents.py)). We kept abstractions to their minimal shape above raw code! -🧑‍💻 **First-class support for Code Agents**, i.e. agents that write their actions in code (as opposed to "agents being used to write code"). To make it secure, we support executing in sandboxed environments via [E2B](https://e2b.dev/). - - On top of this [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) class, we still support the standard [`ToolCallingAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.ToolCallingAgent) that writes actions as JSON/text blobs. +🧑‍💻 **First-class support for Code Agents**. Our [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) writes its actions in code (as opposed to "agents being used to write code"). To make it secure, we support executing in sandboxed environments via [E2B](https://e2b.dev/). -🤗 **Hub integrations**: you can share and load Gradio Spaces as tools to/from the Hub, and more is to come! +🤗 **Hub integrations**: you can [share/pull tools to/from the Hub](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_hub), and more is to come! -🌐 **Support for any LLM**: it supports models hosted on the Hub loaded in their `transformers` version or through our inference API, but also supports models from OpenAI, Anthropic and many others via our [LiteLLM](https://www.litellm.ai/) integration. +🌐 **Model-agnostic**: smolagents supports any LLM. 
It can be a local `transformers` or `ollama` model, one of [many providers on the Hub](https://huggingface.co/blog/inference-providers), or any model from OpenAI, Anthropic and many others via our [LiteLLM](https://www.litellm.ai/) integration. + +👁️ **Modality-agnostic**: Agents support text, vision, video, even audio inputs! Cf [this tutorial](https://huggingface.co/docs/smolagents/examples/web_browser) for vision. + +🛠️ **Tool-agnostic**: you can use tools from [LangChain](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_langchain), [Anthropic's MCP](https://huggingface.co/docs/smolagents/reference/tools#smolagents.ToolCollection.from_mcp), you can even use a [Hub Space](https://huggingface.co/docs/smolagents/reference/tools#smolagents.Tool.from_space) as a tool. Full documentation can be found [here](https://huggingface.co/docs/smolagents/index). @@ -56,72 +59,176 @@ Then define your agent, give it the tools it needs and run it! ```py from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel -agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=HfApiModel()) +model = HfApiModel() +agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model) agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") ``` https://github.com/user-attachments/assets/cd0226e2-7479-4102-aea0-57c22ca47884 -## Code agents? +You can even share your agent to hub: +```py +agent.push_to_hub("m-ric/my_agent") -In our `CodeAgent`, the LLM engine writes its actions in code. This approach is demonstrated to work better than the current industry practice of letting the LLM output a dictionary of the tools it wants to calls: [uses 30% fewer steps](https://huggingface.co/papers/2402.01030) (thus 30% fewer LLM calls) -and [reaches higher performance on difficult benchmarks](https://huggingface.co/papers/2411.01747). 
Head to [our high-level intro to agents](https://huggingface.co/docs/smolagents/conceptual_guides/intro_agents) to learn more on that. +# agent.from_hub("m-ric/my_agent") to load an agent from Hub +``` -Especially, since code execution can be a security concern (arbitrary code execution!), we provide options at runtime: - - a secure python interpreter to run code more safely in your environment (more secure than raw code execution but still risky) - - a sandboxed environment using [E2B](https://e2b.dev/) (removes the risk to your own system). +Our library is LLM-agnostic: you could switch the example above to any inference provider. -## How smol is it really? +
+ HfApiModel, gateway for 4 inference providers -We strived to keep abstractions to a strict minimum: the main code in `agents.py` is only ~1,000 lines of code. -Still, we implement several types of agents: `CodeAgent` writes its actions as Python code snippets, and the more classic `ToolCallingAgent` leverages built-in tool calling methods. +```py +from smolagents import HfApiModel -By the way, why use a framework at all? Well, because a big part of this stuff is non-trivial. For instance, the code agent has to keep a consistent format for code throughout its system prompt, its parser, the execution. So our framework handles this complexity for you. But of course we still encourage you to hack into the source code and use only the bits that you need, to the exclusion of everything else! +model = HfApiModel( + model_id="deepseek-ai/DeepSeek-R1", + provider="together", +) +``` +
+
+ LiteLLM to access 100+ LLMs -## How strong are open models for agentic workflows? +```py +from smolagents import LiteLLMModel -We've created [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) instances with some leading models, and compared them on [this benchmark](https://huggingface.co/datasets/m-ric/agents_medium_benchmark_2) that gathers questions from a few different benchmarks to propose a varied blend of challenges. +model = LiteLLMModel( + "anthropic/claude-3-5-sonnet-latest", + temperature=0.2, + api_key=os.environ["ANTHROPIC_API_KEY"] +) +``` +
+
+ OpenAI-compatible servers -[Find the benchmarking code here](https://github.com/huggingface/smolagents/blob/main/examples/benchmark.ipynb) for more detail on the agentic setup used, and see a comparison of using LLMs code agents compared to vanilla (spoilers: code agents works better). +```py +import os +from smolagents import OpenAIServerModel + +model = OpenAIServerModel( + model_id="deepseek-ai/DeepSeek-R1", + api_base="https://api.together.xyz/v1/", # Leave this blank to query OpenAI servers. + api_key=os.environ["TOGETHER_API_KEY"], # Switch to the API key for the server you're targeting. +) +``` +
+
+ Local `transformers` model -

- benchmark of different models on agentic workflows -

+```py +from smolagents import TransformersModel -This comparison shows that open source models can now take on the best closed models! +model = TransformersModel( + model_id="Qwen/Qwen2.5-Coder-32B-Instruct", + max_new_tokens=4096, + device_map="auto" +) +``` +
+
+ Azure models -## Contributing +```py +import os +from smolagents import AzureOpenAIServerModel + +model = AzureOpenAIServerModel( + model_id = os.environ.get("AZURE_OPENAI_MODEL"), + azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"), + api_key=os.environ.get("AZURE_OPENAI_API_KEY"), + api_version=os.environ.get("OPENAI_API_VERSION") +) +``` +
-To contribute, follow our [contribution guide](https://github.com/huggingface/smolagents/blob/main/CONTRIBUTING.md). +## CLI -At any moment, feel welcome to open an issue, citing your exact error traces and package versions if it's a bug. -It's often even better to open a PR with your proposed fixes/changes! +You can run agents from CLI using two commands: `smolagent` and `webagent`. -To install dev dependencies, run: -``` -pip install -e ".[dev]" -``` +`smolagent` is a generalist command to run a multi-step `CodeAgent` that can be equipped with various tools. -When making changes to the codebase, please check that it follows the repo's code quality requirements by running: -To check code quality of the source code: +```bash +smolagent "Plan a trip to Tokyo, Kyoto and Osaka between Mar 28 and Apr 7." --model-type "HfApiModel" --model-id "Qwen/Qwen2.5-Coder-32B-Instruct" --imports "pandas numpy" --tools "web_search" ``` -make quality + +Meanwhile `webagent` is a specific web-browsing agent using [helium](https://github.com/mherrmann/helium) (read more [here](https://github.com/huggingface/smolagents/blob/main/src/smolagents/vision_web_browser.py)). + +For instance: +```bash +webagent "go to xyz.com/men, get to sale section, click the first clothing item you see. Get the product details, and the price, return them. note that I'm shopping from France" --model-type "LiteLLMModel" --model-id "gpt-4o" ``` -If the checks fail, you can run the formatter with: +## How do Code agents work? + +Our [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) works mostly like classical ReAct agents - the exception being that the LLM engine writes its actions as Python code snippets. 
+ +```mermaid +flowchart TB + Task[User Task] + Memory[agent.memory] + Generate[Generate from agent.model] + Execute[Execute Code action - Tool calls are written as functions] + Answer[Return the argument given to 'final_answer'] + + Task -->|Add task to agent.memory| Memory + + subgraph ReAct[ReAct loop] + Memory -->|Memory as chat messages| Generate + Generate -->|Parse output to extract code action| Execute + Execute -->|No call to 'final_answer' tool => Store execution logs in memory and keep running| Memory + end + + Execute -->|Call to 'final_answer' tool| Answer + + %% Styling + classDef default fill:#d4b702,stroke:#8b7701,color:#ffffff + classDef io fill:#4a5568,stroke:#2d3748,color:#ffffff + + class Task,Answer io ``` -make style + +Actions are now Python code snippets. Hence, tool calls will be performed as Python function calls. For instance, here is how the agent can perform web search over several websites in one single action: +```py +requests_to_search = ["gulf of mexico america", "greenland denmark", "tariffs"] +for request in requests_to_search: + print(f"Here are the search results for {request}:", web_search(request)) ``` -And commit the changes. +Writing actions as code snippets is demonstrated to work better than the current industry practice of letting the LLM output a dictionary of the tools it wants to call: [uses 30% fewer steps](https://huggingface.co/papers/2402.01030) (thus 30% fewer LLM calls) and [reaches higher performance on difficult benchmarks](https://huggingface.co/papers/2411.01747). Head to [our high-level intro to agents](https://huggingface.co/docs/smolagents/conceptual_guides/intro_agents) to learn more on that. 
-To run tests locally, run this command: -```bash -make test -``` +Especially, since code execution can be a security concern (arbitrary code execution!), we provide options at runtime: + - a secure python interpreter to run code more safely in your environment (more secure than raw code execution but still risky) + - a sandboxed environment using [E2B](https://e2b.dev/) (removes the risk to your own system). + +On top of this [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) class, we still support the standard [`ToolCallingAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.ToolCallingAgent) that writes actions as JSON/text blobs. But we recommend always using `CodeAgent`. + +## How smol is this library? + +We strived to keep abstractions to a strict minimum: the main code in `agents.py` has <1,000 lines of code. +Still, we implement several types of agents: `CodeAgent` writes its actions as Python code snippets, and the more classic `ToolCallingAgent` leverages built-in tool calling methods. We also have multi-agent hierarchies, import from tool collections, remote code execution, vision models... + +By the way, why use a framework at all? Well, because a big part of this stuff is non-trivial. For instance, the code agent has to keep a consistent format for code throughout its system prompt, its parser, the execution. So our framework handles this complexity for you. But of course we still encourage you to hack into the source code and use only the bits that you need, to the exclusion of everything else! + +## How strong are open models for agentic workflows? + +We've created [`CodeAgent`](https://huggingface.co/docs/smolagents/reference/agents#smolagents.CodeAgent) instances with some leading models, and compared them on [this benchmark](https://huggingface.co/datasets/m-ric/agents_medium_benchmark_2) that gathers questions from a few different benchmarks to propose a varied blend of challenges. 
+ +[Find the benchmarking code here](https://github.com/huggingface/smolagents/blob/main/examples/benchmark.ipynb) for more detail on the agentic setup used, and see a comparison of using LLM code agents compared to vanilla (spoilers: code agents work better). + +

+ benchmark of different models on agentic workflows. Open model DeepSeek-R1 beats closed-source models. +

+ +This comparison shows that open-source models can now take on the best closed models! + +## Contribute + +Everyone is welcome to contribute, get started with our [contribution guide](https://github.com/huggingface/smolagents/blob/main/CONTRIBUTING.md). -## Citing smolagents +## Cite smolagents If you use `smolagents` in your publication, please cite it by using the following BibTeX entry. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 71faa4d92..c1efd31dc 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -14,6 +14,8 @@ title: 🛠️ Tools - in-depth guide - local: tutorials/secure_code_execution title: 🛡️ Secure your code execution with E2B + - local: tutorials/memory + title: 📚 Manage your agent's memory - title: Conceptual guides sections: - local: conceptual_guides/intro_agents @@ -28,9 +30,13 @@ title: Master you knowledge base with agentic RAG - local: examples/multiagents title: Orchestrate a multi-agent system + - local: examples/web_browser + title: Build a web browser agent using vision models - title: Reference sections: - local: reference/agents title: Agent-related objects + - local: reference/models + title: Model-related objects - local: reference/tools title: Tool-related objects diff --git a/docs/source/en/conceptual_guides/intro_agents.md b/docs/source/en/conceptual_guides/intro_agents.mdx similarity index 93% rename from docs/source/en/conceptual_guides/intro_agents.md rename to docs/source/en/conceptual_guides/intro_agents.mdx index ce447ade2..ca5ad31c5 100644 --- a/docs/source/en/conceptual_guides/intro_agents.md +++ b/docs/source/en/conceptual_guides/intro_agents.mdx @@ -68,7 +68,7 @@ If that deterministic workflow fits all queries, by all means just code everythi But what if the workflow can't be determined that well in advance? 
-For instance, a user wants to ask : `"I can come on Monday, but I forgot my passport so risk being delayed to Wednesday, is it possible to take me and my stuff to surf on Tuesday morning, with a cancellation insurance?"` This question hinges on many factors, and probably none of the predetermined criteria above will suffice for this request. +For instance, a user wants to ask: `"I can come on Monday, but I forgot my passport so risk being delayed to Wednesday, is it possible to take me and my stuff to surf on Tuesday morning, with a cancellation insurance?"` This question hinges on many factors, and probably none of the predetermined criteria above will suffice for this request. If the pre-determined workflow falls short too often, that means you need more flexibility. @@ -83,8 +83,8 @@ Until recently, computer programs were restricted to pre-determined workflows, t For some low-level agentic use cases, like chains or routers, you can write all the code yourself. You'll be much better that way, since it will let you control and understand your system better. But once you start going for more complicated behaviours like letting an LLM call a function (that's "tool calling") or letting an LLM run a while loop ("multi-step agent"), some abstractions become necessary: -- for tool calling, you need to parse the agent's output, so this output needs a predefined format like "Thought: I should call tool 'get_weather'. Action: get_weather(Paris).", that you parse with a predefined function, and system prompt given to the LLM should notify it about this format. -- for a multi-step agent where the LLM output determines the loop, you need to give a different prompt to the LLM based on what happened in the last loop iteration: so you need some kind of memory. +- For tool calling, you need to parse the agent's output, so this output needs a predefined format like "Thought: I should call tool 'get_weather'. 
Action: get_weather(Paris).", that you parse with a predefined function, and system prompt given to the LLM should notify it about this format. +- For a multi-step agent where the LLM output determines the loop, you need to give a different prompt to the LLM based on what happened in the last loop iteration: so you need some kind of memory. See? With these two examples, we already found the need for a few items to help us: @@ -106,7 +106,7 @@ In a multi-step agent, at each step, the LLM can write an action, in the form of The reason for this simply that *we crafted our code languages specifically to be the best possible way to express actions performed by a computer*. If JSON snippets were a better expression, JSON would be the top programming language and programming would be hell on earth. -The figure below, taken from [Executable Code Actions Elicit Better LLM Agents](https://huggingface.co/papers/2402.01030), illustrate some advantages of writing actions in code: +The figure below, taken from [Executable Code Actions Elicit Better LLM Agents](https://huggingface.co/papers/2402.01030), illustrates some advantages of writing actions in code: @@ -115,4 +115,4 @@ Writing actions in code rather than JSON-like snippets provides better: - **Composability:** could you nest JSON actions within each other, or define a set of JSON actions to re-use later, the same way you could just define a python function? - **Object management:** how do you store the output of an action like `generate_image` in JSON? - **Generality:** code is built to express simply anything you can have a computer do. -- **Representation in LLM training data:** plenty of quality code actions is already included in LLMs’ training data which means they’re already trained for this! +- **Representation in LLM training data:** plenty of quality code actions are already included in LLMs’ training data which means they’re already trained for this! 
diff --git a/docs/source/en/conceptual_guides/react.md b/docs/source/en/conceptual_guides/react.mdx similarity index 82% rename from docs/source/en/conceptual_guides/react.md rename to docs/source/en/conceptual_guides/react.mdx index 417fb8590..b86c438e2 100644 --- a/docs/source/en/conceptual_guides/react.md +++ b/docs/source/en/conceptual_guides/react.mdx @@ -27,7 +27,7 @@ Initialization: the system prompt is stored in a `SystemPromptStep`, and the use While loop (ReAct loop): -- Use `agent.write_inner_memory_from_logs()` to write the agent logs into a list of LLM-readable [chat messages](https://huggingface.co/docs/transformers/en/chat_templating). +- Use `agent.write_memory_to_messages()` to write the agent logs into a list of LLM-readable [chat messages](https://huggingface.co/docs/transformers/en/chat_templating). - Send these messages to a `Model` object to get its completion. Parse the completion to get the action (a JSON blob for `ToolCallingAgent`, a code snippet for `CodeAgent`). - Execute the action and logs result into memory (an `ActionStep`). - At the end of each step, we run all callback functions defined in `agent.step_callbacks` . @@ -38,11 +38,6 @@ For a `CodeAgent`, it looks like the figure below.
-
@@ -60,14 +55,9 @@ Here is a video overview of how that works: /> -![Framework of a React Agent](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png) - We implement two versions of agents: - [`CodeAgent`] is the preferred type of agent: it generates its tool calls as blobs of code. - [`ToolCallingAgent`] generates tool calls as a JSON in its output, as is commonly done in agentic frameworks. We incorporate this option because it can be useful in some narrow cases where you can do fine with only one tool call per step: for instance, for web browsing, you need to wait after each action on the page to monitor how the page changes. > [!TIP] -> We also provide an option to run agents in one-shot: just pass `single_step=True` when launching the agent, like `agent.run(your_task, single_step=True)` - -> [!TIP] -> Read [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more about multi-step agents. \ No newline at end of file +> Read [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more about multi-step agents. diff --git a/docs/source/en/examples/multiagents.md b/docs/source/en/examples/multiagents.mdx similarity index 87% rename from docs/source/en/examples/multiagents.md rename to docs/source/en/examples/multiagents.mdx index c4bb51413..4f41fe8e6 100644 --- a/docs/source/en/examples/multiagents.md +++ b/docs/source/en/examples/multiagents.mdx @@ -19,7 +19,7 @@ rendered properly in your Markdown viewer. 
In this notebook we will make a **multi-agent web browser: an agentic system with several agents collaborating to solve problems using the web!** -It will be a simple hierarchy, using a `ManagedAgent` object to wrap the managed web search agent: +It will be a simple hierarchy: ``` +----------------+ @@ -28,15 +28,12 @@ It will be a simple hierarchy, using a `ManagedAgent` object to wrap the managed | _______________|______________ | | - Code interpreter +--------------------------------+ - tool | Managed agent | - | +------------------+ | - | | Web Search agent | | - | +------------------+ | - | | | | - | Web Search tool | | - | Visit webpage tool | - +--------------------------------+ +Code Interpreter +------------------+ + tool | Web Search agent | + +------------------+ + | | + Web Search tool | + Visit webpage tool ``` Let's set up this system. @@ -119,7 +116,7 @@ print(visit_webpage("https://en.wikipedia.org/wiki/Hugging_Face")[:500]) Now that we have all the tools `search` and `visit_webpage`, we can use them to create the web agent. Which configuration to choose for this agent? -- Web browsing is a single-timeline task that does not require parallel tool calls, so JSON tool calling works well for that. We thus choose a `JsonAgent`. +- Web browsing is a single-timeline task that does not require parallel tool calls, so JSON tool calling works well for that. We thus choose a `ToolCallingAgent`. - Also, since sometimes web search requires exploring many pages before finding the correct answer, we prefer to increase the number of `max_steps` to 10. ```py @@ -127,7 +124,6 @@ from smolagents import ( CodeAgent, ToolCallingAgent, HfApiModel, - ManagedAgent, DuckDuckGoSearchTool, LiteLLMModel, ) @@ -138,20 +134,14 @@ web_agent = ToolCallingAgent( tools=[DuckDuckGoSearchTool(), visit_webpage], model=model, max_steps=10, -) -``` - -We then wrap this agent into a `ManagedAgent` that will make it callable by its manager agent. 
- -```py -managed_web_agent = ManagedAgent( - agent=web_agent, name="search", description="Runs web searches for you. Give it your query as an argument.", ) ``` -Finally we create a manager agent, and upon initialization we pass our managed agent to it in its `managed_agents` argument. +Note that we gave this agent attributes `name` and `description`, mandatory attributes to make this agent callable by its manager agent. + +Then we create a manager agent, and upon initialization we pass our managed agent to it in its `managed_agents` argument. Since this agent is the one tasked with the planning and thinking, advanced reasoning will be beneficial, so a `CodeAgent` will be the best choice. @@ -161,7 +151,7 @@ Also, we want to ask a question that involves the current year and does addition manager_agent = CodeAgent( tools=[], model=model, - managed_agents=[managed_web_agent], + managed_agents=[web_agent], additional_authorized_imports=["time", "numpy", "pandas"], ) ``` diff --git a/docs/source/en/examples/rag.md b/docs/source/en/examples/rag.md deleted file mode 100644 index 46ae7b785..000000000 --- a/docs/source/en/examples/rag.md +++ /dev/null @@ -1,156 +0,0 @@ - -# Agentic RAG - -[[open-in-colab]] - -Retrieval-Augmented-Generation (RAG) is “using an LLM to answer a user query, but basing the answer on information retrieved from a knowledge base”. It has many advantages over using a vanilla or fine-tuned LLM: to name a few, it allows to ground the answer on true facts and reduce confabulations, it allows to provide the LLM with domain-specific knowledge, and it allows fine-grained control of access to information from the knowledge base. - -But vanilla RAG has limitations, most importantly these two: -- It performs only one retrieval step: if the results are bad, the generation in turn will be bad. 
-- Semantic similarity is computed with the user query as a reference, which might be suboptimal: for instance, the user query will often be a question and the document containing the true answer will be in affirmative voice, so its similarity score will be downgraded compared to other source documents in the interrogative form, leading to a risk of missing the relevant information. - -We can alleviate these problems by making a RAG agent: very simply, an agent armed with a retriever tool! - -This agent will: ✅ Formulate the query itself and ✅ Critique to re-retrieve if needed. - -So it should naively recover some advanced RAG techniques! -- Instead of directly using the user query as the reference in semantic search, the agent formulates itself a reference sentence that can be closer to the targeted documents, as in [HyDE](https://huggingface.co/papers/2212.10496). -The agent can use the generated snippets and re-retrieve if needed, as in [Self-Query](https://docs.llamaindex.ai/en/stable/examples/evaluation/RetryQuery/). - -Let's build this system. 🛠️ - -Run the line below to install required dependencies: -```bash -!pip install smolagents pandas langchain langchain-community sentence-transformers rank_bm25 --upgrade -q -``` -To call the HF Inference API, you will need a valid token as your environment variable `HF_TOKEN`. -We use python-dotenv to load it. -```py -from dotenv import load_dotenv -load_dotenv() -``` - -We first load a knowledge base on which we want to perform RAG: this dataset is a compilation of the documentation pages for many Hugging Face libraries, stored as markdown. We will keep only the documentation for the `transformers` library. - -Then prepare the knowledge base by processing the dataset and storing it into a vector database to be used by the retriever. - -We use [LangChain](https://python.langchain.com/docs/introduction/) for its excellent vector database utilities. 
- -```py -import datasets -from langchain.docstore.document import Document -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.retrievers import BM25Retriever - -knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train") -knowledge_base = knowledge_base.filter(lambda row: row["source"].startswith("huggingface/transformers")) - -source_docs = [ - Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]}) - for doc in knowledge_base -] - -text_splitter = RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=50, - add_start_index=True, - strip_whitespace=True, - separators=["\n\n", "\n", ".", " ", ""], -) -docs_processed = text_splitter.split_documents(source_docs) -``` - -Now the documents are ready. - -So let’s build our agentic RAG system! - -👉 We only need a RetrieverTool that our agent can leverage to retrieve information from the knowledge base. - -Since we need to add a vectordb as an attribute of the tool, we cannot simply use the simple tool constructor with a `@tool` decorator: so we will follow the advanced setup highlighted in the [tools tutorial](../tutorials/tools). - -```py -from smolagents import Tool - -class RetrieverTool(Tool): - name = "retriever" - description = "Uses semantic search to retrieve the parts of transformers documentation that could be most relevant to answer your query." - inputs = { - "query": { - "type": "string", - "description": "The query to perform. This should be semantically close to your target documents. 
Use the affirmative form rather than a question.", - } - } - output_type = "string" - - def __init__(self, docs, **kwargs): - super().__init__(**kwargs) - self.retriever = BM25Retriever.from_documents( - docs, k=10 - ) - - def forward(self, query: str) -> str: - assert isinstance(query, str), "Your search query must be a string" - - docs = self.retriever.invoke( - query, - ) - return "\nRetrieved documents:\n" + "".join( - [ - f"\n\n===== Document {str(i)} =====\n" + doc.page_content - for i, doc in enumerate(docs) - ] - ) - -retriever_tool = RetrieverTool(docs_processed) -``` -We have used BM25, a classic retrieval method, because it's lightning fast to setup. -To improve retrieval accuracy, you could use replace BM25 with semantic search using vector representations for documents: thus you can head to the [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard) to select a good embedding model. - -Now it’s straightforward to create an agent that leverages this `retriever_tool`! - -The agent will need these arguments upon initialization: -- `tools`: a list of tools that the agent will be able to call. -- `model`: the LLM that powers the agent. -Our `model` must be a callable that takes as input a list of messages and returns text. It also needs to accept a stop_sequences argument that indicates when to stop its generation. For convenience, we directly use the HfEngine class provided in the package to get a LLM engine that calls Hugging Face's Inference API. - -And we use [meta-llama/Llama-3.3-70B-Instruct](meta-llama/Llama-3.3-70B-Instruct) as the llm engine because: -- It has a long 128k context, which is helpful for processing long source documents -- It is served for free at all times on HF's Inference API! - -_Note:_ The Inference API hosts models based on various criteria, and deployed models may be updated or replaced without prior notice. Learn more about it [here](https://huggingface.co/docs/api-inference/supported-models). 
- -```py -from smolagents import HfApiModel, CodeAgent - -agent = CodeAgent( - tools=[retriever_tool], model=HfApiModel("meta-llama/Llama-3.3-70B-Instruct"), max_steps=4, verbosity_level=2 -) -``` - -Upon initializing the CodeAgent, it has been automatically given a default system prompt that tells the LLM engine to process step-by-step and generate tool calls as code snippets, but you could replace this prompt template with your own as needed. - -Then when its `.run()` method is launched, the agent takes care of calling the LLM engine, and executing the tool calls, all in a loop that ends only when tool `final_answer` is called with the final answer as its argument. - -```py -agent_output = agent.run("For a transformers model training, which is slower, the forward or the backward pass?") - -print("Final output:") -print(agent_output) -``` - - - diff --git a/docs/source/zh/examples/rag.md b/docs/source/en/examples/rag.mdx similarity index 91% rename from docs/source/zh/examples/rag.md rename to docs/source/en/examples/rag.mdx index acbdf14f6..eb1c4c27f 100644 --- a/docs/source/zh/examples/rag.md +++ b/docs/source/en/examples/rag.mdx @@ -35,7 +35,7 @@ Let's build this system. 🛠️ Run the line below to install required dependencies: ```bash -!pip install smolagents pandas langchain langchain-community sentence-transformers rank_bm25 --upgrade -q +!pip install smolagents pandas langchain langchain-community sentence-transformers datasets python-dotenv rank_bm25 --upgrade -q ``` To call the HF Inference API, you will need a valid token as your environment variable `HF_TOKEN`. We use python-dotenv to load it. @@ -127,20 +127,15 @@ The agent will need these arguments upon initialization: - `model`: the LLM that powers the agent. Our `model` must be a callable that takes as input a list of messages and returns text. It also needs to accept a stop_sequences argument that indicates when to stop its generation. 
For convenience, we directly use the HfEngine class provided in the package to get a LLM engine that calls Hugging Face's Inference API. -And we use [meta-llama/Llama-3.3-70B-Instruct](meta-llama/Llama-3.3-70B-Instruct) as the llm engine because: -- It has a long 128k context, which is helpful for processing long source documents -- It is served for free at all times on HF's Inference API! - -_Note:_ The Inference API hosts models based on various criteria, and deployed models may be updated or replaced without prior notice. Learn more about it [here](https://huggingface.co/docs/api-inference/supported-models). +> [!NOTE] To use a specific model, pass it like this: `HfApiModel("meta-llama/Llama-3.3-70B-Instruct")`. The Inference API hosts models based on various criteria, and deployed models may be updated or replaced without prior notice. Learn more about it [here](https://huggingface.co/docs/api-inference/supported-models). ```py from smolagents import HfApiModel, CodeAgent agent = CodeAgent( - tools=[retriever_tool], model=HfApiModel("meta-llama/Llama-3.3-70B-Instruct"), max_steps=4, verbose=True + tools=[retriever_tool], model=HfApiModel(), max_steps=4, verbosity_level=2 ) ``` - Upon initializing the CodeAgent, it has been automatically given a default system prompt that tells the LLM engine to process step-by-step and generate tool calls as code snippets, but you could replace this prompt template with your own as needed. Then when its `.run()` method is launched, the agent takes care of calling the LLM engine, and executing the tool calls, all in a loop that ends only when tool `final_answer` is called with the final answer as its argument.
diff --git a/docs/source/en/examples/text_to_sql.md b/docs/source/en/examples/text_to_sql.mdx similarity index 91% rename from docs/source/en/examples/text_to_sql.md rename to docs/source/en/examples/text_to_sql.mdx index 86c5091f4..600d8d95c 100644 --- a/docs/source/en/examples/text_to_sql.md +++ b/docs/source/en/examples/text_to_sql.mdx @@ -27,7 +27,18 @@ A standard text-to-sql pipeline is brittle, since the generated SQL query can be Let’s build this agent! 💪 -First, we setup the SQL environment: +Run the line below to install required dependencies: +```bash +!pip install smolagents python-dotenv sqlalchemy --upgrade -q +``` +To call the HF Inference API, you will need a valid token as your environment variable `HF_TOKEN`. +We use python-dotenv to load it. +```py +from dotenv import load_dotenv +load_dotenv() +``` + +Then, we setup the SQL environment: ```py from sqlalchemy import ( create_engine, @@ -45,7 +56,12 @@ from sqlalchemy import ( engine = create_engine("sqlite:///:memory:") metadata_obj = MetaData() -# create city SQL table +def insert_rows_into_table(rows, table, engine=engine): + for row in rows: + stmt = insert(table).values(**row) + with engine.begin() as connection: + connection.execute(stmt) + table_name = "receipts" receipts = Table( table_name, @@ -63,10 +79,7 @@ rows = [ {"receipt_id": 3, "customer_name": "Woodrow Wilson", "price": 53.43, "tip": 5.43}, {"receipt_id": 4, "customer_name": "Margaret James", "price": 21.11, "tip": 1.00}, ] -for row in rows: - stmt = insert(receipts).values(**row) - with engine.begin() as connection: - cursor = connection.execute(stmt) +insert_rows_into_table(rows, receipts) ``` ### Build our agent @@ -144,7 +157,7 @@ So let’s make a second table recording the names of waiters for each receipt_i ```py table_name = "waiters" -receipts = Table( +waiters = Table( table_name, metadata_obj, Column("receipt_id", Integer, primary_key=True), @@ -158,10 +171,7 @@ rows = [ {"receipt_id": 3, "waiter_name": "Michael Watts"}, 
{"receipt_id": 4, "waiter_name": "Margaret James"}, ] -for row in rows: - stmt = insert(receipts).values(**row) - with engine.begin() as connection: - cursor = connection.execute(stmt) +insert_rows_into_table(rows, waiters) ``` Since we changed the table, we update the `SQLExecutorTool` with this table’s description to let the LLM properly leverage information from this table. diff --git a/docs/source/en/examples/web_browser.mdx b/docs/source/en/examples/web_browser.mdx new file mode 100644 index 000000000..fe2fc67de --- /dev/null +++ b/docs/source/en/examples/web_browser.mdx @@ -0,0 +1,213 @@ +# Web Browser Automation with Agents 🤖🌐 + +[[open-in-colab]] + +In this notebook, we'll create an **agent-powered web browser automation system**! This system can navigate websites, interact with elements, and extract information automatically. + +The agent will be able to: + +- [x] Navigate to web pages +- [x] Click on elements +- [x] Search within pages +- [x] Handle popups and modals +- [x] Extract information + +Let's set up this system step by step! + +First, run these lines to install the required dependencies: + +```bash +pip install smolagents selenium helium pillow -q +``` + +Let's import our required libraries and set up environment variables: + +```python +from io import BytesIO +from time import sleep + +import helium +from dotenv import load_dotenv +from PIL import Image +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys + +from smolagents import CodeAgent, tool +from smolagents.agents import ActionStep + +# Load environment variables +load_dotenv() +``` + +Now let's create our core browser interaction tools that will allow our agent to navigate and interact with web pages: + +```python +@tool +def search_item_ctrl_f(text: str, nth_result: int = 1) -> str: + """ + Searches for text on the current page via Ctrl + F and jumps to the nth occurrence. 
+ Args: + text: The text to search for + nth_result: Which occurrence to jump to (default: 1) + """ + elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]") + if nth_result > len(elements): + raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)") + result = f"Found {len(elements)} matches for '{text}'." + elem = elements[nth_result - 1] + driver.execute_script("arguments[0].scrollIntoView(true);", elem) + result += f"Focused on element {nth_result} of {len(elements)}" + return result + +@tool +def go_back() -> None: + """Goes back to previous page.""" + driver.back() + +@tool +def close_popups() -> str: + """ + Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! + This does not work on cookie consent banners. + """ + webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform() +``` + +Let's set up our browser with Chrome and configure screenshot capabilities: + +```python +# Configure Chrome options +chrome_options = webdriver.ChromeOptions() +chrome_options.add_argument("--force-device-scale-factor=1") +chrome_options.add_argument("--window-size=1000,1350") +chrome_options.add_argument("--disable-pdf-viewer") +chrome_options.add_argument("--window-position=0,0") + +# Initialize the browser +driver = helium.start_chrome(headless=False, options=chrome_options) + +# Set up screenshot callback +def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None: + sleep(1.0) # Let JavaScript animations happen before taking the screenshot + driver = helium.get_driver() + current_step = memory_step.step_number + if driver is not None: + for previous_memory_step in agent.memory.steps: # Remove previous screenshots for lean processing + if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2: + previous_memory_step.observations_images = None + png_bytes = driver.get_screenshot_as_png() + image = Image.open(BytesIO(png_bytes)) + 
print(f"Captured a browser screenshot: {image.size} pixels") + memory_step.observations_images = [image.copy()] # Create a copy to ensure it persists + + # Update observations with current URL + url_info = f"Current url: {driver.current_url}" + memory_step.observations = ( + url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info + ) +``` + +Now let's create our web automation agent: + +```python +from smolagents import HfApiModel + +# Initialize the model +model_id = "meta-llama/Llama-3.3-70B-Instruct" # You can change this to your preferred model +model = HfApiModel(model_id) + +# Create the agent +agent = CodeAgent( + tools=[go_back, close_popups, search_item_ctrl_f], + model=model, + additional_authorized_imports=["helium"], + step_callbacks=[save_screenshot], + max_steps=20, + verbosity_level=2, +) + +# Import helium for the agent +agent.python_executor("from helium import *", agent.state) +``` + +The agent needs instructions on how to use Helium for web automation. Here are the instructions we'll provide: + +```python +helium_instructions = """ +You can use helium to access websites. Don't bother about the helium driver, it's already managed. +We've already run "from helium import *" +Then you can go to pages! +Code: +```py +go_to('github.com/trending') +``` + +You can directly click clickable elements by inputting the text that appears on them. +Code: +```py +click("Top products") +``` + +If it's a link: +Code: +```py +click(Link("Top products")) +``` + +If you try to interact with an element and it's not found, you'll get a LookupError. +In general stop your action after each button click to see what happens on your screenshot. +Never try to log in to a page. + +To scroll up or down, use scroll_down or scroll_up with the number of pixels to scroll as an argument.
+Code: +```py +scroll_down(num_pixels=1200) # This will scroll one viewport down +``` + +When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails). +Just use your built-in tool `close_popups` to close them: +Code: +```py +close_popups() +``` + +You can use .exists() to check for the existence of an element. For example: +Code: +```py +if Text('Accept cookies?').exists(): + click('I accept') +``` +""" +``` + +Now we can run our agent with a task! Let's try finding information on Wikipedia: + +```python +search_request = """ +Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident. +""" + +agent_output = agent.run(search_request + helium_instructions) +print("Final output:") +print(agent_output) +``` + +You can run different tasks by modifying the request. For example, here's one to help me know if I should work harder: + +```python +github_request = """ +I'm trying to find how hard I have to work to get a repo in github.com/trending. +Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
+""" + +agent_output = agent.run(github_request + helium_instructions) +print("Final output:") +print(agent_output) +``` + +The system is particularly effective for tasks like: +- Data extraction from websites +- Web research automation +- UI testing and verification +- Content monitoring \ No newline at end of file diff --git a/docs/source/en/guided_tour.md b/docs/source/en/guided_tour.mdx similarity index 87% rename from docs/source/en/guided_tour.md rename to docs/source/en/guided_tour.mdx index aebb4e23e..5eca7fc21 100644 --- a/docs/source/en/guided_tour.md +++ b/docs/source/en/guided_tour.mdx @@ -25,18 +25,19 @@ To initialize a minimal agent, you need at least these two arguments: - `model`, a text-generation model to power your agent - because the agent is different from a simple LLM, it is a system that uses a LLM as its engine. You can use any of these options: - [`TransformersModel`] takes a pre-initialized `transformers` pipeline to run inference on your local machine using `transformers`. - - [`HfApiModel`] leverages a `huggingface_hub.InferenceClient` under the hood. - - [`LiteLLMModel`] lets you call 100+ different models through [LiteLLM](https://docs.litellm.ai/)! + - [`HfApiModel`] leverages a `huggingface_hub.InferenceClient` under the hood and supports all Inference Providers on the Hub. + - [`LiteLLMModel`] similarly lets you call 100+ different models and providers through [LiteLLM](https://docs.litellm.ai/)! - [`AzureOpenAIServerModel`] allows you to use OpenAI models deployed in [Azure](https://azure.microsoft.com/en-us/products/ai-services/openai-service). + - [`MLXModel`] creates a [mlx-lm](https://pypi.org/project/mlx-lm/) pipeline to run inference on your local machine. - `tools`, a list of `Tools` that the agent can use to solve the task. It can be an empty list. You can also add the default toolbox on top of your `tools` list by defining the optional argument `add_base_tools=True`. 
-Once you have these two arguments, `tools` and `model`, you can create an agent and run it. You can use any LLM you'd like, either through [Hugging Face API](https://huggingface.co/docs/api-inference/en/index), [transformers](https://github.com/huggingface/transformers/), [ollama](https://ollama.com/), [LiteLLM](https://www.litellm.ai/), or [Azure OpenAI](https://azure.microsoft.com/en-us/products/ai-services/openai-service). +Once you have these two arguments, `tools` and `model`, you can create an agent and run it. You can use any LLM you'd like, either through [Inference Providers](https://huggingface.co/blog/inference-providers), [transformers](https://github.com/huggingface/transformers/), [ollama](https://ollama.com/), [LiteLLM](https://www.litellm.ai/), [Azure OpenAI](https://azure.microsoft.com/en-us/products/ai-services/openai-service), or [mlx-lm](https://pypi.org/project/mlx-lm/). - + -Hugging Face API is free to use without a token, but then it will have a rate limitation. +HF Inference API is free to use without a token, but then it will have a rate limit. To access gated models or rise your rate limits with a PRO account, you need to set the environment variable `HF_TOKEN` or pass `token` variable upon initialization of `HfApiModel`. You can get your token from your [settings page](https://huggingface.co/settings/tokens) @@ -46,6 +47,7 @@ from smolagents import CodeAgent, HfApiModel model_id = "meta-llama/Llama-3.3-70B-Instruct" model = HfApiModel(model_id=model_id, token="") # You can choose to not pass any model_id to HfApiModel to use a default free model +# you can also specify a particular provider e.g. 
provider="together" or provider="sambanova" agent = CodeAgent(tools=[], model=model, add_base_tools=True) agent.run( @@ -94,8 +96,8 @@ from smolagents import CodeAgent, LiteLLMModel model = LiteLLMModel( model_id="ollama_chat/llama3.2", # This model is a bit weak for agentic behaviours though api_base="http://localhost:11434", # replace with 127.0.0.1:11434 or remote open-ai compatible server if necessary - api_key="YOUR_API_KEY" # replace with API key if necessary - num_ctx=8192 # ollama default is 2048 which will fail horribly. 8192 works for easy tasks, more is better. Check https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator to calculate how much VRAM this will need for the selected model. + api_key="YOUR_API_KEY", # replace with API key if necessary + num_ctx=8192, # ollama default is 2048 which will fail horribly. 8192 works for easy tasks, more is better. Check https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator to calculate how much VRAM this will need for the selected model. ) agent = CodeAgent(tools=[], model=model, add_base_tools=True) @@ -147,6 +149,19 @@ agent.run( ) ``` + + + +```python +# !pip install smolagents[mlx-lm] +from smolagents import CodeAgent, MLXModel + +mlx_model = MLXModel("mlx-community/Qwen2.5-Coder-32B-Instruct-4bit") +agent = CodeAgent(model=mlx_model, tools=[], add_base_tools=True) + +agent.run("Could you give me the 118th number in the Fibonacci sequence?") +``` + @@ -189,7 +204,7 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co Here are a few useful attributes to inspect what happened after a run: - `agent.logs` stores the fine-grained logs of the agent. At every step of the agent's run, everything gets stored in a dictionary that then is appended to `agent.logs`. -- Running `agent.write_inner_memory_from_logs()` creates an inner memory of the agent's logs for the LLM to view, as a list of chat messages. 
This method goes over each step of the log and only stores what it's interested in as a message: for instance, it will save the system prompt and task in separate messages, then for each step it will store the LLM output as a message, and the tool call output as another message. Use this if you want a higher-level view of what has happened - but not every log will be transcripted by this method. +- Running `agent.write_memory_to_messages()` writes the agent's memory as list of chat messages for the Model to view. This method goes over each step of the log and only stores what it's interested in as a message: for instance, it will save the system prompt and task in separate messages, then for each step it will store the LLM output as a message, and the tool call output as another message. Use this if you want a higher-level view of what has happened - but not every log will be transcripted by this method. ## Tools @@ -205,7 +220,7 @@ When the agent is initialized, the tool attributes are used to generate a tool d ### Default toolbox -Transformers comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools = True`: +`smolagents` comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools = True`: - **DuckDuckGo web search***: performs a web search using DuckDuckGo browser. - **Python code interpreter**: runs your LLM generated Python code in a secure environment. This tool will only be added to [`ToolCallingAgent`] if you initialize it with `add_base_tools=True`, since code-based agent can already natively execute Python code @@ -343,25 +358,25 @@ It empirically yields better performance on most benchmarks. The reason for this You can easily build hierarchical multi-agent systems with `smolagents`. -To do so, encapsulate the agent in a [`ManagedAgent`] object. 
This object needs arguments `agent`, `name`, and a `description`, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools. +To do so, just ensure your agent has `name` and `description` attributes, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools. +Then you can pass this managed agent in the parameter managed_agents upon initialization of the manager agent. Here's an example of making an agent that managed a specific web search agent using our [`DuckDuckGoSearchTool`]: ```py -from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool, ManagedAgent +from smolagents import CodeAgent, HfApiModel, DuckDuckGoSearchTool model = HfApiModel() -web_agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model) - -managed_web_agent = ManagedAgent( - agent=web_agent, +web_agent = CodeAgent( + tools=[DuckDuckGoSearchTool()], + model=model, name="web_search", description="Runs web searches for you. Give it your query as an argument." ) manager_agent = CodeAgent( - tools=[], model=model, managed_agents=[managed_web_agent] + tools=[], model=model, managed_agents=[web_agent] ) manager_agent.run("Who is the CEO of Hugging Face?") @@ -401,6 +416,17 @@ You can also use this `reset=False` argument to keep the conversation going in a ## Next steps +Finally, when you've configured your agent to your needs, you can share it to the Hub! + +```py +agent.push_to_hub("m-ric/my_agent") +``` + +Similarly, to load an agent that has been pushed to hub, if you trust the code from its tools, use: +```py +agent.from_hub("m-ric/my_agent", trust_remote_code=True) +``` + For more in-depth usage, you will then want to check out our tutorials: - [the explanation of how our code agents work](./tutorials/secure_code_execution) - [this guide on how to build good agents](./tutorials/building_good_agents).
diff --git a/docs/source/en/index.md b/docs/source/en/index.mdx similarity index 96% rename from docs/source/en/index.md rename to docs/source/en/index.mdx index 90f5c7845..14f80ff5b 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.mdx @@ -25,7 +25,7 @@ This library offers: ✨ **Simplicity**: the logic for agents fits in ~thousand lines of code. We kept abstractions to their minimal shape above raw code! -🌐 **Support for any LLM**: it supports models hosted on the Hub loaded in their `transformers` version or through our inference API, but also models from OpenAI, Anthropic... it's really easy to power an agent with any LLM. +🌐 **Support for any LLM**: it supports models hosted on the Hub loaded in their `transformers` version or through our inference API and Inference providers, but also models from OpenAI, Anthropic... it's really easy to power an agent with any LLM. 🧑‍💻 **First-class support for Code Agents**, i.e. agents that write their actions in code (as opposed to "agents being used to write code"), [read more here](tutorials/secure_code_execution). diff --git a/docs/source/en/reference/agents.mdx b/docs/source/en/reference/agents.mdx new file mode 100644 index 000000000..a6f57183e --- /dev/null +++ b/docs/source/en/reference/agents.mdx @@ -0,0 +1,69 @@ + +# Agents + + + +Smolagents is an experimental API which is subject to change at any time. Results returned by the agents +can vary as the APIs or underlying models are prone to change. + + + +To learn more about agents and tools make sure to read the [introductory guide](../index). This page +contains the API docs for the underlying classes. + +## Agents + +Our agents inherit from [`MultiStepAgent`], which means they can act in multiple steps, each step consisting of one thought, then one tool call and execution. Read more in [this conceptual guide](../conceptual_guides/react). + +We provide two types of agents, based on the main [`Agent`] class. 
+ - [`CodeAgent`] is the default agent, it writes its tool calls in Python code. + - [`ToolCallingAgent`] writes its tool calls in JSON. + +Both require arguments `model` and list of tools `tools` at initialization. + +### Classes of agents + +[[autodoc]] MultiStepAgent + +[[autodoc]] CodeAgent + +[[autodoc]] ToolCallingAgent + +### ManagedAgent + +_This class is deprecated since 1.8.0: now you simply need to pass attributes `name` and `description` to a normal agent to make it callable by a manager agent._ + +### stream_to_gradio + +[[autodoc]] stream_to_gradio + +### GradioUI + +> [!TIP] +> You must have `gradio` installed to use the UI. Please run `pip install smolagents[gradio]` if it's not the case. + +[[autodoc]] GradioUI + +## Prompts + +[[autodoc]] smolagents.agents.PromptTemplates + +[[autodoc]] smolagents.agents.PlanningPromptTemplate + +[[autodoc]] smolagents.agents.ManagedAgentPromptTemplate + +[[autodoc]] smolagents.agents.FinalAnswerPromptTemplate diff --git a/docs/source/en/reference/agents.md b/docs/source/en/reference/models.mdx similarity index 78% rename from docs/source/en/reference/agents.md rename to docs/source/en/reference/models.mdx index 77a0df176..2a7f8f45d 100644 --- a/docs/source/en/reference/agents.md +++ b/docs/source/en/reference/models.mdx @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. rendered properly in your Markdown viewer. --> -# Agents +# Models @@ -25,39 +25,6 @@ can vary as the APIs or underlying models are prone to change. To learn more about agents and tools make sure to read the [introductory guide](../index). This page contains the API docs for the underlying classes. -## Agents - -Our agents inherit from [`MultiStepAgent`], which means they can act in multiple steps, each step consisting of one thought, then one tool call and execution. Read more in [this conceptual guide](../conceptual_guides/react). - -We provide two types of agents, based on the main [`Agent`] class. 
- - [`CodeAgent`] is the default agent, it writes its tool calls in Python code. - - [`ToolCallingAgent`] writes its tool calls in JSON. - -Both require arguments `model` and list of tools `tools` at initialization. - -### Classes of agents - -[[autodoc]] MultiStepAgent - -[[autodoc]] CodeAgent - -[[autodoc]] ToolCallingAgent - -### ManagedAgent - -[[autodoc]] ManagedAgent - -### stream_to_gradio - -[[autodoc]] stream_to_gradio - -### GradioUI - -> [!TIP] -> You must have `gradio` installed to use the UI. Please run `pip install smolagents[gradio]` if it's not the case. - -[[autodoc]] GradioUI - ## Models You're free to create and use your own models to power your agent. @@ -94,7 +61,7 @@ from smolagents import TransformersModel model = TransformersModel(model_id="HuggingFaceTB/SmolLM-135M-Instruct") -print(model([{"role": "user", "content": "Ok!"}], stop_sequences=["great"])) +print(model([{"role": "user", "content": [{"type": "text", "text": "Ok!"}]}], stop_sequences=["great"])) ``` ```text >>> What a @@ -107,15 +74,13 @@ print(model([{"role": "user", "content": "Ok!"}], stop_sequences=["great"])) ### HfApiModel -The `HfApiModel` wraps an [HF Inference API](https://huggingface.co/docs/api-inference/index) client for the execution of the LLM. +The `HfApiModel` wraps huggingface_hub's [InferenceClient](https://huggingface.co/docs/huggingface_hub/main/en/guides/inference) for the execution of the LLM. It supports both HF's own [Inference API](https://huggingface.co/docs/api-inference/index) as well as all [Inference Providers](https://huggingface.co/blog/inference-providers) available on the Hub. ```python from smolagents import HfApiModel messages = [ - {"role": "user", "content": "Hello, how are you?"}, - {"role": "assistant", "content": "I'm doing great. 
How can I help you today?"}, - {"role": "user", "content": "No need to help, take it easy."}, + {"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]} ] model = HfApiModel() @@ -135,9 +100,7 @@ You can pass kwargs upon model initialization that will then be used whenever us from smolagents import LiteLLMModel messages = [ - {"role": "user", "content": "Hello, how are you?"}, - {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, - {"role": "user", "content": "No need to help, take it easy."}, + {"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]} ] model = LiteLLMModel("anthropic/claude-3-5-sonnet-latest", temperature=0.2, max_tokens=10) @@ -151,6 +114,7 @@ print(model(messages)) This class lets you call any OpenAIServer compatible model. Here's how you can set it (you can customise the `api_base` url to point to another server): ```py +import os from smolagents import OpenAIServerModel model = OpenAIServerModel( @@ -183,4 +147,23 @@ model = AzureOpenAIServerModel( ) ``` -[[autodoc]] AzureOpenAIServerModel \ No newline at end of file +[[autodoc]] AzureOpenAIServerModel + +### MLXModel + + +```python +from smolagents import MLXModel + +model = MLXModel(model_id="HuggingFaceTB/SmolLM-135M-Instruct") + +print(model([{"role": "user", "content": "Ok!"}], stop_sequences=["great"])) +``` +```text +>>> What a +``` + +> [!TIP] +> You must have `mlx-lm` installed on your machine. Please run `pip install smolagents[mlx-lm]` if it's not the case. + +[[autodoc]] MLXModel diff --git a/docs/source/en/reference/tools.md b/docs/source/en/reference/tools.mdx similarity index 89% rename from docs/source/en/reference/tools.md rename to docs/source/en/reference/tools.mdx index 9d787740c..68c70b897 100644 --- a/docs/source/en/reference/tools.md +++ b/docs/source/en/reference/tools.mdx @@ -49,17 +49,29 @@ contains the API docs for the underlying classes. 
[[autodoc]] PythonInterpreterTool +### FinalAnswerTool + +[[autodoc]] FinalAnswerTool + +### UserInputTool + +[[autodoc]] UserInputTool + ### DuckDuckGoSearchTool [[autodoc]] DuckDuckGoSearchTool +### GoogleSearchTool + +[[autodoc]] GoogleSearchTool + ### VisitWebpageTool [[autodoc]] VisitWebpageTool -### UserInputTool +### SpeechToTextTool -[[autodoc]] UserInputTool +[[autodoc]] SpeechToTextTool ## ToolCollection @@ -84,12 +96,12 @@ These types have three specific purposes: ### AgentText -[[autodoc]] smolagents.types.AgentText +[[autodoc]] smolagents.agent_types.AgentText ### AgentImage -[[autodoc]] smolagents.types.AgentImage +[[autodoc]] smolagents.agent_types.AgentImage ### AgentAudio -[[autodoc]] smolagents.types.AgentAudio +[[autodoc]] smolagents.agent_types.AgentAudio diff --git a/docs/source/en/tutorials/building_good_agents.md b/docs/source/en/tutorials/building_good_agents.mdx similarity index 98% rename from docs/source/en/tutorials/building_good_agents.md rename to docs/source/en/tutorials/building_good_agents.mdx index bc5165605..8c17de1af 100644 --- a/docs/source/en/tutorials/building_good_agents.md +++ b/docs/source/en/tutorials/building_good_agents.mdx @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. [[open-in-colab]] There's a world of difference between building an agent that works and one that doesn't. -How can we build agents that fall into the latter category? +How can we build agents that fall into the former category? In this guide, we're going to talk about best practices for building agents. > [!TIP] @@ -194,7 +194,7 @@ If above clarifications are not sufficient, you can change the system prompt. Let's see how it works. For example, let us check the default system prompt for the [`CodeAgent`] (below version is shortened by skipping zero-shot examples). 
```python -print(agent.system_prompt_template) +print(agent.prompt_templates["system_prompt"]) ``` Here is what you get: ```text @@ -243,15 +243,7 @@ So while you can overwrite this system prompt template by passing your custom pr Then you can change the system prompt as follows: ```py -from smolagents.prompts import CODE_SYSTEM_PROMPT - -modified_system_prompt = CODE_SYSTEM_PROMPT + "\nHere you go!" # Change the system prompt here - -agent = CodeAgent( - tools=[], - model=HfApiModel(), - system_prompt=modified_system_prompt -) +agent.prompt_templates["system_prompt"] = agent.prompt_templates["system_prompt"] + "\nHere you go!" ``` This also works with the [`ToolCallingAgent`]. diff --git a/docs/source/en/tutorials/inspect_runs.md b/docs/source/en/tutorials/inspect_runs.md deleted file mode 100644 index 1fef9be55..000000000 --- a/docs/source/en/tutorials/inspect_runs.md +++ /dev/null @@ -1,112 +0,0 @@ - -# Inspecting runs with OpenTelemetry - -[[open-in-colab]] - -> [!TIP] -> If you're new to building agents, make sure to first read the [intro to agents](../conceptual_guides/intro_agents) and the [guided tour of smolagents](../guided_tour). - -### Why log your agent runs? - -Agent runs are complicated to debug. - -Validating that a run went properly is hard, since agent workflows are [unpredictable by design](../conceptual_guides/intro_agents) (if they were predictable, you'd just be using good old code). - -And inspecting a run is hard as well: multi-step agents tend to quickly fill a console with logs, and most of the errors are just "LLM dumb" kind of errors, from which the LLM auto-corrects in the next step by writing better code or tool calls. - -So using instrumentation to record agent runs is necessary in production for later inspection and monitoring! - -We've adopted the [OpenTelemetry](https://opentelemetry.io/) standard for instrumenting agent runs. 
- -This means that you can just run some instrumentation code, then run your agents normally, and everything gets logged into your platform. - -Here's how it then looks like on the platform: - -
- -
- - -### Setting up telemetry with Arize AI Phoenix -First install the required packages. Here we install [Phoenix by Arize AI](https://github.com/Arize-ai/phoenix) because that's a good solution to collect and inspect the logs, but there are other OpenTelemetry-compatible platforms that you could use for this collection & inspection part. - -```shell -pip install smolagents -pip install arize-phoenix opentelemetry-sdk opentelemetry-exporter-otlp openinference-instrumentation-smolagents -``` - -Then run the collector in the background. - -```shell -python -m phoenix.server.main serve -``` - -Finally, set up `SmolagentsInstrumentor` to trace your agents and send the traces to Phoenix at the endpoint defined below. - -```python -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor - -from openinference.instrumentation.smolagents import SmolagentsInstrumentor -from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor - -endpoint = "http://0.0.0.0:6006/v1/traces" -trace_provider = TracerProvider() -trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter(endpoint))) - -SmolagentsInstrumentor().instrument(tracer_provider=trace_provider) -``` -Then you can run your agents! 
- -```py -from smolagents import ( - CodeAgent, - ToolCallingAgent, - ManagedAgent, - DuckDuckGoSearchTool, - VisitWebpageTool, - HfApiModel, -) - -model = HfApiModel() - -agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], - model=model, -) -managed_agent = ManagedAgent( - agent=agent, - name="managed_agent", - description="This is an agent that can do web search.", -) -manager_agent = CodeAgent( - tools=[], - model=model, - managed_agents=[managed_agent], -) -manager_agent.run( - "If the US keeps its 2024 growth rate, how many years will it take for the GDP to double?" -) -``` -Voilà! -You can then navigate to `http://0.0.0.0:6006/projects/` to inspect your run! - - - -You can see that the CodeAgent called its managed ToolCallingAgent (by the way, the managed agent could be have been a CodeAgent as well) to ask it to run the web search for the U.S. 2024 growth rate. Then the managed agent returned its report and the manager agent acted upon it to calculate the economy doubling time! Sweet, isn't it? \ No newline at end of file diff --git a/docs/source/en/tutorials/inspect_runs.mdx b/docs/source/en/tutorials/inspect_runs.mdx new file mode 100644 index 000000000..4ade8427b --- /dev/null +++ b/docs/source/en/tutorials/inspect_runs.mdx @@ -0,0 +1,193 @@ + +# Inspecting runs with OpenTelemetry + +[[open-in-colab]] + +> [!TIP] +> If you're new to building agents, make sure to first read the [intro to agents](../conceptual_guides/intro_agents) and the [guided tour of smolagents](../guided_tour). + +## Why log your agent runs? + +Agent runs are complicated to debug. + +Validating that a run went properly is hard, since agent workflows are [unpredictable by design](../conceptual_guides/intro_agents) (if they were predictable, you'd just be using good old code). 
+ +And inspecting a run is hard as well: multi-step agents tend to quickly fill a console with logs, and most of the errors are just "LLM dumb" kind of errors, from which the LLM auto-corrects in the next step by writing better code or tool calls. + +So using instrumentation to record agent runs is necessary in production for later inspection and monitoring! + +We've adopted the [OpenTelemetry](https://opentelemetry.io/) standard for instrumenting agent runs. + +This means that you can just run some instrumentation code, then run your agents normally, and everything gets logged into your platform. Below are some examples of how to do this with different OpenTelemetry backends. + +Here's how it then looks like on the platform: + +
+ +
+ + +## Setting up telemetry with Arize AI Phoenix +First install the required packages. Here we install [Phoenix by Arize AI](https://github.com/Arize-ai/phoenix) because that's a good solution to collect and inspect the logs, but there are other OpenTelemetry-compatible platforms that you could use for this collection & inspection part. + +```shell +pip install 'smolagents[telemetry]' +``` + +Then run the collector in the background. + +```shell +python -m phoenix.server.main serve +``` + +Finally, set up `SmolagentsInstrumentor` to trace your agents and send the traces to Phoenix default endpoint. + +```python +from phoenix.otel import register +from openinference.instrumentation.smolagents import SmolagentsInstrumentor + +register() +SmolagentsInstrumentor().instrument() +``` +Then you can run your agents! + +```py +from smolagents import ( + CodeAgent, + ToolCallingAgent, + DuckDuckGoSearchTool, + VisitWebpageTool, + HfApiModel, +) + +model = HfApiModel() + +search_agent = ToolCallingAgent( + tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], + model=model, + name="search_agent", + description="This is an agent that can do web search.", +) + +manager_agent = CodeAgent( + tools=[], + model=model, + managed_agents=[search_agent], +) +manager_agent.run( + "If the US keeps its 2024 growth rate, how many years will it take for the GDP to double?" +) +``` +Voilà! +You can then navigate to `http://0.0.0.0:6006/projects/` to inspect your run! + + + +You can see that the CodeAgent called its managed ToolCallingAgent (by the way, the managed agent could have been a CodeAgent as well) to ask it to run the web search for the U.S. 2024 growth rate. Then the managed agent returned its report and the manager agent acted upon it to calculate the economy doubling time! Sweet, isn't it? + +## Setting up telemetry with Langfuse + +This part shows how to monitor and debug your Hugging Face **smolagents** with **Langfuse** using the `SmolagentsInstrumentor`. 
+ +> **What is Langfuse?** [Langfuse](https://langfuse.com) is an open-source platform for LLM engineering. It provides tracing and monitoring capabilities for AI agents, helping developers debug, analyze, and optimize their products. Langfuse integrates with various tools and frameworks via native integrations, OpenTelemetry, and SDKs. + +### Step 1: Install Dependencies + +```python +%pip install smolagents +%pip install opentelemetry-sdk opentelemetry-exporter-otlp openinference-instrumentation-smolagents +``` + +### Step 2: Set Up Environment Variables + +Set your Langfuse API keys and configure the OpenTelemetry endpoint to send traces to Langfuse. Get your Langfuse API keys by signing up for [Langfuse Cloud](https://cloud.langfuse.com) or [self-hosting Langfuse](https://langfuse.com/self-hosting). + +Also, add your [Hugging Face token](https://huggingface.co/settings/tokens) (`HF_TOKEN`) as an environment variable. + +```python +import os +import base64 + +LANGFUSE_PUBLIC_KEY="pk-lf-..." +LANGFUSE_SECRET_KEY="sk-lf-..." +LANGFUSE_AUTH=base64.b64encode(f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()).decode() + +os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://cloud.langfuse.com/api/public/otel" # EU data region +# os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://us.cloud.langfuse.com/api/public/otel" # US data region +os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {LANGFUSE_AUTH}" + +# your Hugging Face token +os.environ["HF_TOKEN"] = "hf_..." +``` + +### Step 3: Initialize the `SmolagentsInstrumentor` + +Initialize the `SmolagentsInstrumentor` before your application code. Configure `tracer_provider` and add a span processor to export traces to Langfuse. `OTLPSpanExporter()` uses the endpoint and headers from the environment variables. 
+ + +```python +from opentelemetry.sdk.trace import TracerProvider + +from openinference.instrumentation.smolagents import SmolagentsInstrumentor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.trace.export import SimpleSpanProcessor + +trace_provider = TracerProvider() +trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter())) + +SmolagentsInstrumentor().instrument(tracer_provider=trace_provider) +``` + +### Step 4: Run your smolagent + +```python +from smolagents import ( + CodeAgent, + ToolCallingAgent, + DuckDuckGoSearchTool, + VisitWebpageTool, + HfApiModel, +) + +model = HfApiModel( + model_id="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" +) + +search_agent = ToolCallingAgent( + tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], + model=model, + name="search_agent", + description="This is an agent that can do web search.", +) + +manager_agent = CodeAgent( + tools=[], + model=model, + managed_agents=[search_agent], +) +manager_agent.run( + "How can Langfuse be used to monitor and improve the reasoning and decision-making of smolagents when they execute multi-step tasks, like dynamically adjusting a recipe based on user feedback or available ingredients?" +) +``` + +### Step 5: View Traces in Langfuse + +After running the agent, you can view the traces generated by your smolagents application in [Langfuse](https://cloud.langfuse.com). You should see detailed steps of the LLM interactions, which can help you debug and optimize your AI agent. 
+ +![smolagents example trace](https://langfuse.com/images/cookbook/integration-smolagents/smolagent_example_trace.png) + +_[Public example trace in Langfuse](https://cloud.langfuse.com/project/cloramnkj0002jz088vzn1ja4/traces/ce5160f9bfd5a6cd63b07d2bfcec6f54?timestamp=2025-02-11T09%3A25%3A45.163Z&display=details)_ diff --git a/docs/source/en/tutorials/memory.mdx b/docs/source/en/tutorials/memory.mdx new file mode 100644 index 000000000..0732d9596 --- /dev/null +++ b/docs/source/en/tutorials/memory.mdx @@ -0,0 +1,148 @@ + +# 📚 Manage your agent's memory + +[[open-in-colab]] + +In the end, an agent can be defined by simple components: it has tools, prompts. +And most importantly, it has a memory of past steps, drawing a history of planning, execution, and errors. + +### Replay your agent's memory + +We propose several features to inspect a past agent run. + +You can instrument the agent's run to display it in a great UI that lets you zoom in/out on specific steps, as highlighted in the [instrumentation guide](./inspect_runs). + +You can also use `agent.replay()`, as follows: + +After the agent has run: +```py +from smolagents import HfApiModel, CodeAgent + +agent = CodeAgent(tools=[], model=HfApiModel(), verbosity_level=0) + +result = agent.run("What's the 20th Fibonacci number?") +``` + +If you want to replay this last run, just use: +```py +agent.replay() +``` + +### Dynamically change the agent's memory + +Many advanced use cases require dynamic modification of the agent's memory. 
+ +You can access the agent's memory using: + +```py +from smolagents import ActionStep + +system_prompt_step = agent.memory.system_prompt +print("The system prompt given to the agent was:") +print(system_prompt_step.system_prompt) + +task_step = agent.memory.steps[0] +print("\n\nThe first task step was:") +print(task_step.task) + +for step in agent.memory.steps: + if isinstance(step, ActionStep): + if step.error is not None: + print(f"\nStep {step.step_number} got this error:\n{step.error}\n") + else: + print(f"\nStep {step.step_number} got these observations:\n{step.observations}\n") +``` + +Use `agent.memory.get_full_steps()` to get full steps as dictionaries. + +You can also use step callbacks to dynamically change the agent's memory. + +Step callbacks can access the `agent` itself in their arguments, so they can access any memory step as highlighted above, and change it if needed. For instance, let's say you are observing screenshots of each step performed by a web browser agent. You want to log the newest screenshot, and remove the images from ancient steps to save on token costs. + +You could run something like the following. 
+_Note: this code is incomplete, some imports and object definitions have been removed for the sake of concision, visit [the original script](https://github.com/huggingface/smolagents/blob/main/src/smolagents/vision_web_browser.py) to get the full working code._ + +```py +import helium +from PIL import Image +from io import BytesIO +from time import sleep + +def update_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None: + sleep(1.0) # Let JavaScript animations happen before taking the screenshot + driver = helium.get_driver() + latest_step = memory_step.step_number + for previous_memory_step in agent.memory.steps: # Remove previous screenshots from logs for lean processing + if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= latest_step - 2: + previous_memory_step.observations_images = None + png_bytes = driver.get_screenshot_as_png() + image = Image.open(BytesIO(png_bytes)) + memory_step.observations_images = [image.copy()] +``` + +Then you should pass this function in the `step_callbacks` argument upon initialization of your agent: + +```py +CodeAgent( + tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f], + model=model, + additional_authorized_imports=["helium"], + step_callbacks=[update_screenshot], + max_steps=20, + verbosity_level=2, +) +``` + +Head to our [vision web browser code](https://github.com/huggingface/smolagents/blob/main/src/smolagents/vision_web_browser.py) to see the full working example. + +### Run agents one step at a time + +This can be useful in case you have tool calls that take days: you can just run your agents step by step. +This will also let you update the memory on each step. + +```py +from smolagents import HfApiModel, CodeAgent, ActionStep, TaskStep + +agent = CodeAgent(tools=[], model=HfApiModel(), verbosity_level=1) +print(agent.memory.system_prompt) + +task = "What is the 20th Fibonacci number?" 
+ +# You could modify the memory as needed here by inputting the memory of another agent. +# agent.memory.steps = previous_agent.memory.steps + +# Let's start a new task! +agent.memory.steps.append(TaskStep(task=task, task_images=[])) + +final_answer = None +step_number = 1 +while final_answer is None and step_number <= 10: + memory_step = ActionStep( + step_number=step_number, + observations_images=[], + ) + # Run one step. + final_answer = agent.step(memory_step) + agent.memory.steps.append(memory_step) + step_number += 1 + + # Change the memory as you please! + # For instance to update the latest step: + # agent.memory.steps[-1] = ... + +print("The final answer is:", final_answer) +``` \ No newline at end of file diff --git a/docs/source/en/tutorials/secure_code_execution.md b/docs/source/en/tutorials/secure_code_execution.mdx similarity index 100% rename from docs/source/en/tutorials/secure_code_execution.md rename to docs/source/en/tutorials/secure_code_execution.mdx diff --git a/docs/source/en/tutorials/tools.md b/docs/source/en/tutorials/tools.mdx similarity index 100% rename from docs/source/en/tutorials/tools.md rename to docs/source/en/tutorials/tools.mdx diff --git a/docs/source/hi/conceptual_guides/intro_agents.md b/docs/source/hi/conceptual_guides/intro_agents.mdx similarity index 100% rename from docs/source/hi/conceptual_guides/intro_agents.md rename to docs/source/hi/conceptual_guides/intro_agents.mdx diff --git a/docs/source/hi/conceptual_guides/react.md b/docs/source/hi/conceptual_guides/react.mdx similarity index 91% rename from docs/source/hi/conceptual_guides/react.md rename to docs/source/hi/conceptual_guides/react.mdx index c36f17cfe..0f17901e8 100644 --- a/docs/source/hi/conceptual_guides/react.md +++ b/docs/source/hi/conceptual_guides/react.mdx @@ -42,6 +42,3 @@ ReAct प्रक्रिया में पिछले चरणों क हम दो प्रकार के ToolCallingAgent को लागू करते हैं: - [`ToolCallingAgent`] अपने आउटपुट में टूल कॉल को JSON के रूप में जनरेट करता है। - [`CodeAgent`] 
ToolCallingAgent का एक नया प्रकार है जो अपने टूल कॉल को कोड के ब्लॉब्स के रूप में जनरेट करता है, जो उन LLM के लिए वास्तव में अच्छी तरह काम करता है जिनका कोडिंग प्रदर्शन मजबूत है। - -> [!TIP] -> हम एजेंट्स को वन-शॉट में चलाने का विकल्प भी प्रदान करते हैं: बस एजेंट को लॉन्च करते समय `single_step=True` पास करें, जैसे `agent.run(your_task, single_step=True)` \ No newline at end of file diff --git a/docs/source/hi/examples/multiagents.md b/docs/source/hi/examples/multiagents.mdx similarity index 99% rename from docs/source/hi/examples/multiagents.md rename to docs/source/hi/examples/multiagents.mdx index 33056c8ba..1e9fcc745 100644 --- a/docs/source/hi/examples/multiagents.md +++ b/docs/source/hi/examples/multiagents.mdx @@ -119,7 +119,7 @@ print(visit_webpage("https://en.wikipedia.org/wiki/Hugging_Face")[:500]) अब जब हमारे पास सभी टूल्स `search` और `visit_webpage` हैं, हम उनका उपयोग वेब एजेंट बनाने के लिए कर सकते हैं। इस एजेंट के लिए कौन सा कॉन्फ़िगरेशन चुनें? -- वेब ब्राउज़िंग एक सिंगल-टाइमलाइन टास्क है जिसे समानांतर टूल कॉल की आवश्यकता नहीं है, इसलिए JSON टूल कॉलिंग इसके लिए अच्छी तरह काम करती है। इसलिए हम `JsonAgent` चुनते हैं। +- वेब ब्राउज़िंग एक सिंगल-टाइमलाइन टास्क है जिसे समानांतर टूल कॉल की आवश्यकता नहीं है, इसलिए JSON टूल कॉलिंग इसके लिए अच्छी तरह काम करती है। इसलिए हम `ToolCallingAgent` चुनते हैं। - साथ ही, चूंकि कभी-कभी वेब सर्च में सही उत्तर खोजने से पहले कई पेजों की सर्च करने की आवश्यकता होती है, हम `max_steps` को बढ़ाकर 10 करना पसंद करते हैं। ```py diff --git a/docs/source/hi/examples/rag.md b/docs/source/hi/examples/rag.mdx similarity index 100% rename from docs/source/hi/examples/rag.md rename to docs/source/hi/examples/rag.mdx diff --git a/docs/source/hi/examples/text_to_sql.md b/docs/source/hi/examples/text_to_sql.mdx similarity index 100% rename from docs/source/hi/examples/text_to_sql.md rename to docs/source/hi/examples/text_to_sql.mdx diff --git a/docs/source/hi/guided_tour.md b/docs/source/hi/guided_tour.mdx similarity index 96% rename from 
docs/source/hi/guided_tour.md rename to docs/source/hi/guided_tour.mdx index 24cb71d03..745b6643a 100644 --- a/docs/source/hi/guided_tour.md +++ b/docs/source/hi/guided_tour.mdx @@ -142,7 +142,7 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co रन के बाद क्या हुआ यह जांचने के लिए यहाँ कुछ उपयोगी एट्रिब्यूट्स हैं: - `agent.logs` एजेंट के फाइन-ग्रेन्ड लॉग्स को स्टोर करता है। एजेंट के रन के हर स्टेप पर, सब कुछ एक डिक्शनरी में स्टोर किया जाता है जो फिर `agent.logs` में जोड़ा जाता है। -- `agent.write_inner_memory_from_logs()` चलाने से LLM के लिए एजेंट के लॉग्स की एक इनर मेमोरी बनती है, चैट मैसेज की लिस्ट के रूप में। यह मेथड लॉग के प्रत्येक स्टेप पर जाता है और केवल वही स्टोर करता है जिसमें यह एक मैसेज के रूप में रुचि रखता है: उदाहरण के लिए, यह सिस्टम प्रॉम्प्ट और टास्क को अलग-अलग मैसेज के रूप में सेव करेगा, फिर प्रत्येक स्टेप के लिए यह LLM आउटपुट को एक मैसेज के रूप में और टूल कॉल आउटपुट को दूसरे मैसेज के रूप में स्टोर करेगा। +- `agent.write_memory_to_messages()` चलाने से LLM के लिए एजेंट के लॉग्स की एक इनर मेमोरी बनती है, चैट मैसेज की लिस्ट के रूप में। यह मेथड लॉग के प्रत्येक स्टेप पर जाता है और केवल वही स्टोर करता है जिसमें यह एक मैसेज के रूप में रुचि रखता है: उदाहरण के लिए, यह सिस्टम प्रॉम्प्ट और टास्क को अलग-अलग मैसेज के रूप में सेव करेगा, फिर प्रत्येक स्टेप के लिए यह LLM आउटपुट को एक मैसेज के रूप में और टूल कॉल आउटपुट को दूसरे मैसेज के रूप में स्टोर करेगा। ## टूल्स @@ -158,7 +158,7 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co ### डिफ़ॉल्ट टूलबॉक्स -Transformers एजेंट्स को सशक्त बनाने के लिए एक डिफ़ॉल्ट टूलबॉक्स के साथ आता है, जिसे आप आर्ग्यूमेंट `add_base_tools = True` के साथ अपने एजेंट में इनिशियलाइजेशन पर जोड़ सकते हैं: +`smolagents` एजेंट्स को सशक्त बनाने के लिए एक डिफ़ॉल्ट टूलबॉक्स के साथ आता है, जिसे आप आर्ग्यूमेंट `add_base_tools = True` के साथ अपने एजेंट में इनिशियलाइजेशन पर जोड़ सकते हैं: - **DuckDuckGo वेब सर्च**: DuckDuckGo ब्राउज़र का उपयोग करके वेब सर्च करता है। - **पायथन कोड 
इंटरप्रेटर**: आपका LLM जनरेटेड पायथन कोड एक सुरक्षित एनवायरनमेंट में चलाता है। यह टूल [`ToolCallingAgent`] में केवल तभी जोड़ा जाएगा जब आप इसे `add_base_tools=True` के साथ इनिशियलाइज़ करते हैं, क्योंकि कोड-बेस्ड एजेंट पहले से ही नेटिव रूप से पायथन कोड एक्जीक्यूट कर सकता है diff --git a/docs/source/hi/index.md b/docs/source/hi/index.mdx similarity index 100% rename from docs/source/hi/index.md rename to docs/source/hi/index.mdx diff --git a/docs/source/hi/reference/agents.md b/docs/source/hi/reference/agents.mdx similarity index 95% rename from docs/source/hi/reference/agents.md rename to docs/source/hi/reference/agents.mdx index 11b461e79..2e070cf03 100644 --- a/docs/source/hi/reference/agents.md +++ b/docs/source/hi/reference/agents.mdx @@ -42,10 +42,9 @@ Agents और tools के बारे में अधिक जानने [[autodoc]] ToolCallingAgent - ### ManagedAgent -[[autodoc]] ManagedAgent +_This class is deprecated since 1.8.0: now you just need to pass name and description attributes to an agent to directly use it as previously done with a ManagedAgent._ ### stream_to_gradio @@ -146,6 +145,7 @@ print(model(messages)) यह क्लास आपको किसी भी OpenAIServer कम्पैटिबल मॉडल को कॉल करने देती है। यहाँ बताया गया है कि आप इसे कैसे सेट कर सकते हैं (आप दूसरे सर्वर को पॉइंट करने के लिए `api_base` url को कस्टमाइज़ कर सकते हैं): ```py +import os from smolagents import OpenAIServerModel model = OpenAIServerModel( @@ -153,4 +153,14 @@ model = OpenAIServerModel( api_base="https://api.openai.com/v1", api_key=os.environ["OPENAI_API_KEY"], ) -``` \ No newline at end of file +``` + +## Prompts + +[[autodoc]] smolagents.agents.PromptTemplates + +[[autodoc]] smolagents.agents.PlanningPromptTemplate + +[[autodoc]] smolagents.agents.ManagedAgentPromptTemplate + +[[autodoc]] smolagents.agents.FinalAnswerPromptTemplate diff --git a/docs/source/hi/reference/tools.md b/docs/source/hi/reference/tools.mdx similarity index 97% rename from docs/source/hi/reference/tools.md rename to docs/source/hi/reference/tools.mdx 
index ddb24d1ab..6c270321e 100644 --- a/docs/source/hi/reference/tools.md +++ b/docs/source/hi/reference/tools.mdx @@ -80,12 +80,12 @@ Smolagents एक experimental API है जो किसी भी समय ### AgentText -[[autodoc]] smolagents.types.AgentText +[[autodoc]] smolagents.agent_types.AgentText ### AgentImage -[[autodoc]] smolagents.types.AgentImage +[[autodoc]] smolagents.agent_types.AgentImage ### AgentAudio -[[autodoc]] smolagents.types.AgentAudio +[[autodoc]] smolagents.agent_types.AgentAudio diff --git a/docs/source/hi/tutorials/building_good_agents.md b/docs/source/hi/tutorials/building_good_agents.mdx similarity index 99% rename from docs/source/hi/tutorials/building_good_agents.md rename to docs/source/hi/tutorials/building_good_agents.mdx index 86eee273c..92587ef35 100644 --- a/docs/source/hi/tutorials/building_good_agents.md +++ b/docs/source/hi/tutorials/building_good_agents.mdx @@ -195,7 +195,7 @@ Final answer: आइए देखें कि यह कैसे काम करता है। उदाहरण के लिए, आइए [`CodeAgent`] के लिए डिफ़ॉल्ट सिस्टम प्रॉम्प्ट की जाँच करें (नीचे दिया गया वर्जन जीरो-शॉट उदाहरणों को छोड़कर छोटा किया गया है)। ```python -print(agent.system_prompt_template) +print(agent.prompt_templates["system_prompt"]) ``` Here is what you get: ```text @@ -244,15 +244,7 @@ Now Begin! If you solve the task correctly, you will receive a reward of $1,000, फिर आप सिस्टम प्रॉम्प्ट को निम्नानुसार बदल सकते हैं: ```py -from smolagents.prompts import CODE_SYSTEM_PROMPT - -modified_system_prompt = CODE_SYSTEM_PROMPT + "\nHere you go!" # Change the system prompt here - -agent = CodeAgent( - tools=[], - model=HfApiModel(), - system_prompt=modified_system_prompt -) +agent.prompt_templates["system_prompt"] = agent.prompt_templates["system_prompt"] + "\nHere you go!" ``` This also works with the [`ToolCallingAgent`]. 
diff --git a/docs/source/hi/tutorials/inspect_runs.md b/docs/source/hi/tutorials/inspect_runs.mdx similarity index 98% rename from docs/source/hi/tutorials/inspect_runs.md rename to docs/source/hi/tutorials/inspect_runs.mdx index db85fc755..0669c4dcc 100644 --- a/docs/source/hi/tutorials/inspect_runs.md +++ b/docs/source/hi/tutorials/inspect_runs.mdx @@ -71,7 +71,6 @@ SmolagentsInstrumentor().instrument(tracer_provider=trace_provider) from smolagents import ( CodeAgent, ToolCallingAgent, - ManagedAgent, DuckDuckGoSearchTool, VisitWebpageTool, HfApiModel, @@ -79,15 +78,13 @@ from smolagents import ( model = HfApiModel() -agent = ToolCallingAgent( +managed_agent = ToolCallingAgent( tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], model=model, -) -managed_agent = ManagedAgent( - agent=agent, name="managed_agent", description="This is an agent that can do web search.", ) + manager_agent = CodeAgent( tools=[], model=model, diff --git a/docs/source/hi/tutorials/secure_code_execution.md b/docs/source/hi/tutorials/secure_code_execution.mdx similarity index 100% rename from docs/source/hi/tutorials/secure_code_execution.md rename to docs/source/hi/tutorials/secure_code_execution.mdx diff --git a/docs/source/hi/tutorials/tools.md b/docs/source/hi/tutorials/tools.mdx similarity index 100% rename from docs/source/hi/tutorials/tools.md rename to docs/source/hi/tutorials/tools.mdx diff --git a/docs/source/zh/conceptual_guides/intro_agents.md b/docs/source/zh/conceptual_guides/intro_agents.mdx similarity index 100% rename from docs/source/zh/conceptual_guides/intro_agents.md rename to docs/source/zh/conceptual_guides/intro_agents.mdx diff --git a/docs/source/zh/conceptual_guides/react.md b/docs/source/zh/conceptual_guides/react.mdx similarity index 93% rename from docs/source/zh/conceptual_guides/react.md rename to docs/source/zh/conceptual_guides/react.mdx index 24428e03f..cdb970728 100644 --- a/docs/source/zh/conceptual_guides/react.md +++ 
b/docs/source/zh/conceptual_guides/react.mdx @@ -42,6 +42,3 @@ ReAct 过程涉及保留过去步骤的记忆。 我们实现了两个版本的 ToolCallingAgent: - [`ToolCallingAgent`] 在其输出中生成 JSON 格式的工具调用。 - [`CodeAgent`] 是一种新型的 ToolCallingAgent,它生成代码块形式的工具调用,这对于具有强大编码性能的 LLM 非常有效。 - -> [!TIP] -> 我们还提供了一个选项来以单步模式运行 agent:只需在启动 agent 时传递 `single_step=True`,例如 `agent.run(your_task, single_step=True)` \ No newline at end of file diff --git a/docs/source/zh/examples/multiagents.md b/docs/source/zh/examples/multiagents.mdx similarity index 90% rename from docs/source/zh/examples/multiagents.md rename to docs/source/zh/examples/multiagents.mdx index 67eed890e..3b177d133 100644 --- a/docs/source/zh/examples/multiagents.md +++ b/docs/source/zh/examples/multiagents.mdx @@ -120,7 +120,7 @@ print(visit_webpage("https://en.wikipedia.org/wiki/Hugging_Face")[:500]) 现在我们有了所有工具`search`和`visit_webpage`,我们可以使用它们来创建web agent。 我们该选取什么样的配置来构建这个agent呢? -- 网页浏览是一个单线程任务,不需要并行工具调用,因此JSON工具调用对于这个任务非常有效。因此我们选择`JsonAgent`。 +- 网页浏览是一个单线程任务,不需要并行工具调用,因此JSON工具调用对于这个任务非常有效。因此我们选择`ToolCallingAgent`。 - 有时候网页搜索需要探索许多页面才能找到正确答案,所以我们更喜欢将 `max_steps` 增加到10。 ```py @@ -139,26 +139,24 @@ web_agent = ToolCallingAgent( tools=[DuckDuckGoSearchTool(), visit_webpage], model=model, max_steps=10, -) -``` - -然后我们将这个agent封装到一个`ManagedAgent`中,使其可以被其管理的agent调用。 - -```py -managed_web_agent = ManagedAgent( - agent=web_agent, name="search", description="Runs web searches for you. 
Give it your query as an argument.", ) ``` -最后,我们创建一个manager agent,在初始化时将我们的managed agent传递给它的`managed_agents`参数。因为这个agent负责计划和思考,所以高级推理将是有益的,因此`CodeAgent`将是最佳选择。此外,我们想要问一个涉及当前年份的问题,并进行额外的数据计算:因此让我们添加`additional_authorized_imports=["time", "numpy", "pandas"]`,以防agent需要这些包。 +请注意,我们为这个代理赋予了 name(名称)和 description(描述)属性,这些是必需属性,以便让管理代理能够调用此代理。 + +然后,我们创建一个管理代理,在初始化时,将受管代理作为 managed_agents 参数传递给它。 + +由于这个代理的任务是进行规划和思考,高级推理能力会很有帮助,因此 CodeAgent(代码代理)将是最佳选择。 + +此外,我们要提出一个涉及当前年份并需要进行额外数据计算的问题:所以让我们添加 additional_authorized_imports=["time", "numpy", "pandas"],以防代理需要用到这些包。 ```py manager_agent = CodeAgent( tools=[], model=model, - managed_agents=[managed_web_agent], + managed_agents=[web_agent], additional_authorized_imports=["time", "numpy", "pandas"], ) ``` diff --git a/docs/source/zh/examples/rag.mdx b/docs/source/zh/examples/rag.mdx new file mode 100644 index 000000000..23efa9e0e --- /dev/null +++ b/docs/source/zh/examples/rag.mdx @@ -0,0 +1,143 @@ + +# Agentic RAG + +[[open-in-colab]] + +Retrieval-Augmented-Generation (RAG) 是“使用大语言模型(LLM)来回答用户查询,但基于从知识库中检索的信息”。它比使用普通或微调的 LLM 具有许多优势:举几个例子,它允许将答案基于真实事实并减少虚构;它允许提供 LLM 领域特定的知识;并允许对知识库中的信息访问进行精细控制。 + +但是,普通的 RAG 存在一些局限性,以下两点尤为突出: + +- 它只执行一次检索步骤:如果结果不好,生成的内容也会不好。 +- 语义相似性是以用户查询为参考计算的,这可能不是最优的:例如,用户查询通常是一个问题,而包含真实答案的文档通常是肯定语态,因此其相似性得分会比其他以疑问形式呈现的源文档低,从而导致错失相关信息的风险。 + +我们可以通过制作一个 RAG agent来缓解这些问题:非常简单,一个配备了检索工具的agent!这个 agent 将 +会:✅ 自己构建查询和检索,✅ 如果需要的话会重新检索。 + +因此,它将比普通 RAG 更智能,因为它可以自己构建查询,而不是直接使用用户查询作为参考。这样,它可以更 +接近目标文档,从而提高检索的准确性, [HyDE](https://huggingface.co/papers/2212.10496)。此 agent 可以 +使用生成的片段,并在需要时重新检索,就像 [Self-Query](https://docs.llamaindex.ai/en/stable/examples/evaluation/RetryQuery/)。 + +我们现在开始构建这个系统. 
🛠️ + +运行以下代码以安装所需的依赖包: +```bash +!pip install smolagents pandas langchain langchain-community sentence-transformers rank_bm25 --upgrade -q +``` + +你需要一个有效的 token 作为环境变量 `HF_TOKEN` 来调用 HF Inference API。我们使用 python-dotenv 来加载它。 +```py +from dotenv import load_dotenv +load_dotenv() +``` + +我们首先加载一个知识库以在其上执行 RAG:此数据集是许多 Hugging Face 库的文档页面的汇编,存储为 markdown 格式。我们将仅保留 `transformers` 库的文档。然后通过处理数据集并将其存储到向量数据库中,为检索器准备知识库。我们将使用 [LangChain](https://python.langchain.com/docs/introduction/) 来利用其出色的向量数据库工具。 +```py +import datasets +from langchain.docstore.document import Document +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.retrievers import BM25Retriever + +knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train") +knowledge_base = knowledge_base.filter(lambda row: row["source"].startswith("huggingface/transformers")) + +source_docs = [ + Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]}) + for doc in knowledge_base +] + +text_splitter = RecursiveCharacterTextSplitter( + chunk_size=500, + chunk_overlap=50, + add_start_index=True, + strip_whitespace=True, + separators=["\n\n", "\n", ".", " ", ""], +) +docs_processed = text_splitter.split_documents(source_docs) +``` + +现在文档已准备好。我们来一起构建我们的 agent RAG 系统! +👉 我们只需要一个 RetrieverTool,我们的 agent 可以利用它从知识库中检索信息。 + +由于我们需要将 vectordb 添加为工具的属性,我们不能简单地使用带有 `@tool` 装饰器的简单工具构造函数:因此我们将遵循 [tools 教程](../tutorials/tools) 中突出显示的高级设置。 + +```py +from smolagents import Tool + +class RetrieverTool(Tool): + name = "retriever" + description = "Uses semantic search to retrieve the parts of transformers documentation that could be most relevant to answer your query." + inputs = { + "query": { + "type": "string", + "description": "The query to perform. This should be semantically close to your target documents. 
Use the affirmative form rather than a question.", + } + } + output_type = "string" + + def __init__(self, docs, **kwargs): + super().__init__(**kwargs) + self.retriever = BM25Retriever.from_documents( + docs, k=10 + ) + + def forward(self, query: str) -> str: + assert isinstance(query, str), "Your search query must be a string" + + docs = self.retriever.invoke( + query, + ) + return "\nRetrieved documents:\n" + "".join( + [ + f"\n\n===== Document {str(i)} =====\n" + doc.page_content + for i, doc in enumerate(docs) + ] + ) + +retriever_tool = RetrieverTool(docs_processed) +``` +BM25 检索方法是一个经典的检索方法,因为它的设置速度非常快。为了提高检索准确性,你可以使用语义搜索,使用文档的向量表示替换 BM25:因此你可以前往 [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard) 选择一个好的嵌入模型。 + +现在我们已经创建了一个可以从知识库中检索信息的工具,现在我们可以很容易地创建一个利用这个 +`retriever_tool` 的 agent!此 agent 将使用如下参数初始化: +- `tools`:代理将能够调用的工具列表。 +- `model`:为代理提供动力的 LLM。 + +我们的 `model` 必须是一个可调用对象,它接受一个消息的 list 作为输入,并返回文本。它还需要接受一个 stop_sequences 参数,指示何时停止生成。为了方便起见,我们直接使用包中提供的 `HfEngine` 类来获取调用 Hugging Face 的 Inference API 的 LLM 引擎。 + +接着,我们将使用 [meta-llama/Llama-3.3-70B-Instruct](meta-llama/Llama-3.3-70B-Instruct) 作为 llm 引 +擎,因为: +- 它有一个长 128k 上下文,这对处理长源文档很有用。 +- 它在 HF 的 Inference API 上始终免费提供! 
+ +_Note:_ 此 Inference API 托管基于各种标准的模型,部署的模型可能会在没有事先通知的情况下进行更新或替换。了解更多信息,请点击[这里](https://huggingface.co/docs/api-inference/supported-models)。 + +```py +from smolagents import HfApiModel, CodeAgent + +agent = CodeAgent( + tools=[retriever_tool], model=HfApiModel("meta-llama/Llama-3.3-70B-Instruct"), max_steps=4, verbose=True +) +``` + +当我们初始化 CodeAgent 时,它已经自动获得了一个默认的系统提示,告诉 LLM 引擎按步骤处理并生成工具调用作为代码片段,但你可以根据需要替换此提示模板。接着,当其 `.run()` 方法被调用时,代理将负责调用 LLM 引擎,并在循环中执行工具调用,直到工具 `final_answer` 被调用,而其参数为最终答案。 + +```py +agent_output = agent.run("For a transformers model training, which is slower, the forward or the backward pass?") + +print("Final output:") +print(agent_output) +``` diff --git a/docs/source/zh/examples/text_to_sql.md b/docs/source/zh/examples/text_to_sql.mdx similarity index 64% rename from docs/source/zh/examples/text_to_sql.md rename to docs/source/zh/examples/text_to_sql.mdx index 12d0c5e47..419c45159 100644 --- a/docs/source/zh/examples/text_to_sql.md +++ b/docs/source/zh/examples/text_to_sql.mdx @@ -17,17 +17,17 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -In this tutorial, we’ll see how to implement an agent that leverages SQL using `smolagents`. +在此教程中,我们将看到如何使用 `smolagents` 实现一个利用 SQL 的 agent。 -> Let's start with the golden question: why not keep it simple and use a standard text-to-SQL pipeline? +> 让我们从经典问题开始:为什么不简单地使用标准的 text-to-SQL pipeline 呢? -A standard text-to-sql pipeline is brittle, since the generated SQL query can be incorrect. Even worse, the query could be incorrect, but not raise an error, instead giving some incorrect/useless outputs without raising an alarm. +标准的 text-to-SQL pipeline 很脆弱,因为生成的 SQL 查询可能会出错。更糟糕的是,查询可能出错却不引发错误警报,从而返回一些不正确或无用的结果。 -👉 Instead, an agent system is able to critically inspect outputs and decide if the query needs to be changed or not, thus giving it a huge performance boost. +👉 相反,agent 系统则可以检视输出结果并决定查询是否需要被更改,因此带来巨大的性能提升。 -Let’s build this agent! 💪 +让我们来一起构建这个 agent! 
💪 -First, we setup the SQL environment: +首先,我们构建一个 SQL 的环境: ```py from sqlalchemy import ( create_engine, @@ -69,11 +69,9 @@ for row in rows: cursor = connection.execute(stmt) ``` -### Build our agent +### 构建 agent -Now let’s make our SQL table retrievable by a tool. - -The tool’s description attribute will be embedded in the LLM’s prompt by the agent system: it gives the LLM information about how to use the tool. This is where we want to describe the SQL table. +现在,我们构建一个 agent,它将使用 SQL 查询来回答问题。工具的 description 属性将被 agent 系统嵌入到 LLM 的提示中:它为 LLM 提供有关如何使用该工具的信息。这正是我们描述 SQL 表的地方。 ```py inspector = inspect(engine) @@ -91,9 +89,10 @@ Columns: - tip: FLOAT ``` -Now let’s build our tool. It needs the following: (read [the tool doc](../tutorials/tools) for more detail) -- A docstring with an `Args:` part listing arguments. -- Type hints on both inputs and output. +现在让我们构建我们的工具。它需要以下内容:(更多细节请参阅[工具文档](../tutorials/tools)) + +- 一个带有 `Args:` 部分列出参数的 docstring。 +- 输入和输出的type hints。 ```py from smolagents import tool @@ -120,11 +119,9 @@ def sql_engine(query: str) -> str: return output ``` -Now let us create an agent that leverages this tool. +我们现在使用这个工具来创建一个 agent。我们使用 `CodeAgent`,这是 smolagent 的主要 agent 类:一个在代码中编写操作并根据 ReAct 框架迭代先前输出的 agent。 -We use the `CodeAgent`, which is smolagents’ main agent class: an agent that writes actions in code and can iterate on previous output according to the ReAct framework. - -The model is the LLM that powers the agent system. HfApiModel allows you to call LLMs using HF’s Inference API, either via Serverless or Dedicated endpoint, but you could also use any proprietary API. +这个模型是驱动 agent 系统的 LLM。`HfApiModel` 允许你使用 HF Inference API 调用 LLM,无论是通过 Serverless 还是 Dedicated endpoint,但你也可以使用任何专有 API。 ```py from smolagents import CodeAgent, HfApiModel @@ -136,11 +133,9 @@ agent = CodeAgent( agent.run("Can you give me the name of the client who got the most expensive receipt?") ``` -### Level 2: Table joins - -Now let’s make it more challenging! 
We want our agent to handle joins across multiple tables. +### Level 2: 表连接 -So let’s make a second table recording the names of waiters for each receipt_id! +现在让我们增加一些挑战!我们希望我们的 agent 能够处理跨多个表的连接。因此,我们创建一个新表,记录每个 receipt_id 的服务员名字! ```py table_name = "waiters" @@ -163,7 +158,8 @@ for row in rows: with engine.begin() as connection: cursor = connection.execute(stmt) ``` -Since we changed the table, we update the `SQLExecutorTool` with this table’s description to let the LLM properly leverage information from this table. + +因为我们改变了表,我们需要更新 `SQLExecutorTool`,让 LLM 能够正确利用这个表的信息。 ```py updated_description = """Allows you to perform SQL queries on the table. Beware that this tool's output is a string representation of the execution output. @@ -180,7 +176,8 @@ for table in ["receipts", "waiters"]: print(updated_description) ``` -Since this request is a bit harder than the previous one, we’ll switch the LLM engine to use the more powerful [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct)! + +因为这个request 比之前的要难一些,我们将 LLM 引擎切换到更强大的 [Qwen/Qwen2.5-Coder-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct)! ```py sql_engine.description = updated_description @@ -192,11 +189,13 @@ agent = CodeAgent( agent.run("Which waiter got more total money from tips?") ``` -It directly works! The setup was surprisingly simple, wasn’t it? -This example is done! We've touched upon these concepts: -- Building new tools. -- Updating a tool's description. -- Switching to a stronger LLM helps agent reasoning. +它直接就能工作!设置过程非常简单,难道不是吗? + +这个例子到此结束!我们涵盖了这些概念: + +- 构建新工具。 +- 更新工具的描述。 +- 切换到更强大的 LLM 有助于 agent 推理。 -✅ Now you can go build this text-to-SQL system you’ve always dreamt of! 
✨ \ No newline at end of file +✅ 现在你可以构建你一直梦寐以求的 text-to-SQL 系统了!✨ diff --git a/docs/source/zh/guided_tour.md b/docs/source/zh/guided_tour.mdx similarity index 97% rename from docs/source/zh/guided_tour.md rename to docs/source/zh/guided_tour.mdx index 537e5948e..54ae10419 100644 --- a/docs/source/zh/guided_tour.md +++ b/docs/source/zh/guided_tour.mdx @@ -152,7 +152,7 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co 以下是一些有用的属性,用于检查运行后发生了什么: - `agent.logs` 存储 agent 的细粒度日志。在 agent 运行的每一步,所有内容都会存储在一个字典中,然后附加到 `agent.logs` 中。 -- 运行 `agent.write_inner_memory_from_logs()` 会为 LLM 创建一个 agent 日志的内部内存,作为聊天消息列表。此方法会遍历日志的每一步,并仅存储它感兴趣的内容作为消息:例如,它会将系统提示和任务存储为单独的消息,然后对于每一步,它会将 LLM 输出存储为一条消息,工具调用输出存储为另一条消息。如果您想要更高级别的视图 - 但不是每个日志都会被此方法转录。 +- 运行 `agent.write_memory_to_messages()` 会为 LLM 创建一个 agent 日志的内部内存,作为聊天消息列表。此方法会遍历日志的每一步,并仅存储它感兴趣的内容作为消息:例如,它会将系统提示和任务存储为单独的消息,然后对于每一步,它会将 LLM 输出存储为一条消息,工具调用输出存储为另一条消息。如果您想要更高级别的视图 - 但不是每个日志都会被此方法转录。 ## 工具 @@ -168,7 +168,7 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co ### 默认工具箱 -Transformers 附带了一个用于增强 agent 的默认工具箱,您可以在初始化时通过参数 `add_base_tools = True` 将其添加到您的 agent 中: +`smolagents` 附带了一个用于增强 agent 的默认工具箱,您可以在初始化时通过参数 `add_base_tools = True` 将其添加到您的 agent 中: - **DuckDuckGo 网页搜索**:使用 DuckDuckGo 浏览器执行网页搜索。 - **Python 代码解释器**:在安全环境中运行 LLM 生成的 Python 代码。只有在使用 `add_base_tools=True` 初始化 [`ToolCallingAgent`] 时才会添加此工具,因为基于代码的 agent 已经可以原生执行 Python 代码 diff --git a/docs/source/zh/index.md b/docs/source/zh/index.mdx similarity index 100% rename from docs/source/zh/index.md rename to docs/source/zh/index.mdx diff --git a/docs/source/zh/reference/agents.md b/docs/source/zh/reference/agents.md deleted file mode 100644 index 3b05a6d28..000000000 --- a/docs/source/zh/reference/agents.md +++ /dev/null @@ -1,149 +0,0 @@ - -# Agents - - - -Smolagents is an experimental API which is subject to change at any time. 
Results returned by the agents -can vary as the APIs or underlying models are prone to change. - - - -To learn more about agents and tools make sure to read the [introductory guide](../index). This page -contains the API docs for the underlying classes. - -## Agents - -Our agents inherit from [`MultiStepAgent`], which means they can act in multiple steps, each step consisting of one thought, then one tool call and execution. Read more in [this conceptual guide](../conceptual_guides/react). - -We provide two types of agents, based on the main [`Agent`] class. - - [`CodeAgent`] is the default agent, it writes its tool calls in Python code. - - [`ToolCallingAgent`] writes its tool calls in JSON. - -Both require arguments `model` and list of tools `tools` at initialization. - - -### Classes of agents - -[[autodoc]] MultiStepAgent - -[[autodoc]] CodeAgent - -[[autodoc]] ToolCallingAgent - - -### ManagedAgent - -[[autodoc]] ManagedAgent - -### stream_to_gradio - -[[autodoc]] stream_to_gradio - -### GradioUI - -> [!TIP] -> You must have `gradio` installed to use the UI. Please run `pip install smolagents[gradio]` if it's not the case. - -[[autodoc]] GradioUI - -## Models - -You're free to create and use your own models to power your agent. - -You could use any `model` callable for your agent, as long as: -1. It follows the [messages format](./chat_templating) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`. -2. It stops generating outputs *before* the sequences passed in the argument `stop_sequences` - -For defining your LLM, you can make a `custom_model` method which accepts a list of [messages](./chat_templating) and returns text. This callable also needs to accept a `stop_sequences` argument that indicates when to stop generating. 
- -```python -from huggingface_hub import login, InferenceClient - -login("") - -model_id = "meta-llama/Llama-3.3-70B-Instruct" - -client = InferenceClient(model=model_id) - -def custom_model(messages, stop_sequences=["Task"]) -> str: - response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000) - answer = response.choices[0].message.content - return answer -``` - -Additionally, `custom_model` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to model, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs. - -### TransformersModel - -For convenience, we have added a `TransformersModel` that implements the points above by building a local `transformers` pipeline for the model_id given at initialization. - -```python -from smolagents import TransformersModel - -model = TransformersModel(model_id="HuggingFaceTB/SmolLM-135M-Instruct") - -print(model([{"role": "user", "content": "Ok!"}], stop_sequences=["great"])) -``` -```text ->>> What a -``` - -> [!TIP] -> You must have `transformers` and `torch` installed on your machine. Please run `pip install smolagents[transformers]` if it's not the case. - -[[autodoc]] TransformersModel - -### HfApiModel - -The `HfApiModel` wraps an [HF Inference API](https://huggingface.co/docs/api-inference/index) client for the execution of the LLM. - -```python -from smolagents import HfApiModel - -messages = [ - {"role": "user", "content": "Hello, how are you?"}, - {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, - {"role": "user", "content": "No need to help, take it easy."}, -] - -model = HfApiModel() -print(model(messages)) -``` -```text ->>> Of course! If you change your mind, feel free to reach out. Take care! 
-``` -[[autodoc]] HfApiModel - -### LiteLLMModel - -The `LiteLLMModel` leverages [LiteLLM](https://www.litellm.ai/) to support 100+ LLMs from various providers. -You can pass kwargs upon model initialization that will then be used whenever using the model, for instance below we pass `temperature`. - -```python -from smolagents import LiteLLMModel - -messages = [ - {"role": "user", "content": "Hello, how are you?"}, - {"role": "assistant", "content": "I'm doing great. How can I help you today?"}, - {"role": "user", "content": "No need to help, take it easy."}, -] - -model = LiteLLMModel("anthropic/claude-3-5-sonnet-latest", temperature=0.2, max_tokens=10) -print(model(messages)) -``` - -[[autodoc]] LiteLLMModel \ No newline at end of file diff --git a/docs/source/zh/reference/agents.mdx b/docs/source/zh/reference/agents.mdx new file mode 100644 index 000000000..bd7f3a779 --- /dev/null +++ b/docs/source/zh/reference/agents.mdx @@ -0,0 +1,68 @@ + + +# Agents(智能体) + + + +Smolagents 是一个实验性的 API,可能会随时发生变化。由于 API 或底层模型可能发生变化,代理返回的结果也可能有所不同。 + + + +要了解有关智能体和工具的更多信息,请务必阅读[入门指南](../index)。本页面包含基础类的 API 文档。 + +## 智能体(Agents) + +我们的智能体继承自 [`MultiStepAgent`],这意味着它们可以执行多步操作,每一步包含一个思考(thought),然后是一个工具调用和执行。请阅读[概念指南](../conceptual_guides/react)以了解更多信息。 + +我们提供两种类型的代理,它们基于主要的 [`Agent`] 类: + - [`CodeAgent`] 是默认代理,它以 Python 代码编写工具调用。 + - [`ToolCallingAgent`] 以 JSON 编写工具调用。 + +两者在初始化时都需要提供参数 `model` 和工具列表 `tools`。 + +### 智能体类 + +[[autodoc]] MultiStepAgent + +[[autodoc]] CodeAgent + +[[autodoc]] ToolCallingAgent + +### ManagedAgent + +_此类自 1.8.0 起已被弃用:现在您只需向普通代理传递 `name` 和 `description` 属性即可使其可被管理代理调用。_ + +### stream_to_gradio + +[[autodoc]] stream_to_gradio + +### GradioUI + +> [!TIP] +> 您必须安装 `gradio` 才能使用 UI。如果尚未安装,请运行 `pip install smolagents[gradio]`。 + +[[autodoc]] GradioUI + +## 提示(Prompts) + +[[autodoc]] smolagents.agents.PromptTemplates + +[[autodoc]] smolagents.agents.PlanningPromptTemplate + +[[autodoc]] smolagents.agents.ManagedAgentPromptTemplate + +[[autodoc]] 
smolagents.agents.FinalAnswerPromptTemplate diff --git a/docs/source/zh/reference/models.mdx b/docs/source/zh/reference/models.mdx new file mode 100644 index 000000000..79c9e72a4 --- /dev/null +++ b/docs/source/zh/reference/models.mdx @@ -0,0 +1,166 @@ + + +# 模型 + + + +Smolagents 是一个实验性 API,其可能会随时发生更改。由于 API 或底层模型可能会变化,智能体返回的结果可能会有所不同。 + + + +要了解有关智能体和工具的更多信息,请务必阅读[入门指南](../index)。此页面包含底层类的 API 文档。 + +## 模型 + +您可以自由创建和使用自己的模型为智能体提供支持。 + +您可以使用任何 `model` 可调用对象作为智能体的模型,只要满足以下条件: +1. 它遵循[消息格式](./chat_templating)(`List[Dict[str, str]]`),将其作为输入 `messages`,并返回一个 `str`。 +2. 它在生成的序列到达 `stop_sequences` 参数中指定的内容之前停止生成输出。 + +要定义您的 LLM,可以创建一个 `custom_model` 方法,该方法接受一个 [messages](./chat_templating) 列表,并返回一个包含 `.content` 属性的对象,其中包含生成的文本。此可调用对象还需要接受一个 `stop_sequences` 参数,用于指示何时停止生成。 + +```python +from huggingface_hub import login, InferenceClient + +login("") + +model_id = "meta-llama/Llama-3.3-70B-Instruct" + +client = InferenceClient(model=model_id) + +def custom_model(messages, stop_sequences=["Task"]): + response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000) + answer = response.choices[0].message + return answer +``` + +此外,`custom_model` 还可以接受一个 `grammar` 参数。如果在智能体初始化时指定了 `grammar`,则此参数将在调用模型时传递,以便进行[约束生成](https://huggingface.co/docs/text-generation-inference/conceptual/guidance),从而强制生成格式正确的智能体输出。 + +### TransformersModel + +为了方便起见,我们添加了一个 `TransformersModel`,该模型通过为初始化时指定的 `model_id` 构建一个本地 `transformers` pipeline 来实现上述功能。 + +```python +from smolagents import TransformersModel + +model = TransformersModel(model_id="HuggingFaceTB/SmolLM-135M-Instruct") + +print(model([{"role": "user", "content": [{"type": "text", "text": "Ok!"}]}], stop_sequences=["great"])) +``` +```text +>>> What a +``` + +> [!TIP] +> 您必须在机器上安装 `transformers` 和 `torch`。如果尚未安装,请运行 `pip install smolagents[transformers]`。 + +[[autodoc]] TransformersModel + +### HfApiModel + +`HfApiModel` 封装了 huggingface_hub 的 
[InferenceClient](https://huggingface.co/docs/huggingface_hub/main/en/guides/inference),用于执行 LLM。它支持 HF 的 [Inference API](https://huggingface.co/docs/api-inference/index) 以及 Hub 上所有可用的[Inference Providers](https://huggingface.co/blog/inference-providers)。 + +```python +from smolagents import HfApiModel + +messages = [ + {"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]} +] + +model = HfApiModel() +print(model(messages)) +``` +```text +>>> Of course! If you change your mind, feel free to reach out. Take care! +``` +[[autodoc]] HfApiModel + +### LiteLLMModel + +`LiteLLMModel` 利用 [LiteLLM](https://www.litellm.ai/) 支持来自不同提供商的 100+ 个 LLM。您可以在模型初始化时传递 `kwargs`,这些参数将在每次使用模型时被使用,例如下面的示例中传递了 `temperature`。 + +```python +from smolagents import LiteLLMModel + +messages = [ + {"role": "user", "content": [{"type": "text", "text": "Hello, how are you?"}]} +] + +model = LiteLLMModel("anthropic/claude-3-5-sonnet-latest", temperature=0.2, max_tokens=10) +print(model(messages)) +``` + +[[autodoc]] LiteLLMModel + +### OpenAIServerModel + +此类允许您调用任何 OpenAIServer 兼容模型。 +以下是设置方法(您可以自定义 `api_base` URL 指向其他服务器): +```py +import os +from smolagents import OpenAIServerModel + +model = OpenAIServerModel( + model_id="gpt-4o", + api_base="https://api.openai.com/v1", + api_key=os.environ["OPENAI_API_KEY"], +) +``` + +[[autodoc]] OpenAIServerModel + +### AzureOpenAIServerModel + +`AzureOpenAIServerModel` 允许您连接到任何 Azure OpenAI 部署。 + +下面是设置示例,请注意,如果已经设置了相应的环境变量,您可以省略 `azure_endpoint`、`api_key` 和 `api_version` 参数——环境变量包括 `AZURE_OPENAI_ENDPOINT`、`AZURE_OPENAI_API_KEY` 和 `OPENAI_API_VERSION`。 + +请注意,`OPENAI_API_VERSION` 没有 `AZURE_` 前缀,这是由于底层 [openai](https://github.com/openai/openai-python) 包的设计所致。 + +```py +import os + +from smolagents import AzureOpenAIServerModel + +model = AzureOpenAIServerModel( + model_id = os.environ.get("AZURE_OPENAI_MODEL"), + azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"), + api_key=os.environ.get("AZURE_OPENAI_API_KEY"), + 
api_version=os.environ.get("OPENAI_API_VERSION") +) +``` + +[[autodoc]] AzureOpenAIServerModel + +### MLXModel + +```python +from smolagents import MLXModel + +model = MLXModel(model_id="HuggingFaceTB/SmolLM-135M-Instruct") + +print(model([{"role": "user", "content": "Ok!"}], stop_sequences=["great"])) +``` +```text +>>> What a +``` + +> [!TIP] +> 您必须在机器上安装 `mlx-lm`。如果尚未安装,请运行 `pip install smolagents[mlx-lm]`。 + +[[autodoc]] MLXModel diff --git a/docs/source/zh/reference/tools.md b/docs/source/zh/reference/tools.md deleted file mode 100644 index 022ad35d2..000000000 --- a/docs/source/zh/reference/tools.md +++ /dev/null @@ -1,91 +0,0 @@ - -# Tools - - - -Smolagents is an experimental API which is subject to change at any time. Results returned by the agents -can vary as the APIs or underlying models are prone to change. - - - -To learn more about agents and tools make sure to read the [introductory guide](../index). This page -contains the API docs for the underlying classes. - -## Tools - -### load_tool - -[[autodoc]] load_tool - -### tool - -[[autodoc]] tool - -### Tool - -[[autodoc]] Tool - -### launch_gradio_demo - -[[autodoc]] launch_gradio_demo - -## Default tools - -### PythonInterpreterTool - -[[autodoc]] PythonInterpreterTool - -### DuckDuckGoSearchTool - -[[autodoc]] DuckDuckGoSearchTool - -### VisitWebpageTool - -[[autodoc]] VisitWebpageTool - -## ToolCollection - -[[autodoc]] ToolCollection - -## Agent Types - -Agents can handle any type of object in-between tools; tools, being completely multimodal, can accept and return -text, image, audio, video, among other types. In order to increase compatibility between tools, as well as to -correctly render these returns in ipython (jupyter, colab, ipython notebooks, ...), we implement wrapper classes -around these types. - -The wrapped objects should continue behaving as initially; a text object should still behave as a string, an image -object should still behave as a `PIL.Image`. 
- -These types have three specific purposes: - -- Calling `to_raw` on the type should return the underlying object -- Calling `to_string` on the type should return the object as a string: that can be the string in case of an `AgentText` - but will be the path of the serialized version of the object in other instances -- Displaying it in an ipython kernel should display the object correctly - -### AgentText - -[[autodoc]] smolagents.types.AgentText - -### AgentImage - -[[autodoc]] smolagents.types.AgentImage - -### AgentAudio - -[[autodoc]] smolagents.types.AgentAudio diff --git a/docs/source/zh/reference/tools.mdx b/docs/source/zh/reference/tools.mdx new file mode 100644 index 000000000..86f19dca4 --- /dev/null +++ b/docs/source/zh/reference/tools.mdx @@ -0,0 +1,101 @@ + + +# 工具 + + + +Smolagents 是一个实验性 API,可能会随时更改。由于 API 或底层模型可能发生变化,代理返回的结果可能会有所不同。 + + + +要了解更多关于智能体和工具的信息,请务必阅读[入门指南](../index)。本页面包含底层类的 API 文档。 + +## 工具 + +### load_tool + +[[autodoc]] load_tool + +### tool + +[[autodoc]] tool + +### Tool + +[[autodoc]] Tool + +### launch_gradio_demo + +[[autodoc]] launch_gradio_demo + +## 默认工具 + +### PythonInterpreterTool + +[[autodoc]] PythonInterpreterTool + +### FinalAnswerTool + +[[autodoc]] FinalAnswerTool + +### UserInputTool + +[[autodoc]] UserInputTool + +### DuckDuckGoSearchTool + +[[autodoc]] DuckDuckGoSearchTool + +### GoogleSearchTool + +[[autodoc]] GoogleSearchTool + +### VisitWebpageTool + +[[autodoc]] VisitWebpageTool + +### SpeechToTextTool + +[[autodoc]] SpeechToTextTool + +## 工具集合 + +[[autodoc]] ToolCollection + +## 智能体类型 + +智能体可以处理工具之间的任何类型的对象;工具是完全多模态的,可以接受和返回文本、图像、音频、视频以及其他类型的对象。为了增加工具之间的兼容性,以及正确呈现在 ipython(jupyter、colab、ipython notebooks 等)中的返回结果,我们为这些类型实现了包装类。 + +被包装的对象应该继续保持其初始行为;例如,一个文本对象应继续表现为字符串,一个图像对象应继续表现为 `PIL.Image`。 + +这些类型有三个特定的用途: + +- 调用 `to_raw` 方法时,应返回底层对象 +- 调用 `to_string` 方法时,应将对象转换为字符串:对于 `AgentText` 类型,可以直接返回字符串;对于其他实例,则返回对象序列化版本的路径 +- 在 ipython 内核中显示时,应正确显示对象 + +### AgentText + +[[autodoc]] smolagents.agent_types.AgentText 
+ +### AgentImage + +[[autodoc]] smolagents.agent_types.AgentImage + +### AgentAudio + +[[autodoc]] smolagents.agent_types.AgentAudio diff --git a/docs/source/zh/tutorials/building_good_agents.md b/docs/source/zh/tutorials/building_good_agents.mdx similarity index 98% rename from docs/source/zh/tutorials/building_good_agents.md rename to docs/source/zh/tutorials/building_good_agents.mdx index 47cd202a0..fbf489fae 100644 --- a/docs/source/zh/tutorials/building_good_agents.md +++ b/docs/source/zh/tutorials/building_good_agents.mdx @@ -193,7 +193,7 @@ Final answer: 让我们看看它是如何工作的。例如,让我们检查 [`CodeAgent`] 的默认系统提示(下面的版本通过跳过零样本示例进行了缩短)。 ```python -print(agent.system_prompt_template) +print(agent.prompt_templates["system_prompt"]) ``` 你会得到: ```text @@ -242,15 +242,7 @@ Now Begin! If you solve the task correctly, you will receive a reward of $1,000, 然后你可以根据如下,更改系统提示: ```py -from smolagents.prompts import CODE_SYSTEM_PROMPT - -modified_system_prompt = CODE_SYSTEM_PROMPT + "\nHere you go!" # 在此更改系统提示 - -agent = CodeAgent( - tools=[], - model=HfApiModel(), - system_prompt=modified_system_prompt -) +agent.prompt_templates["system_prompt"] = agent.prompt_templates["system_prompt"] + "\nHere you go!" 
``` 这也适用于 [`ToolCallingAgent`]。 diff --git a/docs/source/zh/tutorials/secure_code_execution.md b/docs/source/zh/tutorials/secure_code_execution.mdx similarity index 100% rename from docs/source/zh/tutorials/secure_code_execution.md rename to docs/source/zh/tutorials/secure_code_execution.mdx diff --git a/docs/source/zh/tutorials/tools.md b/docs/source/zh/tutorials/tools.mdx similarity index 100% rename from docs/source/zh/tutorials/tools.md rename to docs/source/zh/tutorials/tools.mdx diff --git a/examples/agent_from_any_llm.py b/examples/agent_from_any_llm.py index eff667f33..86f45effb 100644 --- a/examples/agent_from_any_llm.py +++ b/examples/agent_from_any_llm.py @@ -9,7 +9,7 @@ available_inferences = ["hf_api", "transformers", "ollama", "litellm"] chosen_inference = "transformers" -print(f"Chose model {chosen_inference}") +print(f"Chose model: '{chosen_inference}'") if chosen_inference == "hf_api": model = HfApiModel(model_id="meta-llama/Llama-3.3-70B-Instruct") @@ -22,6 +22,7 @@ model_id="ollama_chat/llama3.2", api_base="http://localhost:11434", # replace with remote open-ai compatible server if necessary api_key="your-api-key", # replace with API key if necessary + num_ctx=8192, # ollama default is 2048 which will often fail horribly. 8192 works for easy tasks, more is better. Check https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator to calculate how much VRAM this will need for the selected model. ) elif chosen_inference == "litellm": @@ -48,4 +49,4 @@ def get_weather(location: str, celsius: Optional[bool] = False) -> str: agent = CodeAgent(tools=[get_weather], model=model) -print("ToolCallingAgent:", agent.run("What's the weather like in Paris?")) +print("CodeAgent:", agent.run("What's the weather like in Paris?")) diff --git a/examples/benchmark.ipynb b/examples/benchmark.ipynb index 065adcecd..79f0ae0a1 100644 --- a/examples/benchmark.ipynb +++ b/examples/benchmark.ipynb @@ -16,190 +16,44 @@ } ], "source": [ - "!pip install -e .. 
datasets sympy numpy matplotlib seaborn -q # Install dev version of smolagents + some packages" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/aymeric/venv/test/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionsourcetrue_answertrue_reasoning
0If Eliud Kipchoge could maintain his record-ma...GAIA17None
1How many studio albums were published by Merce...GAIA3None
2Here's a fun riddle that I think you'll enjoy....GAIA3None
3My family reunion is this week, and I was assi...GAIA2None
4In Emily Midkiff's June 2014 article in a jour...GAIAfluffyNone
...............
127What year was the municipality of San Carlos, ...SimpleQA1786['https://en.wikipedia.org/wiki/San_Carlos,_An...
128In which year was Maria Elena Walsh named Illu...SimpleQA1985['https://en.wikipedia.org/wiki/Mar%C3%ADa_Ele...
129What is the durability of the Istarelle spear ...SimpleQA800['http://demonssouls.wikidot.com/spear', 'http...
130What is the number of the executive order that...SimpleQA7034['https://www.loc.gov/collections/federal-thea...
131Within plus or minus one minute, when was Marq...SimpleQA77['https://www.fifa.com/fifaplus/en/match-centr...
\n", - "

132 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " question source true_answer \\\n", - "0 If Eliud Kipchoge could maintain his record-ma... GAIA 17 \n", - "1 How many studio albums were published by Merce... GAIA 3 \n", - "2 Here's a fun riddle that I think you'll enjoy.... GAIA 3 \n", - "3 My family reunion is this week, and I was assi... GAIA 2 \n", - "4 In Emily Midkiff's June 2014 article in a jour... GAIA fluffy \n", - ".. ... ... ... \n", - "127 What year was the municipality of San Carlos, ... SimpleQA 1786 \n", - "128 In which year was Maria Elena Walsh named Illu... SimpleQA 1985 \n", - "129 What is the durability of the Istarelle spear ... SimpleQA 800 \n", - "130 What is the number of the executive order that... SimpleQA 7034 \n", - "131 Within plus or minus one minute, when was Marq... SimpleQA 77 \n", - "\n", - " true_reasoning \n", - "0 None \n", - "1 None \n", - "2 None \n", - "3 None \n", - "4 None \n", - ".. ... \n", - "127 ['https://en.wikipedia.org/wiki/San_Carlos,_An... \n", - "128 ['https://en.wikipedia.org/wiki/Mar%C3%ADa_Ele... \n", - "129 ['http://demonssouls.wikidot.com/spear', 'http... \n", - "130 ['https://www.loc.gov/collections/federal-thea... \n", - "131 ['https://www.fifa.com/fifaplus/en/match-centr... \n", - "\n", - "[132 rows x 4 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import datasets\n", - "import pandas as pd\n", - "\n", - "\n", - "eval_ds = datasets.load_dataset(\"m-ric/smol_agents_benchmark\")[\"test\"]\n", - "pd.DataFrame(eval_ds)" + "!pip install -e .. datasets sympy numpy matplotlib seaborn -q # Install dev version of smolagents + some packages" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Define utilities and tools\n", - "To run the SERPAPI tool, you will need to have a [SerpAPI](https://serpapi.com/dashboard) API key: for this you need a paid account." 
+ "## Constants and utilities/tools" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "# Benchmark date\n", + "# - set a concrete date:\n", + "DATE = \"2024-12-26\"\n", + "# - or use default: today\n", + "# DATE = None\n", + "\n", + "# Evaluation dataset\n", + "# - the dataset is gated, so you must first visit its page to request access: https://huggingface.co/datasets/smolagents-benchmark/benchmark-v1\n", + "EVAL_DATASET = \"smolagents-benchmark/benchmark-v1\"\n", + "\n", + "# Answers dataset: it must be a gated dataset; required to score the answers\n", + "ANSWERS_DATASET = \"smolagents-benchmark/answers\"\n", + "# Whether to push the answers dataset to the Hub\n", + "PUSH_ANSWERS_DATASET_TO_HUB = True\n", + "\n", + "# Results dataset\n", + "RESULTS_DATASET = \"smolagents-benchmark/results\"\n", + "# Whether to push the results dataset to the Hub\n", + "PUSH_RESULTS_DATASET_TO_HUB = True\n", + "\n", + "\n", + "import datetime\n", "import json\n", "import os\n", "import re\n", @@ -208,6 +62,7 @@ "import warnings\n", "from typing import List\n", "\n", + "import datasets\n", "from dotenv import load_dotenv\n", "from tqdm import tqdm\n", "\n", @@ -234,60 +89,85 @@ " return str(obj)\n", "\n", "\n", - "def answer_questions(eval_ds, file_name, agent, model_id, action_type, is_vanilla_llm=False):\n", - " answered_questions = []\n", - " if os.path.exists(file_name):\n", - " with open(file_name, \"r\") as f:\n", - " for line in f:\n", - " answered_questions.append(json.loads(line)[\"question\"])\n", - "\n", - " for _, example in tqdm(enumerate(eval_ds), total=len(eval_ds)):\n", - " try:\n", - " question = example[\"question\"]\n", - " if example[\"source\"] == \"SimpleQA\":\n", - " question += \" Answer with only the final number.\"\n", - " if example[\"source\"] == \"MATH\":\n", - " question += \" Write code, not latex.\"\n", - " if question in answered_questions:\n", - " continue\n", - " 
start_time = time.time()\n", - "\n", - " if is_vanilla_llm:\n", - " llm = agent\n", - " answer = str(llm([{\"role\": \"user\", \"content\": question}]).content)\n", - " token_count = {\n", - " \"input\": llm.last_input_token_count,\n", - " \"output\": llm.last_output_token_count,\n", + "def answer_questions(\n", + " eval_ds,\n", + " agent,\n", + " model_id,\n", + " action_type,\n", + " is_vanilla_llm=False,\n", + " date=DATE,\n", + " output_dir=\"output\",\n", + " push_to_hub_dataset=ANSWERS_DATASET if PUSH_ANSWERS_DATASET_TO_HUB else None,\n", + "):\n", + " date = date or datetime.date.today().isoformat()\n", + "\n", + " for task in eval_ds:\n", + " file_name = f\"output/{model_id.replace('/', '__')}__{action_type}__{task}__{date}.jsonl\"\n", + " answered_questions = []\n", + " if os.path.exists(file_name):\n", + " with open(file_name, \"r\") as f:\n", + " for line in f:\n", + " answered_questions.append(json.loads(line)[\"question\"])\n", + "\n", + " for _, example in tqdm(enumerate(eval_ds[task]), total=len(eval_ds[task])):\n", + " try:\n", + " question = example[\"question\"]\n", + " if example[\"source\"] == \"SimpleQA\":\n", + " question += \" Answer with only the final number.\"\n", + " if example[\"source\"] == \"MATH\":\n", + " question += \" Write code, not latex.\"\n", + " if question in answered_questions:\n", + " continue\n", + " start_time = time.time()\n", + "\n", + " if is_vanilla_llm:\n", + " llm = agent\n", + " answer = str(llm([{\"role\": \"user\", \"content\": question}]).content)\n", + " token_count = {\n", + " \"input\": llm.last_input_token_count,\n", + " \"output\": llm.last_output_token_count,\n", + " }\n", + " intermediate_steps = str([])\n", + " else:\n", + " answer = str(agent.run(question))\n", + " token_count = agent.monitor.get_total_token_counts()\n", + " intermediate_steps = str(agent.logs)\n", + " # Remove memory from logs to make them more compact.\n", + " for step in agent.logs:\n", + " if isinstance(step, ActionStep):\n", + " 
step.agent_memory = None\n", + "\n", + " end_time = time.time()\n", + " annotated_example = {\n", + " \"model_id\": model_id,\n", + " \"agent_action_type\": action_type,\n", + " \"question\": question,\n", + " \"answer\": answer,\n", + " \"true_answer\": example[\"true_answer\"],\n", + " \"source\": example[\"source\"],\n", + " \"intermediate_steps\": intermediate_steps,\n", + " \"start_time\": start_time,\n", + " \"end_time\": end_time,\n", + " \"token_counts\": token_count,\n", " }\n", - " intermediate_steps = str([])\n", - " else:\n", - " answer = str(agent.run(question))\n", - " token_count = agent.monitor.get_total_token_counts()\n", - " intermediate_steps = str(agent.logs)\n", - " # Remove memory from logs to make them more compact.\n", - " for step in agent.logs:\n", - " if isinstance(step, ActionStep):\n", - " step.agent_memory = None\n", - "\n", - " end_time = time.time()\n", - " annotated_example = {\n", - " \"model_id\": model_id,\n", - " \"agent_action_type\": action_type,\n", - " \"question\": question,\n", - " \"answer\": answer,\n", - " \"true_answer\": example[\"true_answer\"],\n", - " \"source\": example[\"source\"],\n", - " \"intermediate_steps\": intermediate_steps,\n", - " \"start_time\": start_time,\n", - " \"end_time\": end_time,\n", - " \"token_counts\": token_count,\n", - " }\n", - "\n", - " with open(file_name, \"a\") as f:\n", - " json.dump(annotated_example, f, default=serialize_agent_error)\n", - " f.write(\"\\n\") # add a newline for JSONL format\n", - " except Exception as e:\n", - " print(\"Failed:\", e)\n", + "\n", + " with open(file_name, \"a\") as f:\n", + " json.dump(annotated_example, f, default=serialize_agent_error)\n", + " f.write(\"\\n\") # add a newline for JSONL format\n", + " except Exception as e:\n", + " print(\"Failed:\", e)\n", + "\n", + " if push_to_hub_dataset:\n", + " ds = datasets.Dataset.from_pandas(pd.read_json(file_name, lines=True), split=\"test\", preserve_index=False)\n", + " config = 
f\"{model_id.replace('/', '__')}__{action_type}__{task}\"\n", + " data_dir = f\"{model_id}/{action_type}/{task}/{date}\"\n", + " ds.push_to_hub(\n", + " push_to_hub_dataset,\n", + " config_name=config,\n", + " data_dir=data_dir,\n", + " split=\"test\",\n", + " commit_message=f\"Upload {config}\",\n", + " )\n", "\n", "\n", "def normalize_number_str(number_str: str) -> float:\n", @@ -382,7 +262,172 @@ " return all(comparisons)\n", "\n", " else: # if gt is a str\n", - " return normalize_str(model_answer) == normalize_str(ground_truth)" + " return normalize_str(model_answer) == normalize_str(ground_truth)\n", + "\n", + "\n", + "def get_correct(row):\n", + " if row[\"source\"] == \"MATH\": # Checks the last number in answer\n", + " numbers_answer = extract_numbers(str(row[\"answer\"]))\n", + " if len(numbers_answer) == 0:\n", + " return False\n", + " return float(numbers_answer[-1]) == float(row[\"true_answer\"])\n", + " else:\n", + " return get_question_score_gaia(str(row[\"answer\"]), str(row[\"true_answer\"]))\n", + "\n", + "\n", + "def score_answers(\n", + " answers_subsets,\n", + " answers_dataset=ANSWERS_DATASET,\n", + " date=DATE,\n", + " push_to_hub_dataset=RESULTS_DATASET if PUSH_RESULTS_DATASET_TO_HUB else None,\n", + " set_default=True,\n", + "):\n", + " if not answers_dataset:\n", + " raise ValueError(\"Pass 'answers_dataset' to load the answers from it\")\n", + " date = date or datetime.date.today().isoformat()\n", + " results = []\n", + " for answers_subset in answers_subsets:\n", + " *model_id, action_type, task = answers_subset.split(\"__\")\n", + " model_id = \"/\".join(model_id)\n", + " ds = datasets.load_dataset(answers_dataset, answers_subset, split=\"test\")\n", + " df = ds.to_pandas()\n", + " df[\"correct\"] = df.apply(get_correct, axis=1)\n", + " acc = df[\"correct\"].mean().item()\n", + " result = df.loc[0, [\"model_id\", \"agent_action_type\", \"source\"]].to_dict()\n", + " result[\"acc\"] = acc\n", + " results.append(result)\n", + " df = 
pd.DataFrame(results)\n", + "\n", + " if push_to_hub_dataset:\n", + " ds = datasets.Dataset.from_pandas(df)\n", + " config = date\n", + " set_default = set_default\n", + " ds.push_to_hub(\n", + " push_to_hub_dataset, config_name=config, set_default=set_default, commit_message=f\"Upload {config} results\"\n", + " )\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['gaia', 'math', 'simpleqa']\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionsourcetrue_answertrue_reasoning
0What year was the municipality of Ramiriquí, B...SimpleQA1541['https://en.wikipedia.org/wiki/Ramiriqu%C3%AD...
1In what year did Hjalmar Hvam invent a mechani...SimpleQA1937['https://www.kgw.com/article/features/portlan...
2In which year did Fayaz A. Malik (an Indian ph...SimpleQA2009['https://en.wikipedia.org/wiki/Fayaz_A._Malik...
3In which year was John B. Goodenough elected a...SimpleQA2010['https://en.wikipedia.org/wiki/John_B._Gooden...
4In which year did Atul Gawande earn an M.A. in...SimpleQA1989['https://en.wikipedia.org/wiki/Atul_Gawande',...
\n", + "
" + ], + "text/plain": [ + " question source true_answer \\\n", + "0 What year was the municipality of Ramiriquí, B... SimpleQA 1541 \n", + "1 In what year did Hjalmar Hvam invent a mechani... SimpleQA 1937 \n", + "2 In which year did Fayaz A. Malik (an Indian ph... SimpleQA 2009 \n", + "3 In which year was John B. Goodenough elected a... SimpleQA 2010 \n", + "4 In which year did Atul Gawande earn an M.A. in... SimpleQA 1989 \n", + "\n", + " true_reasoning \n", + "0 ['https://en.wikipedia.org/wiki/Ramiriqu%C3%AD... \n", + "1 ['https://www.kgw.com/article/features/portlan... \n", + "2 ['https://en.wikipedia.org/wiki/Fayaz_A._Malik... \n", + "3 ['https://en.wikipedia.org/wiki/John_B._Gooden... \n", + "4 ['https://en.wikipedia.org/wiki/Atul_Gawande',... " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "\n", + "# Choose the tasks to evaluate on:\n", + "# tasks = [\"gaia\"]\n", + "# or evaluate on all tasks: [\"gaia\", \"math\", \"simpleqa\"]\n", + "tasks = datasets.get_dataset_config_names(EVAL_DATASET)\n", + "print(tasks)\n", + "\n", + "\n", + "eval_ds = {task: datasets.load_dataset(EVAL_DATASET, task, split=\"test\") for task in tasks}\n", + "pd.DataFrame(eval_ds[\"simpleqa\"]).head()" ] }, { @@ -412,16 +457,16 @@ " # \"meta-llama/Llama-3.1-70B-Instruct\",\n", "]\n", "\n", + "\n", "for model_id in open_model_ids:\n", " print(f\"Evaluating '{model_id}'...\")\n", - " # action_type = \"tool_calling\"\n", + " # action_type = \"tool-calling\"\n", " # agent = ToolCallingAgent(\n", " # tools=[GoogleSearchTool(), VisitWebpageTool(), PythonInterpreterTool()],\n", " # model=HfApiModel(model_id),\n", " # max_steps=10,\n", " # )\n", - " # file_name = f\"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl\"\n", - " # answer_questions(eval_ds, file_name, agent, model_id, action_type)\n", + " # answer_questions(eval_ds, agent, model_id, action_type)\n", "\n", " 
action_type = \"code\"\n", " agent = CodeAgent(\n", @@ -430,21 +475,19 @@ " additional_authorized_imports=[\"numpy\", \"sympy\"],\n", " max_steps=10,\n", " )\n", - " file_name = f\"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl\"\n", - " answer_questions(eval_ds, file_name, agent, model_id, action_type)\n", + " answer_questions(eval_ds, agent, model_id, action_type)\n", "\n", " # Also evaluate vanilla model\n", " action_type = \"vanilla\"\n", " llm = HfApiModel(model_id)\n", - " file_name = f\"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl\"\n", - " answer_questions(eval_ds, file_name, llm, model_id, action_type, is_vanilla_llm=True)" + " answer_questions(eval_ds, llm, model_id, action_type, is_vanilla_llm=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Closed models" + "### Closed models" ] }, { @@ -458,9 +501,10 @@ "\n", "litellm_model_ids = [\"gpt-4o\", \"anthropic/claude-3-5-sonnet-latest\"]\n", "\n", + "\n", "for model_id in litellm_model_ids:\n", " print(f\"Evaluating '{model_id}'...\")\n", - " action_type = \"tool_calling\"\n", + " action_type = \"tool-calling\"\n", " agent = ToolCallingAgent(\n", " tools=[\n", " GoogleSearchTool(),\n", @@ -470,8 +514,7 @@ " model=LiteLLMModel(model_id),\n", " max_steps=10,\n", " )\n", - " file_name = f\"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl\"\n", - " answer_questions(eval_ds, file_name, agent, model_id, action_type)\n", + " answer_questions(eval_ds, agent, model_id, action_type)\n", "\n", " action_type = \"code\"\n", " agent = CodeAgent(\n", @@ -480,14 +523,12 @@ " additional_authorized_imports=[\"numpy\", \"sympy\"],\n", " max_steps=10,\n", " )\n", - " file_name = f\"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl\"\n", - " answer_questions(eval_ds, file_name, agent, model_id, action_type)\n", + " answer_questions(eval_ds, agent, model_id, action_type)\n", "\n", " # Also evaluate vanilla model\n", " 
action_type = \"vanilla\"\n", " llm = LiteLLMModel(model_id)\n", - " file_name = f\"output/{model_id.replace('/', '_')}-{action_type}-26-dec-2024.jsonl\"\n", - " answer_questions(eval_ds, file_name, llm, model_id, action_type, is_vanilla_llm=True)" + " answer_questions(eval_ds, llm, model_id, action_type, is_vanilla_llm=True)" ] }, { @@ -539,58 +580,153 @@ "execution_count": 9, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of answers_subsets 54\n", + "Example of answers_subset Qwen__Qwen2.5-72B-Instruct__code__gaia\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_27085/1901135017.py:164: UserWarning: Answer lists have different lengths, returning False.\n", - " warnings.warn(\n" + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " 
warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + 
"/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n", + "/tmp/ipykernel_640885/2542893079.py:194: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
model_idagent_action_typesourceacc
0Qwen/Qwen2.5-72B-InstructcodeGAIA28.12
1Qwen/Qwen2.5-72B-InstructcodeMATH76.00
2Qwen/Qwen2.5-72B-InstructcodeSimpleQA88.00
3Qwen/Qwen2.5-72B-InstructvanillaGAIA6.25
4Qwen/Qwen2.5-72B-InstructvanillaMATH30.00
\n", + "
" + ], + "text/plain": [ + " model_id agent_action_type source acc\n", + "0 Qwen/Qwen2.5-72B-Instruct code GAIA 28.12\n", + "1 Qwen/Qwen2.5-72B-Instruct code MATH 76.00\n", + "2 Qwen/Qwen2.5-72B-Instruct code SimpleQA 88.00\n", + "3 Qwen/Qwen2.5-72B-Instruct vanilla GAIA 6.25\n", + "4 Qwen/Qwen2.5-72B-Instruct vanilla MATH 30.00" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "import glob\n", - "\n", + "import datasets\n", "import pandas as pd\n", "\n", "\n", - "res = []\n", - "for file_path in glob.glob(\"output/*.jsonl\"):\n", - " data = []\n", - " with open(file_path) as f:\n", - " for line in f:\n", - " try:\n", - " # Use standard json module instead of pandas.json to handle large numbers better\n", - " record = json.loads(line)\n", - " data.append(record)\n", - " except json.JSONDecodeError as e:\n", - " print(f\"Error parsing line in {file_path}: {e}\")\n", - " continue\n", - "\n", - " try:\n", - " smoldf = pd.DataFrame(data)\n", - " smoldf[\"action_type\"] = \"vanilla\" if \"-vanilla-\" in file_path else \"code\"\n", - " res.append(smoldf)\n", - " except Exception as e:\n", - " print(f\"Error creating DataFrame from {file_path}: {e}\")\n", - " continue\n", - "\n", - "result_df = pd.concat(res)\n", - "\n", - "\n", - "def get_correct(row):\n", - " if row[\"source\"] == \"MATH\": # Checks the last number in answer\n", - " numbers_answer = extract_numbers(str(row[\"answer\"]))\n", - " if len(numbers_answer) == 0:\n", - " return False\n", - " return float(numbers_answer[-1]) == float(row[\"true_answer\"])\n", - " else:\n", - " return get_question_score_gaia(str(row[\"answer\"]), str(row[\"true_answer\"]))\n", - "\n", + "# Choose the answers subsets to score:\n", + "# answers_subsets = [\"meta-llama__Llama-3.1-8B-Instruct__code__gaia\"]\n", + "# or get all the answers subsets present in the ANSWERS_DATASET\n", + "answers_subsets = datasets.get_dataset_config_names(ANSWERS_DATASET)\n", + "print(\"Number of 
answers_subsets\", len(answers_subsets))\n", + "print(\"Example of answers_subset\", answers_subsets[0])\n", "\n", - "result_df[\"correct\"] = result_df.apply(get_correct, axis=1)\n", "\n", - "result_df = (result_df.groupby([\"model_id\", \"source\", \"action_type\"])[[\"correct\"]].mean() * 100).round(1).reset_index()" + "result_df = score_answers(answers_subsets)\n", + "result_df[\"acc\"] = (result_df[\"acc\"] * 100).round(2)\n", + "result_df.head()" ] }, { diff --git a/examples/e2b_example.py b/examples/e2b_example.py index a58c7b169..18354a372 100644 --- a/examples/e2b_example.py +++ b/examples/e2b_example.py @@ -42,7 +42,7 @@ def forward(self): ) agent.run( - "Return me an image of a cat. Directly use the image provided in your state.", + "Calculate how much is 2+2, then return me an image of a cat. Directly use the image provided in your state.", additional_args={"cat_image": get_cat_image()}, ) # Asking to directly return the image from state tests that additional_args are properly sent to server. diff --git a/examples/inspect_multiagent_run.py b/examples/inspect_multiagent_run.py new file mode 100644 index 000000000..8c1c98d46 --- /dev/null +++ b/examples/inspect_multiagent_run.py @@ -0,0 +1,33 @@ +from openinference.instrumentation.smolagents import SmolagentsInstrumentor +from phoenix.otel import register + + +register() +SmolagentsInstrumentor().instrument(skip_dep_check=True) + + +from smolagents import ( + CodeAgent, + DuckDuckGoSearchTool, + HfApiModel, + ToolCallingAgent, + VisitWebpageTool, +) + + +# Then we run the agentic part! 
+model = HfApiModel() + +search_agent = ToolCallingAgent( + tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], + model=model, + name="search_agent", + description="This is an agent that can do web search.", +) + +manager_agent = CodeAgent( + tools=[], + model=model, + managed_agents=[search_agent], +) +manager_agent.run("If the US keeps it 2024 growth rate, how many years would it take for the GDP to double?") diff --git a/examples/inspect_runs.py b/examples/inspect_runs.py deleted file mode 100644 index 9322f0bac..000000000 --- a/examples/inspect_runs.py +++ /dev/null @@ -1,40 +0,0 @@ -from openinference.instrumentation.smolagents import SmolagentsInstrumentor -from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import SimpleSpanProcessor - -from smolagents import ( - CodeAgent, - DuckDuckGoSearchTool, - HfApiModel, - ManagedAgent, - ToolCallingAgent, - VisitWebpageTool, -) - - -# Let's setup the instrumentation first - -trace_provider = TracerProvider() -trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter("http://0.0.0.0:6006/v1/traces"))) - -SmolagentsInstrumentor().instrument(tracer_provider=trace_provider, skip_dep_check=True) - -# Then we run the agentic part! 
-model = HfApiModel() - -agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], - model=model, -) -managed_agent = ManagedAgent( - agent=agent, - name="managed_agent", - description="This is an agent that can do web search.", -) -manager_agent = CodeAgent( - tools=[], - model=model, - managed_agents=[managed_agent], -) -manager_agent.run("If the US keeps it 2024 growth rate, how many years would it take for the GDP to double?") diff --git a/examples/open_deep_research/README.md b/examples/open_deep_research/README.md new file mode 100644 index 000000000..915bfc894 --- /dev/null +++ b/examples/open_deep_research/README.md @@ -0,0 +1,22 @@ +# Open Deep Research + +Welcome to this open replication of [OpenAI's Deep Research](https://openai.com/index/introducing-deep-research/)! + +Read more about this implementation's goal and methods [in our blog post](https://huggingface.co/blog/open-deep-research). + +This agent achieves 55% pass@1 on GAIA validation set, vs 67% for Deep Research. + +To install it, first run +```bash +pip install -r requirements.txt +``` + +And install smolagents dev version +```bash +pip install smolagents[dev] +``` + +Then you're good to go! Run the run.py script, as in: +```bash +python run.py --model-id "o1" "Your question here!" +``` diff --git a/examples/open_deep_research/analysis.ipynb b/examples/open_deep_research/analysis.ipynb new file mode 100644 index 000000000..04f315fdd --- /dev/null +++ b/examples/open_deep_research/analysis.ipynb @@ -0,0 +1,12151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install plotly kaleido datasets nbformat -U -q" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/aymeric/venv/gaia/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. 
Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "import datasets\n", + "import pandas as pd\n", + "from dotenv import load_dotenv\n", + "from huggingface_hub import login\n", + "\n", + "\n", + "load_dotenv(override=True)\n", + "login(os.getenv(\"HF_TOKEN\"))\n", + "\n", + "pd.set_option(\"max_colwidth\", None)\n", + "\n", + "OUTPUT_DIR = \"output\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "eval_ds = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")[\"validation\"]\n", + "eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})\n", + "eval_df = pd.DataFrame(eval_ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2 86\n", + "1 53\n", + "3 26\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.Series(eval_ds[\"task\"]).value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. 
Load all results" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "\n", + "\n", + "results = []\n", + "for f in glob.glob(f\"{OUTPUT_DIR}/validation/*.jsonl\"):\n", + " df = pd.read_json(f, lines=True)\n", + " df[\"agent_name\"] = f.split(\"/\")[-1].split(\".\")[0]\n", + " results.append(df)\n", + "\n", + "result_df = pd.concat(results)\n", + "result_df = result_df.drop(columns=[\"start_time\", \"end_time\"])\n", + "result_df[\"prediction\"] = result_df[\"prediction\"].fillna(\"No prediction\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "String cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 2 High fantasy A Song of Ice and Fire cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String 94 CFM for Cheater cannot be normalized to number str.\n", + "String 93 CFM for Cheater beater cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No 
prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n", + "String 
Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 3 or 4 cannot be normalized to number str.\n", + "String No year cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 250 for Cheater cannot be normalized to number str.\n", + "String 220 for Cheater beater cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String 776 ft/min for Cheater cannot be normalized to number str.\n", + "String 768 ft/min for Cheater beater cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String CFM number for Cheater: not listed cannot be normalized to number str.\n", + "String CFM number for Cheater beater: 665 ft/min cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String Unable to 
determine cannot be normalized to number str.\n", + "String 1.46 Å cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + 
"String Unable to determine cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String August 1: 0 August 2: 0 August 3: 0 August 4: 0 August 5: 0 August 6: 0 August 7: 0 August 8: 0 August 9: 0 August 10: 0 August 11: 0 August 12: 0 August 13: 0 August 14: 0 August 15: 0 August 16: 0 August 17: 0 August 18: 0 August 19: 0 August 20: 0 August 21: 0 August 22: 0 August 23: 0 August 24: 0 August 25: 0 August 26: 0 August 27: 0 August 28: 0 August 29: 0 August 30: 0 August 31: 0 cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String 120 for Cheater cannot be normalized to number str.\n", + "String 103 for Cheater beater cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 120.28 for Cheater cannot be normalized to number str.\n", + "String 119.04 for Cheater beater cannot be normalized to number str.\n", + "String 3 or 4 cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive sold for 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 2730-2740 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 89706.00 USD cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number 
str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String No prediction cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 6 The Lord of the Rings (book) J. R. R. Tolkien Author American literature Fantasy literature Publishers A Song of Ice and Fire cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 1.46 Å cannot be normalized to number str.\n", + "String cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 94.5 for Cheater cannot be normalized to number str.\n", + "String 93.5 for Cheater beater cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + 
"String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 776 for Cheater cannot be normalized to number str.\n", + "String Not specified for Cheater Beater cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine 
cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 5.75 for Cheater cannot be normalized to number str.\n", + "String 5.22 for Cheater Beater cannot be normalized to number str.\n", + "String 2017 Komo Mai Drive sold for 900000 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String 33101 28557 cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "String Unable to determine cannot be normalized to number str.\n", + "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n", + "Close call: Rockhopper Penguins vs Rockhopper penguin\n", + "Close call: INT. 
THE CASTLE vs THE CASTLE\n", + "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n", + "Close call: The World of the Twenty First Century 1994 vs The World of the Twenty First Century\n", + "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n", + "Close call: Wes Craven's A Nightmare on Elm Street vs A Nightmare on Elm Street\n", + "Close call: God said let there be dragons vs Here be dragons\n", + "Close call: rockhopper penguins vs Rockhopper penguin\n", + "Close call: Harbinger, This Fire, Tidal vs Harbinger, Tidal\n", + "Close call: EC 3.1.3.1;EC 1.11.1.7 vs 3.1.3.1; 1.11.1.7\n", + "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n", + "Close call: Alfonso Cardinal Visconti vs Alfonso Visconti\n", + "Close call: to be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n", + "Close call: Out of the Silent Planet by C.S. 
Lewis vs Out of the Silent Planet\n", + "Close call: broccoli, celery, fresh basil, green beans, lettuce, sweet potatoes vs broccoli, celery, fresh basil, lettuce, sweet potatoes\n", + "Close call: To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune or to take arms against a sea of troubles and by opposing end them vs To be or not to be that is the question whether tis nobler in the mind to suffer the slings and arrows of outrageous fortune\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/aymeric/Documents/Code/smolagents/examples/open_deep_research/scripts/gaia_scorer.py:52: UserWarning: Answer lists have different lengths, returning False.\n", + " warnings.warn(\"Answer lists have different lengths, returning False.\", UserWarning)\n" + ] + } + ], + "source": [ + "import re\n", + "from collections import Counter\n", + "\n", + "from scripts.gaia_scorer import check_close_call, question_scorer\n", + "\n", + "\n", + "result_df[\"is_correct\"] = result_df.apply(lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1)\n", + "result_df[\"is_near_correct\"] = result_df.apply(\n", + " lambda x: check_close_call(x[\"prediction\"], x[\"true_answer\"], x[\"is_correct\"]),\n", + " axis=1,\n", + ")\n", + "\n", + "result_df[\"count_steps\"] = result_df[\"intermediate_steps\"].apply(len)\n", + "\n", + "\n", + "def find_attachment(question):\n", + " matches = eval_df.loc[eval_df[\"question\"].apply(lambda x: x in question), \"file_name\"]\n", + "\n", + " if len(matches) == 0:\n", + " return \"Not found\"\n", + " file_path = matches.values[0]\n", + "\n", + " if isinstance(file_path, str) and len(file_path) > 0:\n", + " return file_path.split(\".\")[-1]\n", + " else:\n", + " return \"None\"\n", + "\n", + "\n", + "result_df[\"attachment_type\"] = result_df[\"question\"].apply(find_attachment)\n", + "\n", + "\n", + "def extract_tool_calls(code):\n", + " regex 
= r\"\\b(\\w+)\\(\"\n", + " function_calls = [el for el in re.findall(regex, code) if el.islower()]\n", + "\n", + " function_call_counter = Counter(function_calls)\n", + " return function_call_counter\n", + "\n", + "\n", + "def sum_tool_calls(steps):\n", + " total_count = Counter()\n", + " for step in steps:\n", + " if \"llm_output\" in step:\n", + " total_count += extract_tool_calls(step[\"llm_output\"])\n", + "\n", + " return total_count\n", + "\n", + "\n", + "# result_df[\"tool_calls\"] = result_df[\"intermediate_steps\"].apply(sum_tool_calls)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def get_thoughts(x):\n", + " try:\n", + " output = x[0][\"task\"]\n", + " for y in x[1:]:\n", + " try:\n", + " if \"observation\" in y:\n", + " output += y[\"llm_output\"] + \"\\nObservation:\" + y[\"observation\"]\n", + " else:\n", + " output += y[\"llm_output\"] + r\"\\Error:\" + str(y[\"error\"])\n", + " except Exception:\n", + " pass\n", + " return output\n", + " except Exception:\n", + " return None\n", + "\n", + "\n", + "result_df[\"thoughts\"] = result_df[\"intermediate_steps\"].apply(lambda x: get_thoughts(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "agent_name\n", + "code_gpt4o_03_february_text 165\n", + "code_o1_03_february_ablation-toolcalling-manager 165\n", + "code_o1_01_february_text 165\n", + "code_o3-mini_03_february_remove-navigational 165\n", + "code_o1_04_february_submission5 165\n", + "code_o1_03_february_text_high-reasoning-effort 165\n", + "code_o1_03_february_remove-navigational 164\n", + "code_o1_03_february_fix-print-outputs 164\n", + "code_o1_04_february_submission 162\n", + "code_o1_03_february_goodoldtext-unbroken 161\n", + "code_gpt4o_03_february_goodoldtext-unbroken 159\n", + "code_gpt4o_03_february_magenticbrowser 159\n", + "code_o1_03_february_fix-print-outputs2 156\n", + 
"code_gpt4o_03_february_magenticbrowser2 156\n", + "code_o1_04_february_submission-medium 125\n", + "code_o1_29-01_text 105\n", + "code_llama-3 90\n", + "code_o1_22-01_managedagent-summary_planning 67\n", + "code_o1_25-01_visioon 53\n", + "code_o1_04_february_submission3 49\n", + "code_qwen-coder-32B_03_february_text 43\n", + "code_o1_04_february_submission4 6\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result_df[\"agent_name\"].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Inspect specific runs" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "agent_name\n", + "code_gpt4o_03_february_text 165\n", + "code_o1_03_february_ablation-toolcalling-manager 165\n", + "code_o1_01_february_text 165\n", + "code_o3-mini_03_february_remove-navigational 165\n", + "code_o1_04_february_submission5 165\n", + "code_o1_03_february_text_high-reasoning-effort 165\n", + "code_o1_03_february_remove-navigational 164\n", + "code_o1_03_february_fix-print-outputs 164\n", + "code_o1_04_february_submission 162\n", + "code_o1_03_february_goodoldtext-unbroken 161\n", + "code_gpt4o_03_february_goodoldtext-unbroken 159\n", + "code_gpt4o_03_february_magenticbrowser 159\n", + "code_o1_03_february_fix-print-outputs2 156\n", + "code_gpt4o_03_february_magenticbrowser2 156\n", + "code_o1_04_february_submission-medium 125\n", + "code_o1_29-01_text 105\n", + "code_llama-3 90\n", + "code_o1_22-01_managedagent-summary_planning 67\n", + "code_o1_25-01_visioon 53\n", + "code_o1_04_february_submission3 49\n", + "code_qwen-coder-32B_03_february_text 43\n", + "code_o1_04_february_submission4 6\n", + "Name: count, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "agent_name task\n", + 
"code_gpt4o_03_february_goodoldtext-unbroken 2 84\n", + " 1 53\n", + " 3 22\n", + "code_gpt4o_03_february_magenticbrowser 2 83\n", + " 1 52\n", + " ..\n", + "code_o3-mini_03_february_remove-navigational 1 53\n", + " 3 26\n", + "code_qwen-coder-32B_03_february_text 2 22\n", + " 1 14\n", + " 3 7\n", + "Name: count, Length: 65, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total length: 2809 - is complete: False\n" + ] + } + ], + "source": [ + "o1_vision = \"code_o1_25-01_visioon\"\n", + "o1_next = \"code_o1_29-01_text\"\n", + "o1 = \"code_o1_01_february_text\"\n", + "\n", + "list_versions = [o1, o1_vision, o1_next]\n", + "\n", + "# submission_selection_name = \"react_code_llama3-70b_02-05_full-gaia-validation-code\"\n", + "sel_df = result_df\n", + "# sel_df = sel_df.loc[\n", + "# (result_df[\"agent_name\"].isin(list_versions))\n", + "# # & (~result_df[\"question\"].isin(UNSOLVED_QUESTIONS))\n", + "# ]\n", + "sel_df = sel_df.reset_index(drop=True)\n", + "display(sel_df[\"agent_name\"].value_counts())\n", + "sel_df = sel_df.drop_duplicates(subset=[\"agent_name\", \"question\"])\n", + "display(sel_df.groupby(\"agent_name\")[[\"task\"]].value_counts())\n", + "print(\"Total length:\", len(sel_df), \"- is complete:\", len(sel_df) == 165)\n", + "# assert sel_df[\"question\"].value_counts().max() == len(list_versions), \"Some questions are duplicate!\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Average score:'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
is_correct
agent_name
code_gpt4o_03_february_goodoldtext-unbroken0.384
code_gpt4o_03_february_magenticbrowser0.352
code_gpt4o_03_february_magenticbrowser20.365
code_gpt4o_03_february_text0.376
code_llama-30.078
code_o1_01_february_text0.491
code_o1_03_february_ablation-toolcalling-manager0.327
code_o1_03_february_fix-print-outputs0.518
code_o1_03_february_fix-print-outputs20.558
code_o1_03_february_goodoldtext-unbroken0.534
code_o1_03_february_remove-navigational0.537
code_o1_03_february_text_high-reasoning-effort0.485
code_o1_04_february_submission0.494
code_o1_04_february_submission-medium0.488
code_o1_04_february_submission30.490
code_o1_04_february_submission40.500
code_o1_04_february_submission50.552
code_o1_22-01_managedagent-summary_planning0.418
code_o1_25-01_visioon0.340
code_o1_29-01_text0.390
code_o3-mini_03_february_remove-navigational0.291
code_qwen-coder-32B_03_february_text0.209
\n", + "
" + ], + "text/plain": [ + " is_correct\n", + "agent_name \n", + "code_gpt4o_03_february_goodoldtext-unbroken 0.384\n", + "code_gpt4o_03_february_magenticbrowser 0.352\n", + "code_gpt4o_03_february_magenticbrowser2 0.365\n", + "code_gpt4o_03_february_text 0.376\n", + "code_llama-3 0.078\n", + "code_o1_01_february_text 0.491\n", + "code_o1_03_february_ablation-toolcalling-manager 0.327\n", + "code_o1_03_february_fix-print-outputs 0.518\n", + "code_o1_03_february_fix-print-outputs2 0.558\n", + "code_o1_03_february_goodoldtext-unbroken 0.534\n", + "code_o1_03_february_remove-navigational 0.537\n", + "code_o1_03_february_text_high-reasoning-effort 0.485\n", + "code_o1_04_february_submission 0.494\n", + "code_o1_04_february_submission-medium 0.488\n", + "code_o1_04_february_submission3 0.490\n", + "code_o1_04_february_submission4 0.500\n", + "code_o1_04_february_submission5 0.552\n", + "code_o1_22-01_managedagent-summary_planning 0.418\n", + "code_o1_25-01_visioon 0.340\n", + "code_o1_29-01_text 0.390\n", + "code_o3-mini_03_february_remove-navigational 0.291\n", + "code_qwen-coder-32B_03_february_text 0.209" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
is_correctis_near_correctcount_stepscount
agent_nametask
code_gpt4o_03_february_goodoldtext-unbroken10.4528300.4528307.00000053
20.3809520.3928578.51190584
30.2272730.22727310.40909122
code_gpt4o_03_february_magenticbrowser10.4807690.4807697.15384652
20.3493980.3614468.16867583
..................
code_o3-mini_03_february_remove-navigational20.2325580.2441864.97674486
30.1538460.1538466.61538526
code_qwen-coder-32B_03_february_text10.3571430.3571435.42857114
20.1363640.1363646.40909122
30.1428570.1428576.5714297
\n", + "

65 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " is_correct \\\n", + "agent_name task \n", + "code_gpt4o_03_february_goodoldtext-unbroken 1 0.452830 \n", + " 2 0.380952 \n", + " 3 0.227273 \n", + "code_gpt4o_03_february_magenticbrowser 1 0.480769 \n", + " 2 0.349398 \n", + "... ... \n", + "code_o3-mini_03_february_remove-navigational 2 0.232558 \n", + " 3 0.153846 \n", + "code_qwen-coder-32B_03_february_text 1 0.357143 \n", + " 2 0.136364 \n", + " 3 0.142857 \n", + "\n", + " is_near_correct \\\n", + "agent_name task \n", + "code_gpt4o_03_february_goodoldtext-unbroken 1 0.452830 \n", + " 2 0.392857 \n", + " 3 0.227273 \n", + "code_gpt4o_03_february_magenticbrowser 1 0.480769 \n", + " 2 0.361446 \n", + "... ... \n", + "code_o3-mini_03_february_remove-navigational 2 0.244186 \n", + " 3 0.153846 \n", + "code_qwen-coder-32B_03_february_text 1 0.357143 \n", + " 2 0.136364 \n", + " 3 0.142857 \n", + "\n", + " count_steps count \n", + "agent_name task \n", + "code_gpt4o_03_february_goodoldtext-unbroken 1 7.000000 53 \n", + " 2 8.511905 84 \n", + " 3 10.409091 22 \n", + "code_gpt4o_03_february_magenticbrowser 1 7.153846 52 \n", + " 2 8.168675 83 \n", + "... ... ... 
\n", + "code_o3-mini_03_february_remove-navigational 2 4.976744 86 \n", + " 3 6.615385 26 \n", + "code_qwen-coder-32B_03_february_text 1 5.428571 14 \n", + " 2 6.409091 22 \n", + " 3 6.571429 7 \n", + "\n", + "[65 rows x 4 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(\"Average score:\", sel_df.groupby(\"agent_name\")[[\"is_correct\"]].mean().round(3))\n", + "display(\n", + " sel_df.groupby([\"agent_name\", \"task\"])[[\"is_correct\", \"is_near_correct\", \"count_steps\", \"question\"]]\n", + " .agg(\n", + " {\n", + " \"is_correct\": \"mean\",\n", + " \"is_near_correct\": \"mean\",\n", + " \"count_steps\": \"mean\",\n", + " \"question\": \"count\",\n", + " }\n", + " )\n", + " .rename(columns={\"question\": \"count\"})\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "customdata": [ + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "According to github, when was Regression added to " + ], + [ + "I’m researching species that became invasive after" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "When you take the average of the standard populati" + ], 
+ [ + "I need to fact-check a citation. This is the citat" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "In terms of geographical distance between capital " + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "It is 1999. 
Before you party like it is 1999, plea" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "You are given this Excel file as a map. 
You start " + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "What is the final numeric output from the attached" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "The attached image contains a Python script. 
Run t" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "What is the last word before the second chorus of " + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "The year is 2022. 
I am at the National Air and Spa" + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "What is the latest chronological year date written" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "Of the cities within the United States where U.S. " + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "What was the complete title of the book in which t" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "When was a picture of St. 
Thomas Aquinas first add" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "As of August 2023, who is the only winner of the U" + ] + ], + "hovertemplate": "agent_name=code_gpt4o_03_february_goodoldtext-unbroken
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_gpt4o_03_february_goodoldtext-unbroken", + "line": { + "color": "#636efa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_gpt4o_03_february_goodoldtext-unbroken", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4A", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADoP1VVVVVVVeU/MzMzMzMz4z9ddNFFF13kP1VVVVVVVeU/dmIndmIn5j8lSZIkSZLkPzMzMzMzM+M/AAAAAAAA4j/T0tLS0tLiP3Icx3Ecx+E/bCivobyG4j+amZmZmZnhP5IkSZIkSeI/dNFFF1104T8LWchCFrLgP1VVVVVVVeE/7FG4HoXr4T+xEzuxEzvhP3sJ7SW0l+A/AAAAAAAA4D/d0wjLPY3gPxEREREREeE/hBBCCCGE4D8AAAAAAADgPwgffPDBB98/AAAAAAAA4D9f8RVf8RXfP47jOI7jON4/fdYNpshn3T8bymsor6HcP1y+5Vu+5ds/zczMzMzM3D8ZnI/B+RjcPz3P8zzP89w/EnfEHXFH3D+jiy666KLbPxzHcRzHcdw/05ve9KY33T94Nuo7G/XdP1VVVVVVVd0/L6fg5RS83D9xPQrXo3DdP93c3Nzc3Nw/7MRO7MRO3D8iNcF4K/vcPxPaS2gvod0/F1100UUX3T8lSZIkSZLcPx/BfQT3Edw/GmG5pxGW2z91Xx5bETTcP7y7u7u7u9s/Q7CONu9T3D/fe++9997bP9u2bdu2bds/AAAAAAAA2z9bqZVaqZXaPyebbLLJJts/NSbSA5Wz2z88PDw8PDzcP8y1A3PtwNw/fMVXfMVX3D8LmwOJVtjcPxzHcRzHcdw/4MCBAwcO3D/QusEU+azbP08b6LSBTts/KK+hvIby2j++Y2pg75jaPxqkQRqkQdo/2TMQlY7s2T+amZmZmZnZP+Dp1vywSNk/+hicj8H52D+q82sPuazYP0mSJEmSJNk/2djY2NjY2D9T1pQ1ZU3ZPzv0m61Dv9k/L7rooosu2j+e8YxnPOPZP5qZmZmZmdk/WqAFWqAF2j+c3vSmN73ZP3bZZZdddtk/Z6O+s1Hf2T+amZmZmZnZPwAAAAAAANo/Grab5Ulk2j+IxvrQWB/aPywFav1Kgdo/PQrXo3A92j8ZvhEFJp3aP/v6+vr6+to/G0PTHey32j87sRM7sRPbP9u2bdu2bds/ln0OqQnG2z8T6J26loPbPya0l9BeQts/JrBpP1kC2z/D2jesfcPaP5ax/Y5eGds/
27Zt27Zt2z80+bJBky/bPyivobyG8to/q8FzBIq22j8+jbDc0wjbP5u1WZu1Wds/BA0ndV8e2z+bCOSaCOTaPzMzMzMzM9s/hYn3I6f52j+f4pIhWEfbPw8b6bCRDts/W2uttdZa2z/ZzvdT46XbP/y+7/u+79s/7na73W632z8AAAAAAADcP/KGvCFvyNs/HLmRG7mR2z8j+oDq2FvbPyebbLLJJts/27Zt27Zt2z9YYyI9UDnbP1uwBVuwBds/09LS0tLS2j/TVwljs6DaP6c3velNb9o/D+jGPH202j87qIM6qIPaP2le/ImEU9o/gkQrbA4k2j9r/N08QvXZP3Icx3Ecx9k/mpmZmZmZ2T/Lli1btmzZP2x21CLkr9k/I591gyny2T9SkPx5lcXZP5qZmZmZmdk/y7hl3DJu2T82lNdQXkPZP9ouhNkuhNk/EWflJ8RZ2T8wWf6S5S/ZP2mQBmmQBtk/fo/ICcLd2D9rcRPmd7XYP3bpMX+vjdg/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "When you take the average of the standard populati" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + 
"What two-word type of model did Manash Pratim Kash" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "According to github, when was Regression added to " + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "This is a secret message my friend gave me. 
It say" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "The attached image contains a Python script. 
Run t" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "What is the final numeric output from the attached" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "What is the latest chronological year date written" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "The year is 2022. 
I am at the National Air and Spa" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "What was the complete title of the book in which t" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "When was a picture of St. 
Thomas Aquinas first add" + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "Of the cities within the United States where U.S. " + ] + ], + "hovertemplate": "agent_name=code_gpt4o_03_february_magenticbrowser
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_gpt4o_03_february_magenticbrowser", + "line": { + "color": "#EF553B", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_gpt4o_03_february_magenticbrowser", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4A", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACamZmZmZnJP1VVVVVVVcU/kiRJkiRJwj8AAAAAAADAPxzHcRzHcbw/mpmZmZmZyT900UUXXXTRPwAAAAAAANA/FDuxEzux0z+3bdu2bdvWP1VVVVVVVdU/AAAAAAAA1D/T0tLS0tLSP3Icx3Ecx9E/eQ3lNZTX0D8AAAAAAADQP5IkSZIkSdI/dNFFF1100T84velNb3rTP1VVVVVVVdU/exSuR+F61D8UO7ETO7HTP2gvob2E9tI/kiRJkiRJ0j8Jyz2NsNzTPzMzMzMzM9M/lVJKKaWU0j8AAAAAAADUP1VVVVVVVdU/tbS0tLS01D/UQR3UQR3UP+Q4juM4jtM/HEyRz7rB1D9RXkN5DeXVP1VVVVVVVdU/ZmZmZmZm1j/blahdidrVP7dt27Zt29Y/lTVlTVlT1j9GF1100UXXPxdswRZswdY/etOb3vSm1z9dQUyuICbXPwAAAAAAANg/4eUUvJyC1z8K16NwPQrXP9jX19fX19c/J3ZiJ3Zi1z9ln0NqgvHWP0xoL6G9hNY/RhdddNFF1z+3bdu2bdvWP0xnMZ3FdNY/fBphuacR1j/QcFL35bHVP1VVVVVVVdU/yRCso8371D+ttdZaa63VP1VVVVVVVdU/AAAAAAAA1T/VSq3USq3UP1VVVVVVVdU/0gOVs1v41T+mpaWlpaXVP1VVVVVVVdU/Fl/xFV/x1T8g0QqbA4nWP47jOI7jONY/r169evXq1T/JZ91ginzWPwrXo3A9Ctc/ymsor6G81j8oxFn5CXHWP3ZiJ3ZiJ9Y/Xi1uwvyu1j9mZmZmZmbWP6QMPN2aH9Y/25WoXYna1T80dX7tIZfVP1VVVVVVVdU/FRUVFRUV1T82ZU1ZU9bUPy+QSfECmdQ/XXTRRRdd1D9CEYpQhCLUP5Q+6ZM+6dM/VEZlVEZl1D9DFrKQhSzUP6WUUkoppdQ/Ut/ZqO9s1D8mTv2eW+LUP1VVVVVVVdU/1g86KvDF1T9jfWisD43VP1DrVwrU+tU/w/UoXI/C1T+bB7nrZ4vVP/b19fX19dU/2xia7mC/1T+e2Imd2InVP1VVVVVVVdU/2eeQmmC81T/XcnCzX4jVP9FeQnsJ7dU//mQJbNpP1j8c1r5h7RvWP49eGdvv6NU/btu2bdu21T
96amGlpxbWP0xnMZ3FdNY/bTV4jkDR1j9Y7mmE5Z7WP9ZmbdZmbdY/QcNJ3ZfH1j/XRCDXRCDXP3d3d3d3d9c/RhdddNFF1z8RrKPN+xTXP+UWT27x5NY/Ouecc8451z8K16NwPQrXP9d1Xdd1Xdc/7PV6vV6v1z8AAAAAAIDXP/QFfUFf0Nc/GHqhF3qh1z/f2jDNXfDXP8IHH3zwwdc/9oDZA2YP2D9JD1TObuHXP0J7Ce0ltNc/iIeHh4eH1z82C6o9J9PXP4K5dmCuHdg/6qPVJETx1z+ogzqogzrYP2C3x1qGDtg/Zfx2qSfj1z/MknJAZLjXP+Q4juM4jtc/J0p2baJk1z+6c+fOnTvXP+HlFLycgtc/n3WDKfJZ1z99GzBU0zHXP3d3d3d3d9c/uj5dn65P1z+H8hrKayjXP1esAVesAdc/t23btm3b1j+21lprrbXWPwdpkAZpkNY/dRhlKp5r1j9eLW7C/K7WP+EMCCV3itY/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "When you take the average of the standard populati" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "I need to fact-check a citation. 
This is the citat" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "According to github, when was Regression added to " + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "It is 1999. 
Before you party like it is 1999, plea" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "You are Van Helsing, a 
renowned vampire hunter. A " + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "What is the final numeric output from the attached" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "The attached image contains a Python script. 
Run t" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "What is the latest chronological year date written" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "What was the complete title of the book in which t" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + 
"According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "When was a picture of St. Thomas Aquinas first add" + ], + [ + "What was the actual enrollment count of the clinic" + ] + ], + "hovertemplate": "agent_name=code_gpt4o_03_february_magenticbrowser2
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_gpt4o_03_february_magenticbrowser2", + "line": { + "color": "#00cc96", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_gpt4o_03_february_magenticbrowser2", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsA", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/AAAAAAAA4D8XXXTRRRfdPwAAAAAAAOA/sRM7sRM74T+SJEmSJEniPxEREREREeE/AAAAAAAA4D8eHh4eHh7ePwAAAAAAAOA/DeU1lNdQ3j/NzMzMzMzcP57neZ7ned4/F1100UUX3T+96U1vetPbP1VVVVVVVd0/KVyPwvUo3D87sRM7sRPbPy+hvYT2Eto/27Zt27Zt2z9huacRlnvaP5qZmZmZmdk/11prrbXW2j8AAAAAAADcPxdddNFFF90/PDw8PDw83D/btm3btm3bP6uqqqqqqto/0LrBFPms2z8or6G8hvLaPxqkQRqkQdo/mpmZmZmZ2T+J2pWoXYnaP9u2bdu2bds/EnfEHXFH3D8XXXTRRRfdPxzHcRzHcdw/velNb3rT2z9yBTG5gpjcP1VVVVVVVd0/g5dT8HIK3j9xPQrXo3DdP93c3Nzc3Nw/7MRO7MRO3D8iNcF4K/vcPxzHcRzHcdw/7RvWvmHt2z8lSZIkSZLcPx/BfQT3Edw/1AjLPY2w3D91Xx5bETTcP83MzMzMzNw/532KS4Zg3T/nnHPOOefcP13XdV3Xdd0/AAAAAAAA3T/dyI3cyI3cPxdddNFFF90/rDGRHqic3T8tLS0tLS3dP8y1A3PtwNw/fMVXfMVX3D8yfrvUk/HbPxzHcRzHcdw/4MCBAwcO3D/QusEU+azbP08b6LSBTts/KK+hvIby2j/btm3btm3bP1y+5Vu+5ds/FzdhfleL2z8zMzMzMzPbP35YpAw83do/idqVqF2J2j/ksmKghDfaP3qe53me59k/mpmZmZmZ2T9T1pQ1ZU3ZPxUvkEnxAtk/dNFFF1102T+TlaxkJSvZP5qZmZmZmdk/GZVRGZVR2T+RhSxkIQvZP3bZZZdddtk/5QpicgUx2T+amZmZmZnZP1VVVVVVVdk/mYbtZnkS2T801ofG+tDYPzbZZJNNNtk/9ihcj8L12D+qeZC7frbYPxkZGRkZGdk/i/gEUsl52T+xEzuxEzvZP5qZmZmZmdk/fg6pCcZb2T/7hVhRGh/ZPzmO4ziO49g/koq51Rmp2D9wWPuGtW/YP6+M7Xf0ytg/JUmSJEmS2D/pqYWV
nlrYPzqL6Syms9g/iHG/Lql82D/LPY2w3NPYP9mJndiJndg/6r48tiJo2D9YoTNYoTPYPwAAAAAAANg/Kky8HznN1z/jkiFYR5vXP2pXonYlatc/vvfee++91z+q8dJNYhDYP/h93/d939c/DAaDwWAw2D8AAAAAAADYPxT2hD1hT9g/+IEf+IEf2D/pA6pjb23YPz744IMPPtg/qYilIpaK2D8m0gOVs1vYP9iCLdiCLdg/AAAAAAAA2D9Q7TmZvkrYP4mfUeJnlNg/TGV71wHd2D/5iq/4iq/YP2JyBTG5gtg/0QqbA4lW2D/ZiZ3YiZ3YPzmO4ziO49g/0nmLIZ232D/EiBEjRozYPzTWh8b60Ng/YYp81g2m2D/oVRZntHvYP1K4HoXrUdg/waJgUbAo2D8AAAAAAADYP9jX19fX19c/1cDeMTWw1z+JV5F4FYnXPyd2Yid2Ytc/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "When you take the average of the standard populati" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "The object in the British Museum's collection 
with" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "Of the authors (First M. 
Last) that worked on the " + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "According to github, when was Regression added to " + ], + [ + "This is a secret message my friend gave me. 
It say" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "According to the World Bank, which countries had g" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "What is the final numeric output from the attached" + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "The attached image contains a Python script. Run t" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "What is the latest chronological year date written" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "The year is 2022. 
I am at the National Air and Spa" + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "What was the complete title of the book in which t" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "In the 2015 Metropolitan Museum 
of Art exhibition " + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "When was a picture of St. Thomas Aquinas first add" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "Of the cities within the United States where U.S. " + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "According to the USGS, in what year was the Americ" + ] + ], + "hovertemplate": "agent_name=code_gpt4o_03_february_text
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_gpt4o_03_february_text", + "line": { + "color": "#ab63fa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_gpt4o_03_february_text", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVdU/AAAAAAAA4D8zMzMzMzPjP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADkP3Icx3Ecx+E/AAAAAAAA4D8XXXTRRRfdPwAAAAAAAOA/sRM7sRM74T8AAAAAAADgP97d3d3d3d0/AAAAAAAA3D9aWlpaWlraPzmO4ziO49g/KK+hvIby2j+amZmZmZnZP9u2bdu2bds/L7rooosu2j+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T/ZiZ3YiZ3YPy+hvYT2Eto/27Zt27Zt2z/UCMs9jbDcP7y7u7u7u9s/55xzzjnn3D8AAAAAAADcPxdddNFFF90/PDw8PDw83D8d1EEd1EHdP47jOI7jON4/KvJZN5gi3z8N5TWU11DeP9/yLd/yLd8/AAAAAAAA4D84H4PzMTjfPwAAAAAAAOA/0Bf0BX1B3z8AAAAAAADgP5/0SZ/0Sd8/6k1vetOb3j94Nuo7G/XdP1VVVVVVVd0/L6fg5RS83D8pXI/C9SjcP93c3Nzc3Nw/7MRO7MRO3D+WfQ6pCcbbPya0l9BeQts/7RvWvmHt2z8lSZIkSZLcPx/BfQT3Edw/1AjLPY2w3D91Xx5bETTcP7y7u7u7u9s/Q7CONu9T3D/fe++9997bP9u2bdu2bds/AAAAAAAA2z8cuZEbuZHbPx988MEHH9w/NSbSA5Wz2z9LS0tLS0vbP73pTW9609s/27Zt27Zt2z8yfrvUk/HbP+Q4juM4jts/2bJly5Yt2z/QusEU+azbP08b6LSBTts/KK+hvIby2j++Y2pg75jaPxqkQRqkQdo/2TMQlY7s2T9mZmZmZmbaPy+hvYT2Eto/idqVqF2J2j+CEt5o6vzaP6uqqqqqqto/WlpaWlpa2j+zpqwpa8raP2G5pxGWe9o/L7rooosu2j+e8YxnPOPZP/qkT/qkT9o/WqAFWqAF2j+c3vSmN73ZPyeaaKKJJto/Z6O+s1Hf2T+amZmZmZnZPwAAAAAAANo/Wp5EpmG72T+IxvrQWB/aPzFvZ0jM29k/mpmZmZmZ2T96kLt+tljZP7q5ubm5udk/i/gEUsl52T+KndiJndjZP5qZmZmZmdk/fg6pCcZb2T+B3qlrObjZP7SX0F5Ce9k/XJ2RirnV2T+amZmZmZnZP+mVsf2OXtk/btu2bdu22T+hyZcN
mnzZPzGdxXQW09k/mpmZmZmZ2T+oEZZ7GmHZP1qbtVmbtdk/lLovj60I2j8arNAZrNDZPyIiIiIiIto/vB85zdfq2T/8FJcMwTraP4nalahdido/U0oppZRS2j/pJjEIrBzaP3qe53me59k/bTabzWaz2T8AAAAAAIDZP3PGnDFnzNk/mpmZmZmZ2T8GfxUnpOTZP7LJJptsstk/wp8Jfyb82T+/GhPpgcrZP5qZmZmZmdk/aWlpaWlp2T+fk+mrhLHZP6BR4meU+Nk/rSYhir/I2T+amZmZmZnZP2bogN0ea9k/FjYHEq2w2T/lgMhwr4LZP3Icx3Ecx9k/famg1ZcK2j/SpEmTJk3aP4jG+tBYH9o/DqbIZ91g2j8h+fMqizPaPwc6baDTBto/z2pntbPa2T/zGsprKK/ZP9ouhNkuhNk/EWflJ8RZ2T8wWf6S5S/ZP2mQBmmQBtk/fo/ICcLd2D9rcRPmd7XYP3bpMX+vjdg/ZmZmZmZm2D+vUkzQXaXYP5Ey8HRrftg/GFuCb/NX2D8yOB+D8zHYPwyYxoBpDNg/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "I’m researching species that became invasive after" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "According to github, when was Regression added to " + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "Of the authors (First M. 
Last) that worked on the " + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "When you take the average of the standard populati" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "If there is anything that doesn't make sense in th" + ], 
+ [ + "You are a telecommunications engineer who wants to" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "What is the area of the green polygon in the attac" + ] + ], + "hovertemplate": "agent_name=code_llama-3
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_llama-3", + "line": { + "color": "#FFA15A", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_llama-3", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZ", + "dtype": "i1" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABEREREREbE/AAAAAAAAsD8eHh4eHh6uPxzHcRzHcaw/KK+hvIbyqj+amZmZmZmpPxiGYRiGYag/RhdddNFFpz9kIQtZyEKmP1VVVVVVVaU/exSuR+F6pD8UO7ETO7GjP2gvob2E9qI/kiRJkiRJoj+WexphuaehPxEREREREaE/hBBCCCGEoD8AAAAAAACgPwgffPDBB58/Hh4eHh4erj8d1EEd1EGtPxzHcRzHcaw/0LrBFPmsqz8or6G8hvKqPxqkQRqkQao/MzMzMzMzsz+7ErUrUbuyP5IkSZIkSbI/d8QdcUfcsT900UUXXXSxPxEREREREbE/ZCELWchCtj9XEJMriMm1P1VVVVVVVbU/OQUvp+DltD97FK5H4Xq0PxQUFBQUFLQ/FDuxEzuxsz/BeCv7HFKzP2gvob2E9rI/nhLkKUGesj+SJEmSJEmyP3AfwX0E97E/fBphuacRtj/QcFL35bG1P1VVVVVVVbU/yRCso837tD/GGGOMMca4PxiGYRiGYbg/AAAAAAAAuD8YeqEXeqG3P0YXXXTRRbc/jYn0QOXstj+XlpaWlpa2P2QhC1nIQrY/Fl/xFV/xtT9ItMLmQKK1P1VVVVVVVbU/qFChQoUKtT8cTJHPusG0P3sUrkfherQ/XkN5DeU1tD/Oyk+Is/KzP5dv+ZZv+bY/Xi1uwvyutj9mZmZmZma2P6QMPN2aH7Y/25WoXYnatT80dX7tIZe1P1VVVVVVVbU/FRUVFRUVtT82ZU1ZU9a0Py+QSfECmbQ/XXTRRRddtD9CEYpQhCK0P5Q+6ZM+6bM/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "The object in the British Museum's collection with" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + 
"Using the Biopython library in Python, parse the P" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "When you take the average of the standard populati" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "In terms of geographical distance between capital " + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "According to github, when was Regression added to " + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "You are a telecommunications engineer who wants to" + ], 
+ [ + "How many edits were made to the Wikipedia page on " + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "What is the last word before the second chorus of " + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "The attached image contains a Python script. 
Run t" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "According to the World Bank, which countries had g" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "What is the latest chronological year date written" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "What is the final numeric output from the attached" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "What was the complete title of the book in which t" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "When was a picture of St. 
Thomas Aquinas first add" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "The brand that makes these harnesses the dogs are " + ] + ], + "hovertemplate": "agent_name=code_o1_01_february_text
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_01_february_text", + "line": { + "color": "#19d3f3", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_01_february_text", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA6D+amZmZmZnpP1VVVVVVVeU/t23btm3b5j8AAAAAAADoP1VVVVVVVeU/ZmZmZmZm5j9ddNFFF13kP6uqqqqqquI/FDuxEzux4z+SJEmSJEniPxEREREREeE/AAAAAAAA4j/x8PDw8PDgPwAAAAAAAOA/DeU1lNdQ3j/NzMzMzMzcP57neZ7ned4/F1100UUX3T+96U1vetPbP6uqqqqqqto/KVyPwvUo3D+e2Imd2IndPxzHcRzHcdw/btu2bdu23T9HWO5phOXePwAAAAAAAOA/hBBCCCGE4D8AAAAAAADgP3zwwQcffOA/AAAAAAAA4D9QB3VQB3XgPzmO4ziO4+A/whT5rBtM4T95DeU1lNfgP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhPzEMwzAMw+A/R9wRd8Qd4T900UUXXXThPxEREREREeE/C1nIQhay4D/E5ApicgXhP6uqqqqqquA/FbycgpdT4D+kcD0K16PgP/Hw8PDw8OA/sRM7sRM74T9vZZ9DaoLhP3Icx3Ecx+E/CfKUIE8J4j9u27Zt27bhP3AfwX0E9+E/lnsaYbmn4T8NJ3VfHlvhPxEREREREeE/DcE62rxP4T+MMcYYY4zhP1EURVEUReE/AAAAAAAA4T/RC73QC73gP/jggw8++OA/TKQHKme34D/x8PDw8PDgPxM/o8TPKOE/8RVf8RVf4T8OJFphcyDhPzmO4ziO4+A/iREjRowY4T/CFPmsG0zhPxEREREREeE/NpTXUF5D4T/lJ8RZ+QnhP7ETO7ETO+E/BqLSkT0D4T8zMzMzMzPhPyNl4OnW/OA/LFG7ErUr4T9T59ceclnhP0mSJEmSJOE/8fDw8PDw4D8w6Av6gr7gP93TCMs9jeA/XXTRRRdd4D8DF7jABS7gPwAAAAAAAOA/0AIt0AIt4D+GLGQhC1ngP4QQQgghhOA/QUyuICZX4D+yAmGkHSvgP1VVVVVVVeA/8MXVDzoq4D8VvJyCl1PgP3+lQK1fKeA/UrgehetR4D8cUWDSqXngP6GgoKCgoOA/9lttDE134D/sxE7sxE7gP3ACJ3ACJ+A/463sc0hN4D8hVpTGRybgPwAAAAAAAOA/WQKb9pMl4D8AAAAAAADgP04CcaHmJOA/kiRJkiRJ4D/3QwJvPyTgP3
4E9xHcR+A/AkVbDZ4j4D/uaYTlnkbgPzACIzACI+A/AAAAAAAA4D/gKLvfKLvfP3d3d3d3d98/jmVQKky83z8uGYJ1tHnfPzgfg/MxON8/+N5777333j8IrBxaZDvfP7/v+77v+94/0Ofz+Xw+3z8AAAAAAIDfP/AH/AF/wN8/IPiBH/iB3z97a8M0d8HfP4QPPvjgg98/c/TN0TdH3z9FeqBydgvfP5/0SZ/0Sd8/Dw8PDw8P3z/ZLKj2nEzfP/EzSvyMEt8/Rs6w4FLZ3j9f8RVf8RXfP31no76zUd8/lPHbpZ6M3z89QvWZtsbfPwAAAAAAAOA/Dnj84YDH3z8AAAAAAADgP/LX7KhFyN8/AAAAAAAA4D+ZS4QnBcnfPwAAAAAAAOA//iZ/k7/J3z8AAAAAAADgPyB1yh91yt8/cVZ+QpyV3z9hHxf2cWHfP9/yLd/yLd8/PiInCHdj3z9hfleLmzDfPzqkJhhvZd8/mpmZmZmZ3z+P5g82Hs3fP1ikDDzdmt8/sxpFHDpp3z+cj8H5GJzfP2vfsPYNa98/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "According to github, when was Regression added to " + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "When you take the average of the standard populati" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "I need to fact-check a citation. 
This is the citat" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "Review the chess position provided in the image. 
I" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + 
[ + "If there is anything that doesn't make sense in th" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "According to the World Bank, which countries had g" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "The attached image contains a Python script. Run t" + ], + [ + "What is the latest chronological year date written" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "You are given this Excel file as a map. 
You start " + ], + [ + "What is the final numeric output from the attached" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "What was the complete title of the book in which t" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "Of the cities within the United States where U.S. " + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "Look at the attached image. 
The quiz is scored as " + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "When was a picture of St. Thomas Aquinas first add" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "The attached Excel file contains the sales 
of menu" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "In the film Goldfinger, what color was the object " + ] + ], + "hovertemplate": "agent_name=code_o1_03_february_ablation-toolcalling-manager
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_03_february_ablation-toolcalling-manager", + "line": { + "color": "#FF6692", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_03_february_ablation-toolcalling-manager", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAAAAAFVVVVVVVdU/AAAAAAAA4D8zMzMzMzPjP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADkP3Icx3Ecx+E/AAAAAAAA4D900UUXXXThPwAAAAAAAOA/ntiJndiJ3T/btm3btm3bP97d3d3d3d0/AAAAAAAA3D9aWlpaWlraPxzHcRzHcdw/KK+hvIby2j/NzMzMzMzcP57neZ7ned4/F1100UUX3T+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T87sRM7sRPbPxzHcRzHcdw/27Zt27Zt2z9huacRlnvaP7y7u7u7u9s/11prrbXW2j8AAAAAAADcPyebbLLJJts/WlpaWlpa2j/btm3btm3bP6uqqqqqqto/I591gyny2T8or6G8hvLaP1y+5Vu+5ds/MzMzMzMz2z+J2pWoXYnaP3qe53me59k/s6asKWvK2j8vuuiiiy7aP1uwBVuwBds/velNb3rT2z9t1Hc26jvbP6uqqqqqqto/iMb60Fgf2j+amZmZmZnZPxkZGRkZGdk/2Ymd2Imd2D9+DqkJxlvZPy+hvYT2Eto/mpmZmZmZ2T9JkiRJkiTZPzqL6Syms9g/7mmE5Z5G2D+yFUHDSd3XP3d3d3d3d9c/EayjzfsU1z+21lprrbXWP5ZlWZZlWdY/AAAAAAAA1z9XaqVWaqXWP0422WSTTdY/jYn0QOXs1j+Ih4eHh4fXP3PtwFw7MNc/t23btm3b1j8g0QqbA4nWP47jOI7jONY/r169evXq1T/yWTeYIp/VPzCW/GLJL9Y/UV5DeQ3l1T8KcVZ+QpzVP3ZiJ3ZiJ9Y/v6vFTZjf1T9mZmZmZmbWP/PDImXg6dY/onYlalei1j/S1Pm1h1zWP4ZhGIZhGNY/1tXV1dXV1T9lTVlT1pTVP1VVVVVVVdU/F1100UUX1T9ObWpTm9rUP/VJn/RJn9Q/lVEZlVEZ1T9Ob3rTm97UP1VVVVVVVdU/1Hc26jsb1T8mTv2eW+LUP1VVVVVVVdU/Ffji6gcd1T85BS+n4OXUP1VVVVVVVdU/H4XrUbge1T+bB7nrZ4vVP/b19fX19dU/Iz6BVHJe1j92Yid2YifWP9dojdZojdY/n0NqgvFW1j9dy8HNfiHWP0xoL6G9hNY/Y251Ri
rm1j+x9g1r37DWPwJxoeYkENc/t23btm3b1j9WemphpafWP0xnMZ3FdNY/ZCELWchC1j9Y7mmE5Z7WP9ZmbdZmbdY/CRpO6r481j9W6AxW6AzWP2ZmZmZmZtY/fa2eHQI31j9t3qe4ZAjWP2DW+2W9X9Y/MsYYY4wx1j+6SQwCK4fWP7dt27Zt29Y/q9VqtVqt1j8AAAAAAIDWP7UlbUlb0tY/N3IjN3Ij1z/LiD6gOvbWP8omm2yyydY/3Wl1p9Wd1j+NifRA5ezWP61z5QHJOtc/iIeHh4eH1z8cKRrij1vXP3PtwFw7MNc/iOIvcoYF1z+3bdu2bdvWP1uGDtjtsdY/INEKmwOJ1j/Am0eoPtPWP6uqqqqqqtY/QzpvMaTz1j+2bNmyZcvWP6lFyF+zo9Y/yWfdYIp81j+vsjij3cPWP5020GkDndY/tNpZ7ax21j8N5TWU11DWP9aAK9aAK9Y/mRrYO6YG1j/iVSReReLVP3ZiJ3ZiJ9Y/SS9/2kID1j+/q8VNmN/VP9nnkJpgvNU/mpmZmZmZ1T+hu0oxQXfVP1VVVVVVVdU/0j5IBtQz1T8TtStRuxLVP/KUIE8J8tQ/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "Use density measures from the chemistry materials " + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "In terms of geographical distance between capital " + ], + [ + "When you take the average of the standard populati" + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "Of the authors (First M. 
Last) that worked on the " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "The object in the British Museum's collection with" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "According to github, when was Regression added to " + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "You are Van Helsing, a renowned vampire hunter. 
A " + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "The attached image contains a Python script. Run t" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "What is the final numeric output from the attached" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "The year is 2022. 
I am at the National Air and Spa" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "Of the cities within the United States where U.S. " + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "What was the complete title of the book in which t" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "Hi, I was out sick from my 
classes on Friday, so I" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "When was a picture of St. 
Thomas Aquinas first add" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "What is the latest chronological year date written" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "The attached spreadsheet contains a list of books " + ] + ], + "hovertemplate": "agent_name=code_o1_03_february_fix-print-outputs
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_03_february_fix-print-outputs", + "line": { + "color": "#B6E880", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_03_february_fix-print-outputs", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAA==", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADkP1VVVVVVVeU/ZmZmZmZm5j9GF1100UXnP1VVVVVVVeU/dmIndmIn5j+3bdu2bdvmP1VVVVVVVeU/AAAAAAAA5j+1tLS0tLTkP1VVVVVVVeU/UV5DeQ3l5T9mZmZmZmbmP1VVVVVVVeU/XXTRRRdd5D9Ob3rTm97kPwAAAAAAAOQ/MzMzMzMz4z9iJ3ZiJ3biP3Icx3Ecx+E/SZIkSZIk4T+WexphuafhPyIiIiIiIuI/jDHGGGOM4T8AAAAAAADhP3TRRRdddOE/4uHh4eHh4T+SJEmSJEniP3Icx3Ecx+E/mCKfdYMp4j/zGsprKK/hP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhP2IYhmEYhuE/d8QdcUfc4T8vuuiiiy7iP9InfdInfeI/IQtZyEIW4j9HfWejvrPhPwAAAAAAAOI/aKwPjfWh4T9I4XoUrkfhP5KRkZGRkeE/sRM7sRM74T9vZZ9DaoLhP3Icx3Ecx+E/dNFFF1104T9JkiRJkiThP3UW01lMZ+E/uacRlnsa4T/VfXlsRdDgPxEREREREeE/DcE62rxP4T8IIYQQQgjhPzEMwzAMw+A/AAAAAAAA4T/RC73QC73gP3zwwQcffOA/TKQHKme34D94eHh4eHjgPwtZyEIWsuA/oQ7qoA7q4D8OJFphcyDhP1VVVVVVVeE/jBgxYsSI4T/CFPmsG0zhP36x5BdLfuE/8xrKayiv4T8De8fUwN7hP9IgDdIgDeI/pSN7BqLS4T+amZmZmZnhP8rA0635YeE/LFG7ErUr4T9T59ceclnhP0mSJEmSJOE/UVFRUVFR4T9f0Bf0BX3hP5Z7GmG5p+E/dNFFF1104T8UoQhFKELhPxEREREREeE/sRM7sRM74T8WspCFLGThPzTRRBNNNOE/BTG5gphc4T9BGGnHCoThP6uqqqqqquE/UoEvrn7Q4T99aKwPjfXhP29nSMzbGeI/PQrXo3A94j+LleEbUWDiPzIyMjIyMuI/Kjkvi/gE4j92Yid2YifiP7If+7Ef++E/UhOMt7LP4T+zX4gVpfHhP3Icx3Ecx+E/1hmpmFud4T+/Ye0b1r7hP18Z2+/o
leE/btu2bdu24T+c6xjFuY7hP/Maymsor+E/HYGirQbP4T+E5Z5GWO7hP9IgDdIgDeI/RdBwUvfl4T9Sdr9Rdr/hP97d3d3d3eE/WQalwsT74T/ep7hkCNbhP/QxOB+D8+E/zjnnnHPO4T9Ei2zn+6nhP2IYhmEYhuE/aTQajUaj4T8AAAAAAMDhP2fMGXPGnOE/oRd6oRd64T9gxQkpeZbhP3TRRRdddOE/LBWxVMRS4T8qZ7fwqzHhP9wUo4a/TeE/aWlpaWlp4T/Ircs74EjhPxaykIUsZOE/P1pNQhR/4T+amZmZmZnhP8afSDileeE/RStsDiRa4T+xEzuxEzvhP8dxHMdxHOE/smsTJbs24T+JESNGjBjhPz801ofG+uA/TJHPusEU4T83YKimYy7hP0jhehSuR+E/ianEVGIq4T/YUF5DeQ3hP9F7JtF7JuE/5SfEWfkJ4T/uUN0O1e3gPyEN0iAN0uA/DVjSy5+24D+fgah0ZM/gP2dAKLlTtOA/mpmZmZmZ4D+aP9h4NH/gP6hb88MiZeA/axRx6KR94D+WqF2J2pXgPw==", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "When you take the average of the standard populati" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "I need to fact-check a citation. 
This is the citat" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "According to github, when was Regression added to " + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "This is a secret message my friend gave me. 
It say" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "The attached image contains a Python script. Run t" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "Look at the attached image. 
The quiz is scored as " + ], + [ + "What is the final numeric output from the attached" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "What was the complete title of the book in which t" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "A 5-man group made up of one tank, one 
healer, and" + ], + [ + "What is the latest chronological year date written" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "When was a picture of St. Thomas Aquinas first add" + ], + [ + "As of the 2020 census, what was the population dif" + ] + ], + "hovertemplate": "agent_name=code_o1_03_february_fix-print-outputs2
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_03_february_fix-print-outputs2", + "line": { + "color": "#FF97FF", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_03_february_fix-print-outputs2", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsA", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA6D+amZmZmZnpP6uqqqqqquo/t23btm3b5j8AAAAAAADoPzmO4ziO4+g/mpmZmZmZ6T9GF1100UXnP1VVVVVVVeU/dmIndmIn5j8lSZIkSZLkP1VVVVVVVeU/AAAAAAAA5j+1tLS0tLTkP+Q4juM4juM/bCivobyG4j8zMzMzMzPjP/Q8z/M8z+M/6aKLLrro4j84velNb3rjP6uqqqqqquI/MzMzMzMz4z8UO7ETO7HjP2gvob2E9uI/27Zt27Zt4z8Jyz2NsNzjP0REREREROQ/nXPOOeec4z8AAAAAAADjP2WTTTbZZOM/09LS0tLS4j8zMzMzMzPjP+Q4juM4juM/bzBFPusG4z/lNZTXUF7jPxQ7sRM7seM/AAAAAAAA5D9L1K5E7UrkPyVJkiRJkuQ/NmVNWVPW5D9ddNFFF13kP/VJn/RJn+Q/QxaykIUs5D/PRn1no77jPwAAAAAAAOQ/5hS8nIKX4z8zMzMzMzPjP3Nzc3Nzc+M/O7ETO7ET4z/BeCv7HFLjP2gvob2E9uI/MzMzMzMz4z+3bdu2bdviP2wor6G8huI/c08jLPc04j9+eWxF0HDiP6uqqqqqquI/sI4271Nc4j+VUkoppZTiP7Msy7Isy+I/AAAAAAAA4z8zMzMzMzPjP+miiy666OI/w6/GRHqg4j/T0tLS0tLiPzDXDsy1A+M/MzMzMzMz4z+/XerJ+O3iP6uqqqqqquI/kyZNmjRp4j+DKfJZN5jiP8aSXyz5xeI/bCivobyG4j+SJEmSJEniP9IgDdIgDeI/dWTPQFQ64j9mZmZmZmbiP8HTrflhkeI/uxK1K1G74j+Ops6vPeTiP8MwDMMwDOM/09LS0tLS4j+/oC/oC/riP+MFMileIOM/6aKLLrro4j8xhznMYQ7jPzMzMzMzM+M/0y/90i/94j+ykIUsZCHjP+2yyy677OI/TK4gJlcQ4z/PLXHq99ziP6uqqqqqquI/8yQyDdvN4j+n4OUUvJziP2r9SoFav+I/4XoUrkfh4j/zIHf9bLHiP4OCgoKCguI/cl4W8Qmk4j9iJ3ZiJ3biP5IkSZIkSeI/NcF4K/sc4j/2C7GiND7iP+0ltJfQXuI/OyMVc6sz4j9UgjwlyFPiPzUngbhQc+I/kiRJkiRJ4j94+yGBtx
/iP+4juI/gPuI/IQtZyEIW4j9zTyMs9zTiP9IgDdIgDeI/4qTuy2Mr4j+SJEmSJEniP2ZmZmZmZuI/y6BUmHg/4j9Hm/cpLhniP/QxOB+D8+E/zjnnnHPO4T/sUbgehevhP3Icx3Ecx+E/aTQajUaj4T8AAAAAAMDhP3fEHXFH3OE/gh/4gR/44T/lWUb0AdXhP/DBBx988OE/3xx9c/TN4T92C78aE+nhPz0gWefKA+I/Hh4eHh4e4j/8cevyDjjiPyV+RomfUeI/oBvz9NFq4j+SJEmSJEniP8oVxOQKYuI/U0/Gb5d64j9EhnsVzJLiPxzHcRzHceI/bBMluzZR4j8SI0aMGDHiP5IkSZIkSeI/mCKfdYMp4j/TMZcITwriPyIiIiIiIuI/kuZIc6Q54j+vobyG8hriP1Kn/FGn/OE/y0+Is/IT4j/2cWEfF/bhP4qd2Imd2OE/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "When you take the average of the standard populati" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "I need to fact-check a citation. 
This is the citat" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "Review the chess position provided in the image. 
I" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "According to github, when was Regression added to " + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "You are Van Helsing, a renowned vampire hunter. 
A " + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "What is the last word before the second chorus of " + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "What is the minimum number of page links a person " + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "The attached image contains a Python script. 
Run t" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "What is the final numeric output from the attached" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "What was the complete title of the book in which t" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "According to the World Bank, which countries had g" + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "What is the latest chronological year date written" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + 
"In the endnote found in the second-to-last paragra" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "When was a picture of St. Thomas Aquinas first add" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "Using the Biopython library in Python, parse the P" + ] + ], + "hovertemplate": "agent_name=code_o1_03_february_goodoldtext-unbroken
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_03_february_goodoldtext-unbroken", + "line": { + "color": "#FECB52", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_03_february_goodoldtext-unbroken", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAA==", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADoPzmO4ziO4+g/ZmZmZmZm5j9ddNFFF13kP1VVVVVVVeU/dmIndmIn5j+3bdu2bdvmP3d3d3d3d+c/AAAAAAAA5j+XlpaWlpbmP8dxHMdxHOc/UV5DeQ3l5T9mZmZmZmbmP7dt27Zt2+Y/0UUXXXTR5T9Ob3rTm97kP1VVVVVVVeU/w/UoXI/C5T/FTuzETuzkP1VVVVVVVeU/btu2bdu25T98GmG5pxHmP1VVVVVVVeU/rbXWWmut5T8AAAAAAADlP1VVVVVVVeU/tbS0tLS05D91UAd1UAflPxzHcRzHceQ/HEyRz7rB5D/YUF5DeQ3lPzVIgzRIg+Q/zczMzMzM5D9L1K5E7UrkPyVJkiRJkuQ/NmVNWVPW5D9ddNFFF13kP/VJn/RJn+Q/QxaykIUs5D/PRn1no77jPwAAAAAAAOQ/5hS8nIKX4z/Xo3A9CtfjP3Nzc3Nzc+M/O7ETO7ET4z/7HFITjLfiP+0ltJfQXuI/CfKUIE8J4j9u27Zt27bhP3AfwX0E9+E/lnsaYbmn4T8NJ3VfHlvhP5qZmZmZmeE/3qe4ZAjW4T8RQgghhBDiP3Icx3Ecx+E/AAAAAAAA4j+SG7mRG7nhP/DBBx988OE/CCpnt/Cr4T/i4eHh4eHhPyELWchCFuI/kiRJkiRJ4j9TT8Zvl3riP47jOI7jOOI/kyZNmjRp4j+YIp91gyniP+xRuB6F6+E/r6G8hvIa4j8De8fUwN7hP9IgDdIgDeI/dWTPQFQ64j8AAAAAAADiPxl4ujU/LOI/9DE4H4Pz4T/xRlPn1x7iP5IkSZIkSeI/cnJycnJy4j+PuCPuiDviP7xAJsULZOI/jC666KKL4j8rWclKVrLiP4Mt2IIt2OI/0y/90i/94j+ykIUsZCHjP+2yyy677OI/C2JyBTG54j/PLXHq99ziP6uqqqqqquI/8yQyDdvN4j+8nIKXU/DiP2r9SoFav+I/4XoUrkfh4j9brAzfiALjPyMjIyMjI+M/FvEJpJLz4j9P7MRO7MTiP3Mpl3Ipl+I/GG9ln0Nq4j85uNkvxIriP+0ltJfQXuI/OyMVc6sz4j9UgjwlyFPiP5gin3WDKe
I/AAAAAAAA4j+Kcx2jONfhP3AfwX0E9+E/IQtZyEIW4j+E5Z5GWO7hP3Icx3Ecx+E/RdBwUvfl4T9yTQRyTQTiP97d3d3d3eE/52v17BC44T/ep7hkCNbhP7GRDhvpsOE/zjnnnHPO4T9Ei2zn+6nhP2IYhmEYhuE/WSwWi8Vi4T8AAAAAAIDhP2fMGXPGnOE/khu5kRu54T9gxQkpeZbhP3TRRRdddOE/BhkXZFyQ4T8IKme38KvhP3Icx3Ecx+E/pqWlpaWl4T/ij1uXd8DhPxolfkaJn+E/l8r2rgO64T+amZmZmZnhP8afSDileeE/ezJ+u9ST4T900UUXXXThP+Q4juM4juE/pPMWQzpv4T+MGDFixIjhP1uE/DU7auE/whT5rBtM4T83YKimYy7hP0jhehSuR+E/ianEVGIq4T/YUF5DeQ3hP9F7JtF7JuE/rfyEOCs/4T8j8SoSryLhP7ETO7ETO+E/OUG4G/se4T8GotKRPQPhP+vSY/5eG+E/MzMzMzMz4T/ti6jW2RfhPw==", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "The object in the British Museum's collection with" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "When you take the average of the standard populati" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "According to github, when was Regression added to " + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "I need to fact-check a citation. 
This is the citat" + ], + [ + "I’m researching species that became invasive after" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "What is the minimum number of page links a person " + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "Review the chess position provided in the image. 
I" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ 
+ "In the NCATS PubChem compound database for Food Ad" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "The attached image contains a Python script. Run t" + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "The year is 2022. 
I am at the National Air and Spa" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "What is the final numeric output from the attached" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "What was the complete title of the book in which t" + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "When was a picture of St. 
Thomas Aquinas first add" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "What is the latest chronological year date written" + ] + ], + "hovertemplate": "agent_name=code_o1_03_february_remove-navigational
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_03_february_remove-navigational", + "line": { + "color": "#636efa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_03_february_remove-navigational", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAA==", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADkP1VVVVVVVeU/MzMzMzMz4z9ddNFFF13kP1VVVVVVVeU/FDuxEzux4z+SJEmSJEniPzMzMzMzM+M/AAAAAAAA4j/x8PDw8PDgPwAAAAAAAOA/DeU1lNdQ3j8AAAAAAADgPzEMwzAMw+A/dNFFF1104T8hC1nIQhbiP6uqqqqqquI/7FG4HoXr4T+xEzuxEzvhP3Icx3Ecx+E/SZIkSZIk4T+WexphuafhPyIiIiIiIuI/lVJKKaWU4j8AAAAAAADiP22yySabbOI/09LS0tLS4j+SJEmSJEniP3Icx3Ecx+E/mCKfdYMp4j/zGsprKK/hP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhPzEMwzAMw+A/GPQFfUFf4D+66KKLLrrgPxEREREREeE/FrKQhSxk4T/E5ApicgXhP1VVVVVVVeE/aKwPjfWh4T/sUbgehevhP5KRkZGRkeE/sRM7sRM74T+pCcZb2efgP/cS2ktoL+E/37D2DWvf4D9JkiRJkiThP3UW01lMZ+E/lnsaYbmn4T8NJ3VfHlvhP5qZmZmZmeE/DcE62rxP4T8IIYQQQgjhP1EURVEUReE/AAAAAAAA4T/RC73QC73gP/jggw8++OA/TKQHKme34D/x8PDw8PDgPwtZyEIWsuA/oQ7qoA7q4D8OJFphcyDhP1VVVVVVVeE/jBgxYsSI4T/CFPmsG0zhPxEREREREeE/eQ3lNZTX4D/lJ8RZ+QnhP7ETO7ETO+E/1uImzO9q4T8zMzMzMzPhPyNl4OnW/OA/yOB8DM7H4D+FN5o6v/bgP0mSJEmSJOE/UVFRUVFR4T9f0Bf0BX3hPwOZFC+QSeE/dNFFF1104T8UoQhFKELhP8EWbMEWbOE/UhmVURmV4T8WspCFLGThPzTRRBNNNOE/xOQKYnIF4T95DeU1lNfgP6uqqqqqquA/sd0sTyLT4D8qeDkFL6fgP3o7Q2LezuA/9ihcj8L14D/sZ4uV4RvhP0FBQUFBQeE/PoFUcl4W4T+xEzuxEzvhP/EVX/EVX+E/b2WfQ2qC4T9ws1+IFaXhP3Icx3Ecx+E/1hmpmFud4T900UUXXXThP8IU
+awbTOE/27Zt27Zt4T+c6xjFuY7hP3UW01lMZ+E/FG01eI5A4T+oEZZ7GmHhP7ETO7ETO+E/cVL35bEV4T/x8PDw8PDgP83MzMzMzOA/HgI3lkGp4D/S5n2KS4bgP6cQaAqBpuA/xhhjjDHG4D+kcD0K16PgPzEMwzAMw+A/OBwOh8Ph4D8AAAAAAADhP0fcEXfEHeE/sRM7sRM74T9WnJCSZxnhP/jggw8++OA/UxFLRSwV4T+7hV+NifTgPxEREREREeE/8fDw8PDw4D+7vAOOFA3hPw/MtQNz7eA/jnn6aDUJ4T9JkiRJkiThP8TkCmJyBeE/DiRaYXMg4T+xEzuxEzvhP1VVVVVVVeE/pPMWQzpv4T8LFSpUqFDhP01c6d6AMuE/whT5rBtM4T+eFCR/XmXhP36x5BdLfuE/jVvGLeOW4T+U11BeQ3nhP5KRkZGRkeE/dNFFF1104T+MMcYYY4zhP0IapEEapOE/+x6RE4S74T8+A1HpyJ7hP29ln0NqguE/ZmZmZmZm4T9epZigu0rhP/cS2ktoL+E/fJu/wqxG4T8sUbsStSvhPw==", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "I’m researching species that became invasive after" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "According to github, when was Regression added to " + ], + [ + "When you take the average of the standard populati" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "Of 
the authors (First M. Last) that worked on the " + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "Review the chess position provided in the image. 
I" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "You are Van Helsing, a renowned vampire hunter. 
A " + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "According to the World Bank, which countries had g" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "The attached image contains a Python script. Run t" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "The year is 2022. 
I am at the National Air and Spa" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "What is the latest chronological year date written" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "What is the final numeric output from the attached" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "What was the complete title of the book in which t" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "When was a picture of St. 
Thomas Aquinas first add" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "At the two-minute mark in the YouTube video upload" + ] + ], + "hovertemplate": "agent_name=code_o1_03_february_text_high-reasoning-effort
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_03_february_text_high-reasoning-effort", + "line": { + "color": "#EF553B", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_03_february_text_high-reasoning-effort", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVdU/AAAAAAAA4D+amZmZmZnZPwAAAAAAAOA/27Zt27Zt2z8AAAAAAADYPxzHcRzHcdw/AAAAAAAA4D900UUXXXThPwAAAAAAAOA/sRM7sRM74T8AAAAAAADgP97d3d3d3d0/AAAAAAAA3D8eHh4eHh7ePxzHcRzHcdw/KK+hvIby2j+amZmZmZnZP9u2bdu2bds/L7rooosu2j+96U1vetPbP6uqqqqqqto/mpmZmZmZ2T/ZiZ3YiZ3YPy+hvYT2Eto/27Zt27Zt2z9huacRlnvaP5qZmZmZmdk/xhhjjDHG2D8AAAAAAADaPyebbLLJJts/PDw8PDw83D8d1EEd1EHdPxzHcRzHcdw/0LrBFPms2z8or6G8hvLaPxqkQRqkQdo/mpmZmZmZ2T+J2pWoXYnaP9u2bdu2bds/s6asKWvK2j+jiy666KLbP1uwBVuwBds/velNb3rT2z9t1Hc26jvbPwAAAAAAANw/27Zt27Zt2z/hehSuR+HaP5ybm5ubm9s/O7ETO7ET2z8KxlvZ55DaPya0l9BeQts/w9o3rH3D2j/btm3btm3bPx/BfQT3Edw/GmG5pxGW2z8EDSd1Xx7bP7y7u7u7u9s/Q7CONu9T3D/nnHPOOefcPxzHcRzHcdw/AAAAAAAA3D/dyI3cyI3cPx988MEHH9w/NSbSA5Wz2z9LS0tLS0vbP64dmGsH5to/27Zt27Zt2z8yfrvUk/HbPxzHcRzHcdw/4MCBAwcO3D/QusEU+azbPylcj8L1KNw/oryG8hrK2z/5CXFWfkLcP33Lt3zLt9w/VDqyZyAq3T/NzMzMzMzcP2t+WKQMPN0/qV2J2pWo3T/3kMuKgRLeP27btm3btt0/Hh4eHh4e3j9xR9wRd8TdPyCT4gUyKd4/jC666KKL3j/vda973evePz/pkz7pk94/3uM93uM93j/f9KY3vendP5dddtlll90/eDbqOxv13T9G2rECYaTdPwAAAAAAAN4/3ixPItOw3T+Dl1PwcgrePw2JeTtDYt4/uB6F61G43j/IXT9brAzfP19fX19fX98/FEgl52UR3z8ndmIndmLfPyD7sR/7sd8/OqQmGG9l3z83+4VYURrfPwntJbSX0N4/hOjxXTiI3j
/WvmHtG9beP/DolbH9jt4/kiRJkiRJ3j9bWOmphZXePwnuI7iP4N4/6k1vetOb3j9HWO5phOXePx/qoR7qod4/VwQNJ3Vf3j9fzKdezKfeP2ZmZmZmZt4/4MYyKBUm3j+KS4ZgHW3ePy6e3OLJLd4/dM4555xz3j+4HoXrUbjeP57neZ7ned4/j8fj8Xg83j8AAAAAAADeP3FH3BF3xN0/ntiJndiJ3T9Ux97aMM3dP5NNNtlkk90/Wt1pdafV3T+K9EDl7BbeP97d3d3d3d0/Hh4eHh4e3j+kaIg/bl3eP+JnlPgZJd4/le1dB3Rj3j++4iu+4iveP3rxJxJOad4/u9ST8dul3j+qz7Q1/m7eP47jOI7jON4/Y0jnLYZ03j/16tWrV6/eP7o3oExc6d4/PusGU+Sz3j8uEZ4UJH/eP7gehetRuN4/+MJ74b3w3j+H8hrKayjfP59J9J5J9N4/4qz8hDgr3z/43nvvvffeP0/sxE7sxN4/EjlBuBv73j9hfleLmzDfPzqkJhhvZd8/mpmZmZmZ3z+P5g82Hs3fP1ikDDzdmt8/sxpFHDpp3z84H4PzMTjfPwgffPDBB98/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "When you take the average of the standard populati" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "I need to fact-check a citation. 
This is the citat" + ], + [ + "What is the minimum number of page links a person " + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "According to github, when was Regression added to " + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "Review the chess position provided in the 
image. I" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "Look at the attached image. 
The quiz is scored as " + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "The attached image contains a Python script. Run t" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "What is the final numeric output from the attached" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "On Cornell Law School website's legal 
information " + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "What was the complete title of the book in which t" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "When was a picture of St. 
Thomas Aquinas first add" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "What is the latest chronological year date written" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "What is the average number of pre-2020 works on th" + ] + ], + "hovertemplate": "agent_name=code_o1_04_february_submission
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_04_february_submission", + "line": { + "color": "#00cc96", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_04_february_submission", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEA", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA8D+amZmZmZnpP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/AAAAAAAA4D900UUXXXThPwAAAAAAAOA/sRM7sRM74T+SJEmSJEniPxEREREREeE/AAAAAAAA4j/T0tLS0tLiP+Q4juM4juM/XkN5DeU15D/NzMzMzMzkP/Q8z/M8z+M/XXTRRRdd5D84velNb3rjP6uqqqqqquI/7FG4HoXr4T+xEzuxEzvhP3Icx3Ecx+E/kiRJkiRJ4j9PIyz3NMLiPzMzMzMzM+M/lVJKKaWU4j8AAAAAAADjP22yySabbOI/09LS0tLS4j8zMzMzMzPjP6uqqqqqquI/bzBFPusG4z9sKK+hvIbiP9IgDdIgDeI/ZmZmZmZm4j+7ErUrUbviP5IkSZIkSeI/p6wpa8qa4j/poosuuujiP9InfdInfeI/IQtZyEIW4j9HfWejvrPhP1VVVVVVVeE/PzTWh8b64D9I4XoUrkfhP/Hw8PDw8OA/2Ymd2Imd4D+pCcZb2efgP3sJ7SW0l+A/SpCnBHlK4D8AAAAAAADgP34E9xHcR+A/3dMIyz2N4D+c1H15bEXgPwAAAAAAAOA/afM+xSVD4D+EEEIIIYTgPxAEQRAEQeA/AAAAAACA4D/RC73QC73gP3zwwQcffOA/b+FXYyI94D94eHh4eHjgPwtZyEIWsuA/oQ7qoA7q4D8OJFphcyDhP1VVVVVVVeE/jBgxYsSI4T/CFPmsG0zhPxEREREREeE/eQ3lNZTX4D9WfkKclZ/gP5AGaZAGaeA/N2F+V4ub4D/NzMzMzMzgP3sJ7SW0l+A/yOB8DM7H4D+2h1xWDJTgPxiGYRiGYeA/kZCQkJCQ4D8w6Av6gr7gP93TCMs9jeA/uuiiiy664D8Oc5jDHObgP2ELtmALtuA/cQiHcAiH4D+GLGQhC1ngPyywwAILLOA/QUyuICZX4D8WCCPtWIHgP1VVVVVVVeA/8MXVDzoq4D8VvJyCl1PgP3+lQK1fKeA/AAAAAAAA4D+YdGoe5K7fP19fX19fX98/XG0MTXew3z8ndmIndmLfPyD7sR/7sd8/AAAAAAAA4D+9U9dycLPfPwAAAAAAAOA/TvvJEti03z9r37D2DWvfPyryWTeYIt8/27Zt27Zt3z8SePshgb
ffPwT3EdxHcN8//HVJ5cO43z8jLPc0wnLfP6D7uZ/7ud8/yFYEDSd13z/gKLvfKLvfPwAAAAAAAOA/jmVQKky83z8AAAAAAADgPyHQFAJNIeA/AAAAAAAA4D9YObTIdr7fP9/3fd/3fd8/0Ofz+Xw+3z8AAAAAAADfP9AX9AV9Qd8/P/ADP/AD3z9xQkqeZUTfPwgffPDBB98/c/TN0TdH3z9FeqBydgvfP5/0SZ/0Sd8/Dw8PDw8P3z/ZLKj2nEzfP/EzSvyMEt8/964DujFP3z9f8RVf8RXfP3usZeiA3d4/u9ST8dul3j8wS8oBkeHeP8dxHMdxHN8/Kmj1pYJW3z/58ePHjx/fP7o3oExc6d4/KvJZN5gi3z/L4ox2D1vfP5NfLPnFkt8//iZ/k7/J3z9DeQ3lNZTfPyB1yh91yt8/cVZ+QpyV3z/LX7L8JcvfPwAAAAAAAOA/S3r50xYa4D8AAAAAAADgP742Yl16zN8/AAAAAAAA4D+P5g82Hs3fP1ikDDzdmt8/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "In terms of geographical distance between capital " + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "When you take the average of the standard populati" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "The object in the British Museum's collection with" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "It is 1999. 
Before you party like it is 1999, plea" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "According to github, when was Regression added to " + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "Compute the check digit 
the Tropicos ID for the Or" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "You are given this Excel file as a map. 
You start " + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "What is the last word before the second chorus of " + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "What is the final numeric output from the attached" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "The attached image contains a Python script. Run t" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "What was the complete title of the book in which t" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ] + ], + "hovertemplate": "agent_name=code_o1_04_february_submission-medium
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_04_february_submission-medium", + "line": { + "color": "#ab63fa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_04_february_submission-medium", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdoaWprbG1ub3BxcnN0dXZ3eHl6e3w=", + "dtype": "i1" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADkP3Icx3Ecx+E/MzMzMzMz4z9ddNFFF13kP1VVVVVVVeU/dmIndmIn5j8lSZIkSZLkP1VVVVVVVeU/AAAAAAAA5D/T0tLS0tLiP3Icx3Ecx+E/bCivobyG4j+amZmZmZnhP5IkSZIkSeI/6aKLLrro4j8hC1nIQhbiP1VVVVVVVeE/7FG4HoXr4T+xEzuxEzvhP3Icx3Ecx+E/kiRJkiRJ4j9PIyz3NMLiPyIiIiIiIuI/lVJKKaWU4j8AAAAAAADiP3TRRRdddOE/4uHh4eHh4T/xFV/xFV/hPzmO4ziO4+A/6wZT5LNu4D95DeU1lNfgP7ETO7ETO+E/zczMzMzM4D8sUbsStSvhPzEMwzAMw+A/R9wRd8Qd4T+66KKLLrrgP7AFW7AFW+A/C1nIQhay4D/E5ApicgXhP6uqqqqqquA/FbycgpdT4D+kcD0K16PgP/Hw8PDw8OA/2Ymd2Imd4D+pCcZb2efgP3sJ7SW0l+A/37D2DWvf4D8lSZIkSZLgP3kN5TWU1+A/3dMIyz2N4D+c1H15bEXgPwAAAAAAAOA/afM+xSVD4D+EEEIIIYTgPzEMwzAMw+A/AAAAAACA4D/wAz/wAz/gP3zwwQcffOA/b+FXYyI94D94eHh4eHjgPwRz7cBcO+A/UAd1UAd14D82BxKtsDngPxzHcRzHceA/ggMHDhw44D8AAAAAAADgP5NfLPnFkt8/AAAAAAAA4D/H1MDeMTXgPwAAAAAAAOA/Mb+rxU2Y3z8AAAAAAADgP1ikDDzdmt8/OB+D8zE43z+U8EZT59feP57neZ7ned4/Hh4eHh4e3j+hL+gL+oLePyCT4gUyKd4/jC666KKL3j/vda973evePz/pkz7pk94/3uM93uM93j/f9KY3vendP5dddtlll90/eDbqOxv13T9G2rECYaTdPwAAAAAAAN4/nkSmYbtZ3j+sD431obHePw2JeTtDYt4/FK5H4XoU3j8pMOnUPMjdPx4eHh4eHt4/zCI+gVRy3j92Yid2YifeP57neZ7ned4/dEhNMN7K3j83+4VYURrfP4X2EtpLaN8/6fFdOIge3z9r37D2DWvfP2P7Hb0ytt8/27Zt27Zt3z8SePshgbffPwAAAAAAAOA//HVJ5cO43z8jLPc0wnLfP9/yLd/yLd8/yFYEDSd13z+fejGfejHfP+/u7u7u7t4/xfuR03yt3j9cMgTraPPePzgfg/MxON8/fO+999573z8IrBxaZDvfPw==", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "When you take the average of the standard 
populati" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "According to github, when was Regression added to " + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "It is 1999. 
Before you party like it is 1999, plea" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "Review the chess position provided in the image. 
I" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ] + ], + "hovertemplate": "agent_name=code_o1_04_february_submission3
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_04_february_submission3", + "line": { + "color": "#FFA15A", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_04_february_submission3", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMA==", + "dtype": "i1" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/kiRJkiRJ4j8AAAAAAADkP1VVVVVVVeU/ZmZmZmZm5j9ddNFFF13kP6uqqqqqquI/sRM7sRM74T8AAAAAAADgPxEREREREeE/AAAAAAAA4j/x8PDw8PDgPwAAAAAAAOA/DeU1lNdQ3j8AAAAAAADgP57neZ7ned4/AAAAAAAA4D/qTW9605vePwAAAAAAAOA/pHA9Ctej4D8AAAAAAADgPwntJbSX0N4/btu2bdu23T9HWO5phOXePwAAAAAAAOA/hBBCCCGE4D8AAAAAAADgPwgffPDBB98/AAAAAAAA4D9QB3VQB3XgPwAAAAAAAOA/6wZT5LNu4D8AAAAAAADgP5AGaZAGaeA/AAAAAAAA4D9kcD4G52PgPwAAAAAAAOA/0Bf0BX1B3z8AAAAAAADgP7AFW7AFW+A/AAAAAAAA4D99Z6O+s1HfPwAAAAAAAOA/1ofG+tBY3z8=", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "When you take the average of the standard populati" + ], + [ + "The object in the British Museum's collection with" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ] + ], + "hovertemplate": "agent_name=code_o1_04_february_submission4
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_04_february_submission4", + "line": { + "color": "#19d3f3", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_04_february_submission4", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAECAwQF", + "dtype": "i1" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADwP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjPwAAAAAAAOA/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "When you take the average of the standard populati" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "Of the authors (First M. 
Last) that worked on the " + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "According to github, when was Regression added to " + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "Review the chess position provided in the image. 
I" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "The object in the British Museum's collection with" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + 
"What time was the Tri-Rail train that carried the " + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "What is the last word before the second chorus of " + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "The attached image contains a Python script. 
Run t" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "What is the final numeric output from the attached" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "A 5-man group made up of one tank, one healer, and" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "What was the complete title of the book in which t" + ], + [ + "What is the latest chronological year date written" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + 
"Who are the pitchers with the number before and af" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "When was a picture of St. Thomas Aquinas first add" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "According to the World Bank, which countries had g" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ] + ], + "hovertemplate": "agent_name=code_o1_04_february_submission5
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_04_february_submission5", + "line": { + "color": "#FF6692", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_04_february_submission5", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVeU/AAAAAAAA4D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADoPzmO4ziO4+g/mpmZmZmZ6T8vuuiiiy7qP6uqqqqqquo/O7ETO7ET6z9JkiRJkiTpP5qZmZmZmek/AAAAAAAA6j94eHh4eHjoP8dxHMdxHOc/Q3kN5TWU5z8AAAAAAADoPxiGYRiGYeg/RhdddNFF5z9kIQtZyELmP1VVVVVVVeU/exSuR+F65D8UO7ETO7HjP2gvob2E9uI/27Zt27Zt4z8Jyz2NsNzjPzMzMzMzM+M/lVJKKaWU4j8AAAAAAADjP22yySabbOI/09LS0tLS4j+SJEmSJEniP6uqqqqqquI/bzBFPusG4z/lNZTXUF7jPxQ7sRM7seM/AAAAAAAA5D9L1K5E7UrkP/Q8z/M8z+M/Bn1BX9AX5D+jiy666KLjPzMzMzMzM+M/OL3pTW964z9MriAmVxDjP1VVVVVVVeM/5hS8nIKX4z/Xo3A9CtfjPxQUFBQUFOQ/7MRO7MRO5D9NMN7KPofkP9pLaC+hveQ/XXTRRRdd5D8lSZIkSZLkP15DeQ3lNeQ/5p5GWO5p5D9fHlsRNJzkP0REREREROQ/JkOwjjbv4z+dc84555zjP/Q8z/M8z+M/AAAAAACA4z8zMzMzMzPjP+miiy666OI/oHJ2C78a4z9LS0tLS0vjPzDXDsy1A+M/4yu+4iu+4j9TT8Zvl3riP47jOI7jOOI/kB8/fvz44T+YIp91gyniP1nyiyW/WOI/bCivobyG4j8hzspPiLPiP/Mt3/It3+I/E+Z3tbgJ4z/NzMzMzMziP8HTrflhkeI/V6J2JWpX4j+/9pDLioHiP5IkSZIkSeI/EhISEhIS4j+PuCPuiDviP7xAJsULZOI/L7rooosu4j8g/ehHP/rhPyIiIiIiIuI/kiRJkiRJ4j+nN73pTW/iP5VSSimllOI/yhXE5Api4j9sKK+hvIbiP1VVVVVVVeI/EpmG7WZ54j+n4OUUvJziP2r9SoFav+I/j8L1KFyP4j/zIHf9bLHiP9PS0tLS0uI/cl4W8Qmk4j9iJ3ZiJ3biP5IkSZIkSeI/GG9ln0Nq4j/2C7GiND7iP+0ltJfQXuI/OyMVc6sz4j8J8pQgTwniP5gin3WDKeI/kiRJkiRJ
4j94+yGBtx/iP3AfwX0E9+E/HYGirQbP4T+E5Z5GWO7hP9IgDdIgDeI/4qTuy2Mr4j9yTQRyTQTiPyIiIiIiIuI/y6BUmHg/4j+wjjbvU1ziPzbSYSMdNuI/U0oppZRS4j+TGARWDi3iP4IgCIIgCOI/iUQikUgk4j8AAAAAAADiP3fEHXFH3OE/gh/4gR/44T9q7oK/ihPiPy+66KKLLuI/kiRJkiRJ4j/l7BZ+NSbiPz0gWefKA+I/Hh4eHh4e4j/8cevyDjjiPyV+RomfUeI/oBvz9NFq4j87qIM6qIPiP8oVxOQKYuI/HUi0wuZA4j++CmZJOSDiP47jOI7jOOI/eoshnbcY4j8SI0aMGDHiP5IkSZIkSeI/DqbIZ91g4j+imo65RHjiP1nyiyW/WOI/kuZIc6Q54j8N5TWU11DiPxK9ZxK9Z+I/kiRJkiRJ4j8rEq8i8SriP9IgDdIgDeI/kROEu7Hv4T8NRKUjewbiP/P32oh16eE/zczMzMzM4T+w8Wj+YOPhP0bKwNOt+eE/yYB6pnLd4T/C+Ricj8HhP6YxYBoDpuE/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "A paper about AI regulation that was originally su" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "The object in the British Museum's collection with" + ], + [ + "According to github, when was Regression added to " + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "Of the authors (First M. 
Last) that worked on the " + ], + [ + "When you take the average of the standard populati" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "In terms of geographical distance between capital " + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "In the endnote found in the second-to-last paragra" + ] + ], + "hovertemplate": "agent_name=code_o1_22-01_managedagent-summary_planning
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_22-01_managedagent-summary_planning", + "line": { + "color": "#B6E880", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_22-01_managedagent-summary_planning", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQg==", + "dtype": "i1" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADwPwAAAAAAAPA/AAAAAAAA6D8zMzMzMzPjP1VVVVVVVeU/t23btm3b5j8AAAAAAADkP3Icx3Ecx+E/AAAAAAAA4D8XXXTRRRfdPwAAAAAAAOA/sRM7sRM74T8AAAAAAADgPxEREREREeE/AAAAAAAA4D8eHh4eHh7ePxzHcRzHcdw/KK+hvIby2j+amZmZmZnZPxiGYRiGYdg/RhdddNFF1z+RhSxkIQvZPwAAAAAAANg/mpmZmZmZ2T/ZiZ3YiZ3YP0J7Ce0ltNc/t23btm3b1j98GmG5pxHWP1VVVVVVVdU/pZRSSiml1D8AAAAAAADUP2WTTTbZZNM/tbS0tLS01D8WX/EVX/HVP1VVVVVVVdU/yWfdYIp81j9DeQ3lNZTXP9mJndiJndg/mpmZmZmZ2T/6GJyPwfnYP3qe53me59k/s6asKWvK2j8vuuiiiy7aP5qZmZmZmdk/pze96U1v2j9t1Hc26jvbPwAAAAAAANw/27Zt27Zt2z/hehSuR+HaP1paWlpaWto/O7ETO7ET2z+WfQ6pCcbbPxzHcRzHcdw/F1100UUX3T8lSZIkSZLcPxbTWUxnMd0/jbDc0wjL3T/msRVBw0ndP83MzMzMzNw/Q7CONu9T3D/fe++9997bP9u2bdu2bds/AAAAAAAA2z9bqZVaqZXaPyebbLLJJts/eqBydgu/2j8=", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "A paper about AI regulation that was originally su" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "The object in the British Museum's collection with" + ], + [ + "According to github, when was Regression added to " + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "What are the EC 
numbers of the two most commonly u" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "When you take the average of the standard populati" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "In terms of geographical distance between capital " + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "It is 1999. 
Before you party like it is 1999, plea" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "Could you help me out with this assignment? Our pr" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "In the year 2022, and before December, what does \"" + ] + ], + "hovertemplate": "agent_name=code_o1_25-01_visioon
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_25-01_visioon", + "line": { + "color": "#FF97FF", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_25-01_visioon", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ=", + "dtype": "i1" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAA8D8AAAAAAADgP1VVVVVVVdU/AAAAAAAA0D+amZmZmZnJP1VVVVVVVdU/27Zt27Zt2z8AAAAAAADYP1VVVVVVVdU/MzMzMzMz0z900UUXXXTRP1VVVVVVVdU/2Ymd2Imd2D+3bdu2bdvWP5qZmZmZmdk/AAAAAAAA2D+XlpaWlpbWPzmO4ziO49g/Q3kN5TWU1z9mZmZmZmbWP1VVVVVVVdU/XXTRRRdd1D9kIQtZyELWP1VVVVVVVdU/exSuR+F61D92Yid2YifWP1VVVVVVVdU/JUmSJEmS1D8Jyz2NsNzTPzMzMzMzM9M/lVJKKaWU0j8AAAAAAADSP3TRRRdddNE/09LS0tLS0j/UQR3UQR3UP+Q4juM4jtM/HEyRz7rB1D9RXkN5DeXVP1VVVVVVVdU/ZmZmZmZm1j/blahdidrVP1VVVVVVVdU/lTVlTVlT1j/RRRdddNHVP1VVVVVVVdU/ZCELWchC1j9dQUyuICbXP6uqqqqqqtY/jfWhsT401j/D9Shcj8LVP1VVVVVVVdU/xU7sxE7s1D/Z55CaYLzVPw==", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "A paper about AI regulation that was originally su" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "The object in the British Museum's collection with" + ], + [ + "According to github, when was Regression added to " + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "Use density measures from the chemistry 
materials " + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "Of the authors (First M. Last) that worked on the " + ], + [ + "When you take the average of the standard populati" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "In terms of geographical distance between capital " + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "Look at the attached image. 
The quiz is scored as " + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "You are Van Helsing, a renowned vampire hunter. A " + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "According to the World Bank, which countries had g" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "The attached image contains a Python script. Run t" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "What is the latest chronological year date written" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ] + ], + "hovertemplate": "agent_name=code_o1_29-01_text
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o1_29-01_text", + "line": { + "color": "#FECB52", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o1_29-01_text", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKissLS4vMDEyMzQ1Njc4OTo7PD0+P0BBQkNERUZHSElKS0xNTk9QUVJTVFVWV1hZWltcXV5fYGFiY2RlZmdo", + "dtype": "i1" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVeU/AAAAAAAA4D+amZmZmZnZPwAAAAAAAOA/kiRJkiRJ4j8AAAAAAADgPxzHcRzHcdw/mpmZmZmZ2T9GF1100UXXP1VVVVVVVdU/2Ymd2Imd2D+3bdu2bdvWP5qZmZmZmdk/AAAAAAAA3D9aWlpaWlraPzmO4ziO49g/Q3kN5TWU1z9mZmZmZmbWP1VVVVVVVdU/XXTRRRdd1D9kIQtZyELWP1VVVVVVVdU/exSuR+F61D8UO7ETO7HTP2gvob2E9tI/kiRJkiRJ0j+WexphuafRPxEREREREdE/hBBCCCGE0D8AAAAAAADQPwgffPDBB88/8fDw8PDw0D+SJEmSJEnSP+Q4juM4jtM/HEyRz7rB1D9RXkN5DeXVP5dv+ZZv+dY/AAAAAAAA2D9qV6J2JWrXPxiGYRiGYdg/9AV9QV/Q1z9GF1100UXXPxdswRZswdY/etOb3vSm1z9icgUxuYLYPwAAAAAAANg/4eUUvJyC1z8K16NwPQrXP5eWlpaWltY/dmIndmIn1j9ln0NqgvHWP0J7Ce0ltNc/cFj7hrVv2D9JkiRJkiTZPzGdxXQW09k/YbmnEZZ72j+Uui+PrQjaP5qZmZmZmdk/WEeb9yku2T/GGGOMMcbYPxiGYRiGYdg/AAAAAAAA2D8YeqEXeqHXPz744IMPPtg/SQ9Uzm7h1z+Ih4eHh4fXP4K5dmCuHdg/+Yqv+Iqv2D/RCpsDiVbYPwAAAAAAANg/vXr16tWr1z+fdYMp8lnXP+UXS36x5Nc/Q3kN5TWU1z9kamDvmBrYP9mJndiJndg/OrJnICod2T/NzMzMzMzYP5Ey8HRrftg/Mjgfg/Mx2D+q82sPuazYPxiGYRiGYdg/GBgYGBgY2D8k7og74o7YP+5phOWeRtg/AAAAAAAA2D983ete97rXP9iCLdiCLdg/2Ymd2Imd2D+GLGQhC1nYP8YYY4wxxtg/YnIFMbmC2D8LhJF2rEDYPwAAAAAAANg/2G6WJ5Fp2D801ofG+tDYPzbZZJNNNtk/mpmZmZmZ2T96kLt+tljZP7q5ubm5udk/i/gEUsl52T+xEzuxEzvZP9mP/diP/dg/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "Of the authors (First M. 
Last) that worked on the " + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "What's the last line of the rhyme under the flavor" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "A paper about AI regulation that was originally su" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "In Valentina Re’s contribution to the 2017 book “W" + ], + [ + "I’m researching species that became invasive after" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "Compute the check digit the Tropicos ID for the Or" + ], + [ + "Could you help me out with this assignment? 
Our pr" + ], + [ + "Given this table defining * on the set S = {a, b, " + ], + [ + "What time was the Tri-Rail train that carried the " + ], + [ + "In the fictional language of Tizin, basic sentence" + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "In terms of geographical distance between capital " + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "I was trying to remember how well the Cheater Beat" + ], + [ + "The attached file contains a list of vendors in th" + ], + [ + "Review the chess position provided in the image. I" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "Who nominated the only Featured Article on English" + ], + [ + "The Latin root of the Yola word \"gimlie\" shares a " + ], + [ + "The attached file shows a list of books in the col" + ], + [ + "According to Google Finance, when was the first ye" + ], + [ + "Using bass clef notes, what is the age of someone " + ], + [ + "On a leap day before the year 2008, a joke was rem" + ], + [ + "On July 15, 2008, Phys.org published an article ab" + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "If there is anything that doesn't make sense in th" + ], + [ + "When you take the average of the standard populati" + ], + [ + "The following numbers function similarly to ISBN 1" + ], + [ + "In the year 2022, and before December, what does \"" + ], + [ + "What is the volume in milliliters of a system comp" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "The attached file lists accommodations in the reso" + ], + [ + "In the NIH translation of the original 1913 Michae" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ], + [ + "You are Van Helsing, a renowned vampire hunter. 
A " + ], + [ + "Find the value of x to the nearest tenth: Lx = (d/" + ], + [ + "You are a telecommunications engineer who wants to" + ], + [ + "According to Box Office Mojo's 2020 Worldwide Box " + ], + [ + "How many applicants for the job in the PDF are onl" + ], + [ + "As of the 2020 census, what was the population dif" + ], + [ + "The Metropolitan Museum of Art has a portrait in i" + ], + [ + "How many slides in this PowerPoint presentation me" + ], + [ + "This is a secret message my friend gave me. It say" + ], + [ + "According to wikipedia, how many Asian countries s" + ], + [ + "The work referenced in footnote 397 of Federico La" + ], + [ + "I was referencing each of the tables in the file f" + ], + [ + "In Nature journal's Scientific Reports conference " + ], + [ + "The attached file shows the locomotives in the col" + ], + [ + "How many nonindigenous crocodiles were found in Fl" + ], + [ + "As a comma separated list with no whitespace, usin" + ], + [ + "According to the World Bank, which countries had g" + ], + [ + "The attached spreadsheet contains the sales of men" + ], + [ + "Who composed the song that was performed by a roos" + ], + [ + "I'm making a grocery list for my mom, but she's a " + ], + [ + "According to github, when was Regression added to " + ], + [ + "In the 2018 VSCode blog post on replit.com, what w" + ], + [ + "Look at the attached image. The quiz is scored as " + ], + [ + "What writer is quoted by Merriam-Webster for the W" + ], + [ + "Examine the video at https://www.youtube.com/watch" + ], + [ + "Hi, I'm making a pie but I could use some help wit" + ], + [ + "In the Scikit-Learn July 2017 changelog, what othe" + ], + [ + "You are given this Excel file as a map. You start " + ], + [ + "How many images are there in the latest 2022 Lego " + ], + [ + "The attached image contains a Python script. 
Run t" + ], + [ + "I thought we could try a fun word puzzle together " + ], + [ + "On ScienceDirect, what is the difference to 3 deci" + ], + [ + "What is the final numeric output from the attached" + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "How many more blocks (also denoted as layers) in B" + ], + [ + "The longest-lived vertebrate is named after an isl" + ], + [ + "On the DeepFruits fruit detection graph on Connect" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "The attached PDF lists accommodations in the resor" + ], + [ + "This spreadsheet contains a list of clients for a " + ], + [ + "How many times was a Twitter/X post cited as a ref" + ], + [ + "During the first week of August 2015, one of the N" + ], + [ + "What is the surname of the equine veterinarian men" + ], + [ + "The YouTube channel Game Grumps began a Let’s Play" + ], + [ + "What is the last word before the second chorus of " + ], + [ + "Who did the actor who played Ray in the Polish-lan" + ], + [ + "I have the Standard plan in the image below, and I" + ], + [ + "In the endnote found in the second-to-last paragra" + ], + [ + "The book with the doi 10.1353/book.24372 concerns " + ], + [ + "Pull out the sentence in the following 5x7 block o" + ], + [ + "What is the latest chronological year date written" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + "Eva Draconis has a personal website which can be a" + ], + [ + "How many at bats did the Yankee with the most walk" + ], + [ + "According to Girls Who Code, how long did it take " + ], + [ + "The attached spreadsheet contains a list of books " + ], + [ + "How many pages if the 2023 IPCC report (85 pages v" + ], + [ + "It's May 2023, and I'm about to drive across the U" + ], + [ + "In Audre Lorde’s poem “Father Son and Holy Ghost”," + ], + [ + "On Cornell Law School website's legal information " + ], + [ + "How many edits were made to the Wikipedia page on " + ], + [ 
+ "Consider the following symbols: 𒐜 𒐐𒐚\n\nThis is a n" + ], + [ + "On the BBC Earth YouTube video of the Top 5 Sillie" + ], + [ + "What is the absolute difference in tens of thousan" + ], + [ + "The attached spreadsheet lists the locomotives own" + ], + [ + "The attached file lists the locomotives owned by a" + ], + [ + "I’m thinking about selling my home, so I want to l" + ], + [ + "When was a picture of St. Thomas Aquinas first add" + ], + [ + "As of August 2023, who is the only winner of the U" + ], + [ + "Take the gender split from the 2011 Bulgarian cens" + ], + [ + "All of the individuals who formally held the posit" + ], + [ + "Hi, I was out sick from my classes on Friday, so I" + ], + [ + "If this whole pint is made up of ice cream, how ma" + ], + [ + "Which of the fruits shown in the 2008 painting \"Em" + ], + [ + "What country had the least number of athletes at t" + ], + [ + "In the YouTube 360 VR video from March 2018 narrat" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "Where were the Vietnamese specimens described by K" + ], + [ + "The cover of the August 2021 issue of Vogue shows " + ], + [ + "I'd like to learn more about some popular reality " + ], + [ + "I read a paper about multiwavelength observations " + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "Given $x_0 = -5$ and $f(x) = x^3 + 4x^2 - 3x + 8$," + ], + [ + "A standard Rubik’s cube has been broken into cubes" + ], + [ + "According to the USGS, in what year was the Americ" + ], + [ + "The attached Excel file contains the sales of menu" + ], + [ + "I'm curious about how much information is availabl" + ], + [ + "What percentage of the total penguin population ac" + ], + [ + "As of May 2023, how many stops are between South S" + ], + [ + "According to Openreview.net, at the NeurIPS 2022 C" + ], + [ + "Of the cities within the United States where U.S. 
" + ], + [ + "Who are the pitchers with the number before and af" + ], + [ + "In the 2015 Metropolitan Museum of Art exhibition " + ], + [ + "On June 6, 2023, an article by Carolyn Collins Pet" + ], + [ + "What is the area of the green polygon in the attac" + ], + [ + "What is the first name of the only Malko Competiti" + ], + [ + "The brand that makes these harnesses the dogs are " + ], + [ + "The year is 2022. I am at the National Air and Spa" + ], + [ + "What was the actual enrollment count of the clinic" + ], + [ + "What was the complete title of the book in which t" + ], + [ + "Bob was invited to participate in a game show, and" + ], + [ + "In NASA's Astronomy Picture of the Day on 2006 Jan" + ], + [ + "At the two-minute mark in the YouTube video upload" + ], + [ + "In the film Goldfinger, what color was the object " + ], + [ + "A 5-man group made up of one tank, one healer, and" + ] + ], + "hovertemplate": "agent_name=code_o3-mini_03_february_remove-navigational
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_o3-mini_03_february_remove-navigational", + "line": { + "color": "#636efa", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_o3-mini_03_february_remove-navigational", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAABAAIAAwAEAAUABgAHAAgACQAKAAsADAANAA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAfACAAIQAiACMAJAAlACYAJwAoACkAKgArACwALQAuAC8AMAAxADIAMwA0ADUANgA3ADgAOQA6ADsAPAA9AD4APwBAAEEAQgBDAEQARQBGAEcASABJAEoASwBMAE0ATgBPAFAAUQBSAFMAVABVAFYAVwBYAFkAWgBbAFwAXQBeAF8AYABhAGIAYwBkAGUAZgBnAGgAaQBqAGsAbABtAG4AbwBwAHEAcgBzAHQAdQB2AHcAeAB5AHoAewB8AH0AfgB/AIAAgQCCAIMAhACFAIYAhwCIAIkAigCLAIwAjQCOAI8AkACRAJIAkwCUAJUAlgCXAJgAmQCaAJsAnACdAJ4AnwCgAKEAogCjAKQA", + "dtype": "i2" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVdU/AAAAAAAA0D+amZmZmZnJP1VVVVVVVcU/kiRJkiRJwj8AAAAAAADAPxzHcRzHccw/mpmZmZmZyT9GF1100UXHPwAAAAAAANA/FDuxEzux0z+SJEmSJEnSP1VVVVVVVdU/AAAAAAAA1D/T0tLS0tLSP3Icx3Ecx9E/XkN5DeU11D9mZmZmZmbWP1VVVVVVVdU/RhdddNFF1z9kIQtZyELWP1VVVVVVVdU/exSuR+F61D8UO7ETO7HTP2gvob2E9tI/kiRJkiRJ0j+WexphuafRPzMzMzMzM9M/lVJKKaWU0j8AAAAAAADSP2WTTTbZZNM/09LS0tLS0j+SJEmSJEnSP3Icx3Ecx9E/whT5rBtM0T9sKK+hvIbSP9IgDdIgDdI/mpmZmZmZ0T+7ErUrUbvSP5IkSZIkSdI/1pQ1ZU1Z0z9ddNFFF13UP5Q+6ZM+6dM/OL3pTW960z9MriAmVxDTP6uqqqqqqtI/kiRJkiRJ0j8zMzMzMzPTP9PS0tLS0tI/FDuxEzux0z/BeCv7HFLTP19CewntJdQ/yFOCPCXI0z/btm3btm3TP2cxncV0FtM/Ccs9jbDc0z/vy2MrgobTPzMzMzMzM9M/JkOwjjbv0z+llFJKKaXUP1VVVVVVVdU/AAAAAAAA1T+WWqmVWqnVP1VVVVVVVdU/F341JtID1T+mpaWlpaXVP1VVVVVVVdU/Fl/xFV/x1T9ItMLmQKLVP1VVVVVVVdU/r169evXq1T/yWTeYIp/VP1VVVVVVVdU/2FBeQ3kN1T/sHVMDe8fUP1VVVVVVVdU/ICod2TMQ1T/NzMzMzMzUPwaebs0Pi9Q/S9SuRO1K1D/6tYdcVgzUPyVJkiRJktQ/VFRUVFRU1D8GfUFf0BfUPwnLPY2w3NM/o4suuuii0z83talNbWrTP5Q+6ZM+6dM/VEZlVEZl1D9DFrKQhSzUP6WUUkoppdQ/Ut/ZqO9s1D8mTv2eW+LUP1VVVVVVVdU/Ffji6gcd1T85BS+n4OXUP1VVVVVVVdU/H4XrUbge1T/L8I0oMOnUP7W0tLS0tNQ/k/OyiE8g1T/FTuzETuzUP5VLuZRLudQ/E4y3ss8h1T9RGh+ZQO/UP1VVVVVVVdU/NFIxtzoj1T+HtW
9Y+4bVP1VVVVVVVdU/SZIkSZIk1T/DSk8trPTUP1pMZzGdxdQ/SeXDuF+X1D/mnkZY7mnUP9RDPdRDPdQ/J3VfHlsR1D+U3W+U3W/UP0RERERERNQ/69khcGMZ1D8mQ7CONu/TP0vUrkTtStQ/IYQQQggh1D97FK5H4XrUPxRFURRFUdQ/CoVCoVAo1D8AAAAAAADUP/aEPWFP2NM/FDuxEzux0z+Hae6Cv4rTP+GDDz744NM/qzut7rS60z9+NSbSA5XTP/42xajhb9M/S0tLS0tL0z8xNguqPSfTPzDXDsy1A9M/UfxFzrDg0j8zMzMzMzPTP0yuICZXENM/K2wOJFph0z8UO7ETO7HTPwAAAAAAANQ/Ccs9jbDc0z+hQoUKFSrUPwJl4kr3BtQ/RT7rBlPk0z8M1XTMJcLTP6DTBjptoNM/n65P16fr0z+ivIbyGsrTP1T+qFP+qNM/zspPiLPy0z/SExw9wdHTPxQ7sRM7sdM/Qbgb+x6R0z/jJszvanHTP8F4K/scUtM/MzMzMzMz0z9Wigm6qxTTP2gvob2E9tI/n6lcd7zY0j+7ErUrUbvSP54S5ClBntI/", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "customdata": [ + [ + "A paper about AI regulation that was originally su" + ], + [ + "If we assume all articles published by Nature in 2" + ], + [ + "In Unlambda, what exact charcter or text needs to " + ], + [ + "I’m researching species that became invasive after" + ], + [ + "The attached spreadsheet shows the inventory for a" + ], + [ + "How many studio albums were published by Mercedes " + ], + [ + "If Eliud Kipchoge could maintain his record-making" + ], + [ + "The object in the British Museum's collection with" + ], + [ + "According to github, when was Regression added to " + ], + [ + "Here's a fun riddle that I think you'll enjoy.\n\nYo" + ], + [ + "Using the Biopython library in Python, parse the P" + ], + [ + "What are the EC numbers of the two most commonly u" + ], + [ + "In July 2, 1959 United States standards for grades" + ], + [ + "In April of 1977, who was the Prime Minister of th" + ], + [ + "Use density measures from the chemistry materials " + ], + [ + "What was the volume in m^3 of the fish bag that wa" + ], + [ + "What is the average number of pre-2020 works on th" + ], + [ + "In the video https://www.youtube.com/watch?v=L1vXC" + ], + [ + "Of the authors (First M. 
Last) that worked on the " + ], + [ + "When you take the average of the standard populati" + ], + [ + "Assuming scientists in the famous youtube video Th" + ], + [ + "In Series 9, Episode 11 of Doctor Who, the Doctor " + ], + [ + "In terms of geographical distance between capital " + ], + [ + "In the NCATS PubChem compound database for Food Ad" + ], + [ + "I need to fact-check a citation. This is the citat" + ], + [ + "Which contributor to the version of OpenCV where s" + ], + [ + "What integer-rounded percentage of the total lengt" + ], + [ + "An office held a Secret Santa gift exchange where " + ], + [ + "What is the maximum length in meters of #9 in the " + ], + [ + "What two-word type of model did Manash Pratim Kash" + ], + [ + "What animals that were mentioned in both Ilias Lag" + ], + [ + "How many High Energy Physics - Lattice articles li" + ], + [ + "The photograph in the Whitney Museum of American A" + ], + [ + ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht eti" + ], + [ + "What is the minimum number of page links a person " + ], + [ + "Each cell in the attached spreadsheet represents a" + ], + [ + "Which of the text elements under CATEGORIES in the" + ], + [ + "I went to Virtue restaurant & bar in Chicago for m" + ], + [ + "¬(A ∧ B) ↔ (¬A ∨ ¬B)\n¬(A ∨ B) ↔ (¬A ∧ ¬B)\n(A → B) " + ], + [ + "My family reunion is this week, and I was assigned" + ], + [ + "In Emily Midkiff's June 2014 article in a journal " + ], + [ + "It is 1999. Before you party like it is 1999, plea" + ], + [ + "Under DDC 633 on Bielefeld University Library's BA" + ] + ], + "hovertemplate": "agent_name=code_qwen-coder-32B_03_february_text
index=%{x}
is_correct=%{y}
question=%{customdata[0]}", + "legendgroup": "code_qwen-coder-32B_03_february_text", + "line": { + "color": "#EF553B", + "dash": "solid" + }, + "marker": { + "symbol": "circle" + }, + "mode": "lines", + "name": "code_qwen-coder-32B_03_february_text", + "showlegend": true, + "type": "scattergl", + "x": { + "bdata": "AAECAwQFBgcICQoLDA0ODxAREhMUFRYXGBkaGxwdHh8gISIjJCUmJygpKg==", + "dtype": "i1" + }, + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAADgP1VVVVVVVdU/AAAAAAAA0D+amZmZmZnZP1VVVVVVVdU/kiRJkiRJ0j8AAAAAAADQPxzHcRzHccw/mpmZmZmZyT9GF1100UXHP1VVVVVVVcU/FDuxEzuxwz+SJEmSJEnCP5qZmZmZmck/AAAAAAAA0D8eHh4eHh7OPxzHcRzHccw/KK+hvIbyyj+amZmZmZnJP57neZ7nec4/F1100UUXzT+96U1vetPLP6uqqqqqqso/mpmZmZmZyT/ZiZ3YiZ3IP0J7Ce0ltMc/t23btm3bxj98GmG5pxHGP1VVVVVVVcU/pZRSSimlxD8AAAAAAADEP2WTTTbZZMM/l5aWlpaWxj8WX/EVX/HFPzmO4ziO48g/doMp8lk3yD9DeQ3lNZTHPxqkQRqkQco/zczMzMzMzD8ZnI/B+RjMP9u2bdu2bcs/s6asKWvKyj8=", + "dtype": "f8" + }, + "yaxis": "y" + } + ], + "layout": { + "legend": { + "title": { + "text": "agent_name" + }, + "tracegroupgap": 0 + }, + "margin": { + "t": 60 + }, + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, 
+ "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], 
+ [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermap": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermap" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + 
"colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + 
], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + 
"ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 1 + ], + "title": { + "text": "index" + } + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 1 + ], + "title": { + "text": "is_correct" + } + } + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import plotly.express as px\n", + "\n", + "\n", + "cumulative_df = (\n", + " (\n", + " sel_df.groupby(\"agent_name\")[[\"is_correct\", \"is_near_correct\"]]\n", + " .expanding(min_periods=1, axis=0, method=\"single\")\n", + " .agg({\"is_correct\": \"mean\", \"is_near_correct\": \"count\"})\n", + " .reset_index()\n", + " )\n", + " .copy()\n", + " .rename(columns={\"is_near_correct\": \"index\"})\n", + ")\n", + "cumulative_df[\"index\"] = cumulative_df[\"index\"].astype(int) - 1\n", + "\n", + "\n", + "def find_question(row):\n", + " try:\n", + " res = sel_df.loc[sel_df[\"agent_name\"] == row[\"agent_name\"], \"question\"].iloc[row[\"index\"]][:50]\n", + " return res\n", + " except Exception:\n", + " return \"\"\n", + "\n", + "\n", + "cumulative_df[\"question\"] = cumulative_df.apply(find_question, axis=1)\n", + "# cumulative_df[\"question\"] = [el[:50] for el in sel_df[\"question\"].values]\n", + "\n", + "# cumulative_df[\"is_correct\"] = cumulative_df[\"is_correct\"] * (165 - 68) / 165\n", + "\n", + "px.line(\n", + " cumulative_df,\n", + " color=\"agent_name\",\n", + " x=\"index\",\n", + " y=\"is_correct\",\n", + " hover_data=\"question\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Dive deeper into one run" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "165\n" + ] + } + ], + "source": [ + "sel_df = result_df.loc[result_df[\"agent_name\"] == o1]\n", + "print(len(sel_df))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Count errors" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:10: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be 
set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n", + "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/2022001392.py:11: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "\n", + "error_types = [\n", + " \"AgentParsingError\",\n", + " \"AgentExecutionError\",\n", + " \"AgentMaxIterationsError\",\n", + " \"AgentGenerationError\",\n", + "]\n", + "sel_df[error_types] = 0\n", + "sel_df[\"Count steps\"] = np.nan\n", + "\n", + "\n", + "def count_errors(row):\n", + " if isinstance(row[\"intermediate_steps\"], list):\n", + " row[\"Count steps\"] = len(row[\"intermediate_steps\"])\n", + " for step in row[\"intermediate_steps\"]:\n", + " if isinstance(step, dict) and \"error\" in step:\n", + " try:\n", + " row[str(step[\"error\"][\"error_type\"])] += 1\n", + " except Exception:\n", + " pass\n", + " return row\n", + "\n", + "\n", + "sel_df = sel_df.apply(count_errors, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "hovertemplate": "is_correct=False
variable=%{x}
Average count=%{y}", + "legendgroup": "False", + "marker": { + "color": "#636efa", + "pattern": { + "shape": "" + } + }, + "name": "False", + "orientation": "v", + "showlegend": true, + "textposition": "outside", + "type": "bar", + "x": [ + "AgentParsingError", + "AgentExecutionError", + "AgentMaxIterationsError", + "AgentGenerationError", + "Count steps" + ], + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACSJEmSJEkMQA==", + "dtype": "f8" + }, + "yaxis": "y" + }, + { + "hovertemplate": "is_correct=True
variable=%{x}
Average count=%{y}", + "legendgroup": "True", + "marker": { + "color": "#EF553B", + "pattern": { + "shape": "" + } + }, + "name": "True", + "orientation": "v", + "showlegend": true, + "textposition": "outside", + "type": "bar", + "x": [ + "AgentParsingError", + "AgentExecutionError", + "AgentMaxIterationsError", + "AgentGenerationError", + "Count steps" + ], + "xaxis": "x", + "y": { + "bdata": "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABPt+aHRcoIQA==", + "dtype": "f8" + }, + "yaxis": "y" + } + ], + "layout": { + "bargroupgap": 0, + "barmode": "group", + "height": 500, + "legend": { + "title": { + "text": "is_correct" + }, + "tracegroupgap": 0 + }, + "margin": { + "t": 60 + }, + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 
0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + 
], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermap": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermap" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + 
], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + 
"#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "width": 800, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 1 + ], + 
"title": { + "text": "variable" + } + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 1 + ], + "title": { + "text": "Average count" + } + } + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import plotly.express as px\n", + "\n", + "\n", + "aggregate_errors = (\n", + " sel_df.groupby([\"is_correct\"])[error_types + [\"Count steps\"]].mean().reset_index().melt(id_vars=[\"is_correct\"])\n", + ")\n", + "\n", + "fig = px.bar(\n", + " aggregate_errors,\n", + " y=\"value\",\n", + " x=\"variable\",\n", + " color=\"is_correct\",\n", + " labels={\n", + " \"agent_name\": \"Model\",\n", + " \"task\": \"Level\",\n", + " \"aggregate_score\": \"Performance\",\n", + " \"value\": \"Average count\",\n", + " \"eval_score_GPT4\": \"Score\",\n", + " },\n", + ")\n", + "fig.update_layout(\n", + " height=500,\n", + " width=800,\n", + " barmode=\"group\",\n", + " bargroupgap=0.0,\n", + ")\n", + "fig.update_traces(textposition=\"outside\")\n", + "fig.write_image(\"aggregate_errors.png\", scale=3)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect result by file extension type" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
is_correctcount_stepsquestion
attachment_type
None0.4237994.9597252185
csv0.0000007.75000016
docx0.5714294.90476221
jpg0.1428575.75000028
jsonld0.0000006.60000015
mp30.4800004.50000050
pdb0.0000004.44444418
pdf0.5882354.13725551
png0.2167834.412587143
pptx0.8823534.05882417
py1.0000004.26666715
txt0.7058824.76470617
xlsx0.6127454.823529204
zip0.4482765.34482829
\n", + "
" + ], + "text/plain": [ + " is_correct count_steps question\n", + "attachment_type \n", + "None 0.423799 4.959725 2185\n", + "csv 0.000000 7.750000 16\n", + "docx 0.571429 4.904762 21\n", + "jpg 0.142857 5.750000 28\n", + "jsonld 0.000000 6.600000 15\n", + "mp3 0.480000 4.500000 50\n", + "pdb 0.000000 4.444444 18\n", + "pdf 0.588235 4.137255 51\n", + "png 0.216783 4.412587 143\n", + "pptx 0.882353 4.058824 17\n", + "py 1.000000 4.266667 15\n", + "txt 0.705882 4.764706 17\n", + "xlsx 0.612745 4.823529 204\n", + "zip 0.448276 5.344828 29" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(\n", + " result_df.groupby([\"attachment_type\"])[[\"is_correct\", \"count_steps\", \"question\"]].agg(\n", + " {\"is_correct\": \"mean\", \"count_steps\": \"mean\", \"question\": \"count\"}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Ensembling methods" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "counts = result_df[\"agent_name\"].value_counts()\n", + "long_series = result_df.loc[result_df[\"agent_name\"].isin(counts[counts > 140].index)]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "agent_name\n", + "code_gpt4o_03_february_goodoldtext-unbroken 38.36\n", + "code_gpt4o_03_february_magenticbrowser 35.22\n", + "code_gpt4o_03_february_magenticbrowser2 36.54\n", + "code_gpt4o_03_february_text 37.58\n", + "code_o1_01_february_text 49.09\n", + "code_o1_03_february_ablation-toolcalling-manager 32.73\n", + "code_o1_03_february_fix-print-outputs 51.83\n", + "code_o1_03_february_fix-print-outputs2 55.77\n", + "code_o1_03_february_goodoldtext-unbroken 53.42\n", + "code_o1_03_february_remove-navigational 53.66\n", + "code_o1_03_february_text_high-reasoning-effort 48.48\n", + "code_o1_04_february_submission 49.38\n", + 
"code_o1_04_february_submission5 55.15\n", + "code_o3-mini_03_february_remove-navigational 29.09\n", + "Name: is_correct, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Majority score: 58.18\n", + "Oracle score: 72.73\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/6m/9b1tts6d5w960j80wbw9tx3m0000gn/T/ipykernel_25011/3287428472.py:20: DeprecationWarning:\n", + "\n", + "DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + "\n" + ] + } + ], + "source": [ + "def majority_vote(df):\n", + " df = df[(df[\"prediction\"] != \"Unable to determine\") & (~df[\"prediction\"].isna()) & (df[\"prediction\"] != \"None\")]\n", + "\n", + " answer_modes = df.groupby(\"question\")[\"prediction\"].agg(lambda x: x.mode()[0]).reset_index()\n", + " first_occurrences = (\n", + " df.groupby([\"question\", \"prediction\"]).agg({\"task\": \"first\", \"is_correct\": \"first\"}).reset_index()\n", + " )\n", + " result = answer_modes.merge(first_occurrences, on=[\"question\", \"prediction\"], how=\"left\")\n", + "\n", + " return result\n", + "\n", + "\n", + "def oracle(df):\n", + " def get_first_correct_or_first_wrong(group):\n", + " correct_answers = group[group[\"is_correct\"]]\n", + " if len(correct_answers) > 0:\n", + " return correct_answers.iloc[0]\n", + " return group.iloc[0]\n", + "\n", + " result = df.groupby(\"question\").apply(get_first_correct_or_first_wrong)\n", + "\n", + " return result.reset_index(drop=True)\n", + "\n", + "\n", + "display((long_series.groupby(\"agent_name\")[\"is_correct\"].mean() * 100).round(2))\n", + "print(f\"Majority score: 
{majority_vote(long_series)['is_correct'].mean() * 100:.2f}\")\n", + "print(f\"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "agent_run = \"code_o1_04_february_submission5.jsonl\"\n", + "df = pd.read_json(f\"output/validation/{agent_run}\", lines=True)\n", + "df = df[[\"task_id\", \"prediction\", \"intermediate_steps\"]]\n", + "df = df.rename(columns={\"prediction\": \"model_answer\", \"intermediate_steps\": \"reasoning_trace\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_json(\"submission.jsonl\", orient=\"records\", lines=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gaia", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/open_deep_research/requirements.txt b/examples/open_deep_research/requirements.txt new file mode 100644 index 000000000..a18936ae4 --- /dev/null +++ b/examples/open_deep_research/requirements.txt @@ -0,0 +1,39 @@ +anthropic>=0.37.1 +beautifulsoup4>=4.12.3 +datasets>=2.21.0 +google_search_results>=2.4.2 +huggingface_hub>=0.23.4 +mammoth>=1.8.0 +markdownify>=0.13.1 +numexpr>=2.10.1 +numpy>=2.1.2 +openai>=1.52.2 +openpyxl +pandas>=2.2.3 +pathvalidate>=3.2.1 +pdfminer>=20191125 +pdfminer.six>=20240706 +Pillow>=11.0.0 +puremagic>=1.28 +pypdf>=5.1.0 +python-dotenv>=1.0.1 +python_pptx>=1.0.2 
+Requests>=2.32.3 +serpapi>=0.1.5 +tqdm>=4.66.4 +torch>=2.2.2 +torchvision>=0.17.2 +transformers>=4.46.0 +youtube_transcript_api>=0.6.2 +chess +sympy +pubchempy +Bio +scikit-learn +scipy +pydub +PyPDF2 +python-pptx +torch +xlrd +SpeechRecognition \ No newline at end of file diff --git a/examples/open_deep_research/run.py b/examples/open_deep_research/run.py new file mode 100644 index 000000000..2dcddab4f --- /dev/null +++ b/examples/open_deep_research/run.py @@ -0,0 +1,146 @@ +import argparse +import os +import threading + +from dotenv import load_dotenv +from huggingface_hub import login +from scripts.text_inspector_tool import TextInspectorTool +from scripts.text_web_browser import ( + ArchiveSearchTool, + FinderTool, + FindNextTool, + PageDownTool, + PageUpTool, + SearchInformationTool, + SimpleTextBrowser, + VisitTool, +) +from scripts.visual_qa import visualizer + +from smolagents import ( + CodeAgent, + # HfApiModel, + LiteLLMModel, + ToolCallingAgent, +) + + +AUTHORIZED_IMPORTS = [ + "requests", + "zipfile", + "os", + "pandas", + "numpy", + "sympy", + "json", + "bs4", + "pubchempy", + "xml", + "yahoo_finance", + "Bio", + "sklearn", + "scipy", + "pydub", + "io", + "PIL", + "chess", + "PyPDF2", + "pptx", + "torch", + "datetime", + "fractions", + "csv", +] +load_dotenv(override=True) +login(os.getenv("HF_TOKEN")) + +append_answer_lock = threading.Lock() + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "question", type=str, help="for example: 'How many studio albums did Mercedes Sosa release before 2007?'" + ) + parser.add_argument("--model-id", type=str, default="o1") + return parser.parse_args() + + +custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"} + +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0" + +BROWSER_CONFIG = { + "viewport_size": 1024 * 5, + "downloads_folder": "downloads_folder", + 
"request_kwargs": { + "headers": {"User-Agent": user_agent}, + "timeout": 300, + }, + "serpapi_key": os.getenv("SERPAPI_API_KEY"), +} + +os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True) + + +def main(): + args = parse_args() + text_limit = 100000 + + model = LiteLLMModel( + args.model_id, + custom_role_conversions=custom_role_conversions, + max_completion_tokens=8192, + reasoning_effort="high", + ) + document_inspection_tool = TextInspectorTool(model, text_limit) + + browser = SimpleTextBrowser(**BROWSER_CONFIG) + + WEB_TOOLS = [ + SearchInformationTool(browser), + VisitTool(browser), + PageUpTool(browser), + PageDownTool(browser), + FinderTool(browser), + FindNextTool(browser), + ArchiveSearchTool(browser), + TextInspectorTool(model, text_limit), + ] + + text_webbrowser_agent = ToolCallingAgent( + model=model, + tools=WEB_TOOLS, + max_steps=20, + verbosity_level=2, + planning_interval=4, + name="search_agent", + description="""A team member that will search the internet to answer your question. + Ask him for all your questions that require browsing the web. + Provide him as much context as possible, in particular if you need to search on a specific timeframe! + And don't hesitate to provide him with a complex search task, like finding a difference between two webpages. + Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords. + """, + provide_run_summary=True, + ) + text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files. + If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it. 
+ Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information.""" + + manager_agent = CodeAgent( + model=model, + tools=[visualizer, document_inspection_tool], + max_steps=12, + verbosity_level=2, + additional_authorized_imports=AUTHORIZED_IMPORTS, + planning_interval=4, + managed_agents=[text_webbrowser_agent], + ) + + answer = manager_agent.run(args.question) + + print(f"Got this answer: {answer}") + + +if __name__ == "__main__": + main() diff --git a/examples/open_deep_research/run_gaia.py b/examples/open_deep_research/run_gaia.py new file mode 100644 index 000000000..fa59fc03e --- /dev/null +++ b/examples/open_deep_research/run_gaia.py @@ -0,0 +1,297 @@ +import argparse +import json +import os +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime +from pathlib import Path +from typing import List + +import datasets +import pandas as pd +from dotenv import load_dotenv +from huggingface_hub import login +from scripts.reformulator import prepare_response +from scripts.run_agents import ( + get_single_file_description, + get_zip_description, +) +from scripts.text_inspector_tool import TextInspectorTool +from scripts.text_web_browser import ( + ArchiveSearchTool, + FinderTool, + FindNextTool, + PageDownTool, + PageUpTool, + SearchInformationTool, + SimpleTextBrowser, + VisitTool, +) +from scripts.visual_qa import visualizer +from tqdm import tqdm + +from smolagents import ( + CodeAgent, + # HfApiModel, + LiteLLMModel, + Model, + ToolCallingAgent, +) + + +AUTHORIZED_IMPORTS = [ + "requests", + "zipfile", + "os", + "pandas", + "numpy", + "sympy", + "json", + "bs4", + "pubchempy", + "xml", + "yahoo_finance", + "Bio", + "sklearn", + "scipy", + "pydub", + "io", + "PIL", + "chess", + "PyPDF2", + "pptx", + "torch", + "datetime", + "fractions", + "csv", +] 
+load_dotenv(override=True) +login(os.getenv("HF_TOKEN")) + +append_answer_lock = threading.Lock() + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--concurrency", type=int, default=8) + parser.add_argument("--model-id", type=str, default="o1") + parser.add_argument("--run-name", type=str, required=True) + return parser.parse_args() + + +### IMPORTANT: EVALUATION SWITCHES + +print("Make sure you deactivated Tailscale VPN, else some URLs will be blocked!") + +USE_OPEN_MODELS = False + +SET = "validation" + +custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"} + +### LOAD EVALUATION DATASET + +eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")[SET] +eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"}) + + +def preprocess_file_paths(row): + if len(row["file_name"]) > 0: + row["file_name"] = f"data/gaia/{SET}/" + row["file_name"] + return row + + +eval_ds = eval_ds.map(preprocess_file_paths) +eval_df = pd.DataFrame(eval_ds) +print("Loaded evaluation dataset:") +print(eval_df["task"].value_counts()) + +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0" + +BROWSER_CONFIG = { + "viewport_size": 1024 * 5, + "downloads_folder": "downloads_folder", + "request_kwargs": { + "headers": {"User-Agent": user_agent}, + "timeout": 300, + }, + "serpapi_key": os.getenv("SERPAPI_API_KEY"), +} + +os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True) + + +def create_agent_hierarchy(model: Model): + text_limit = 100000 + ti_tool = TextInspectorTool(model, text_limit) + + browser = SimpleTextBrowser(**BROWSER_CONFIG) + + WEB_TOOLS = [ + SearchInformationTool(browser), + VisitTool(browser), + PageUpTool(browser), + PageDownTool(browser), + FinderTool(browser), + FindNextTool(browser), + ArchiveSearchTool(browser), + TextInspectorTool(model, text_limit), + ] + 
text_webbrowser_agent = ToolCallingAgent( + model=model, + tools=WEB_TOOLS, + max_steps=20, + verbosity_level=2, + planning_interval=4, + name="search_agent", + description="""A team member that will search the internet to answer your question. + Ask him for all your questions that require browsing the web. + Provide him as much context as possible, in particular if you need to search on a specific timeframe! + And don't hesitate to provide him with a complex search task, like finding a difference between two webpages. + Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords. + """, + provide_run_summary=True, + ) + text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files. + If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it. + Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information.""" + + manager_agent = CodeAgent( + model=model, + tools=[visualizer, ti_tool], + max_steps=12, + verbosity_level=2, + additional_authorized_imports=AUTHORIZED_IMPORTS, + planning_interval=4, + managed_agents=[text_webbrowser_agent], + ) + return manager_agent + + +def append_answer(entry: dict, jsonl_file: str) -> None: + jsonl_file = Path(jsonl_file) + jsonl_file.parent.mkdir(parents=True, exist_ok=True) + with append_answer_lock, open(jsonl_file, "a", encoding="utf-8") as fp: + fp.write(json.dumps(entry) + "\n") + assert os.path.exists(jsonl_file), "File not found!" 
+ print("Answer exported to file:", jsonl_file.resolve()) + + +def answer_single_question(example, model_id, answers_file, visual_inspection_tool): + model = LiteLLMModel( + model_id, + custom_role_conversions=custom_role_conversions, + max_completion_tokens=8192, + reasoning_effort="high", + ) + # model = HfApiModel("Qwen/Qwen2.5-72B-Instruct", provider="together") + # "https://lnxyuvj02bpe6mam.us-east-1.aws.endpoints.huggingface.cloud", + # custom_role_conversions=custom_role_conversions, + # # provider="sambanova", + # max_tokens=8096, + # ) + document_inspection_tool = TextInspectorTool(model, 100000) + + agent = create_agent_hierarchy(model) + + augmented_question = """You have one question to answer. It is paramount that you provide a correct answer. +Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded. +Run verification steps if that's needed, you must make sure you find the correct answer! 
+Here is the task: +""" + example["question"] + + if example["file_name"]: + if ".zip" in example["file_name"]: + prompt_use_files = "\n\nTo solve the task above, you will have to use these attached files:\n" + prompt_use_files += get_zip_description( + example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool + ) + else: + prompt_use_files = "\n\nTo solve the task above, you will have to use this attached file:" + prompt_use_files += get_single_file_description( + example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool + ) + augmented_question += prompt_use_files + + start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + try: + # Run agent 🚀 + final_result = agent.run(augmented_question) + + agent_memory = agent.write_memory_to_messages(summary_mode=True) + + final_result = prepare_response(augmented_question, agent_memory, reformulation_model=model) + + output = str(final_result) + for memory_step in agent.memory.steps: + memory_step.model_input_messages = None + intermediate_steps = [str(step) for step in agent.memory.steps] + + # Check for parsing errors which indicate the LLM failed to follow the required format + parsing_error = True if any(["AgentParsingError" in step for step in intermediate_steps]) else False + + # check if iteration limit exceeded + iteration_limit_exceeded = True if "Agent stopped due to iteration limit or time limit." 
in output else False + raised_exception = False + + except Exception as e: + print("Error on ", augmented_question, e) + output = None + intermediate_steps = [] + parsing_error = False + iteration_limit_exceeded = False + exception = e + raised_exception = True + end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + annotated_example = { + "agent_name": model.model_id, + "question": example["question"], + "augmented_question": augmented_question, + "prediction": output, + "intermediate_steps": intermediate_steps, + "parsing_error": parsing_error, + "iteration_limit_exceeded": iteration_limit_exceeded, + "agent_error": str(exception) if raised_exception else None, + "start_time": start_time, + "end_time": end_time, + "task": example["task"], + "task_id": example["task_id"], + "true_answer": example["true_answer"], + } + append_answer(annotated_example, answers_file) + + +def get_examples_to_answer(answers_file, eval_ds) -> List[dict]: + print(f"Loading answers from {answers_file}...") + try: + done_questions = pd.read_json(answers_file, lines=True)["question"].tolist() + print(f"Found {len(done_questions)} previous results!") + except Exception as e: + print("Error when loading records: ", e) + print("No usable records! 
▶️ Starting new.") + done_questions = [] + return [line for line in eval_ds.to_list() if line["question"] not in done_questions] + + +def main(): + args = parse_args() + print(f"Starting run with arguments: {args}") + + answers_file = f"output/{SET}/{args.run_name}.jsonl" + tasks_to_run = get_examples_to_answer(answers_file, eval_ds) + + with ThreadPoolExecutor(max_workers=args.concurrency) as exe: + futures = [ + exe.submit(answer_single_question, example, args.model_id, answers_file, visualizer) + for example in tasks_to_run + ] + for f in tqdm(as_completed(futures), total=len(tasks_to_run), desc="Processing tasks"): + f.result() + + # for example in tasks_to_run: + # answer_single_question(example, args.model_id, answers_file, visualizer) + print("All tasks processed.") + + +if __name__ == "__main__": + main() diff --git a/examples/open_deep_research/scripts/cookies.py b/examples/open_deep_research/scripts/cookies.py new file mode 100644 index 000000000..8e4233356 --- /dev/null +++ b/examples/open_deep_research/scripts/cookies.py @@ -0,0 +1,715 @@ +from requests.cookies import RequestsCookieJar + + +COOKIES_LIST = [ + { + "domain": ".youtube.com", + "expirationDate": 1718884961, + "hostOnly": False, + "httpOnly": False, + "name": "ST-xuwub9", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753004444.745411, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-YEC", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": 
"CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050824, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-3PSID", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420959.974642, + "hostOnly": False, + "httpOnly": False, + "name": "SIDCC", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050652, + "hostOnly": False, + "httpOnly": False, + "name": "SID", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420958.397534, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-1PSIDTS", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753433494.44729, + "hostOnly": False, + "httpOnly": False, + "name": "_ga_M0180HEFCY", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GS1.1.1718871908.1.0.1718873494.0.0.0", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050933, + "hostOnly": False, + "httpOnly": False, + "name": "SAPISID", + "path": "/", + "sameSite": 
None, + "secure": True, + "session": False, + "storeId": None, + "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420959.974764, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-1PSIDCC", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050881, + "hostOnly": False, + "httpOnly": True, + "name": "SSID", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "AmlwXHnQvOQ10LVd-", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050959, + "hostOnly": False, + "httpOnly": False, + "name": "__Secure-1PAPISID", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050795, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-1PSID", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050993, + "hostOnly": False, + "httpOnly": False, + "name": "__Secure-3PAPISID", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420959.974815, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-3PSIDCC", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": 
"AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg", + }, + { + "domain": ".youtube.com", + "expirationDate": 1750420958.397647, + "hostOnly": False, + "httpOnly": True, + "name": "__Secure-3PSIDTS", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050908, + "hostOnly": False, + "httpOnly": False, + "name": "APISID", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753434620.050855, + "hostOnly": False, + "httpOnly": True, + "name": "HSID", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "AasA7hmRuTFv7vjoq", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753435873.577793, + "hostOnly": False, + "httpOnly": True, + "name": "LOGIN_INFO", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3", + }, + { + "domain": ".youtube.com", + "expirationDate": 1753444956.555608, + "hostOnly": False, + "httpOnly": False, + "name": "PREF", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100", + }, +] + +COOKIES_LIST += [ + { + "domain": ".www.researchgate.net", + "hostOnly": False, + "httpOnly": True, + "name": "isInstIp", + "path": "/", + "sameSite": 
None, + "secure": True, + "session": True, + "storeId": None, + "value": "False", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1734423981, + "hostOnly": False, + "httpOnly": False, + "name": "__eoi", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc", + }, + { + "domain": ".www.researchgate.net", + "expirationDate": 1753444909.646103, + "hostOnly": False, + "httpOnly": True, + "name": "ptc", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "RG1.8947708639250500550.1718872043", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1750507578, + "hostOnly": False, + "httpOnly": False, + "name": "euconsent-v2-didomi", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1718885236, + "hostOnly": False, + "httpOnly": False, + "name": "_gat", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "1", + }, + { + "domain": "www.researchgate.net", + "expirationDate": 1721477183, + "hostOnly": True, + "httpOnly": False, + "name": "_pbjs_userid_consent_data", + "path": "/", + "sameSite": "lax", + "secure": False, + "session": False, + "storeId": None, + "value": "3524755945110770", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1752567981, + 
"hostOnly": False, + "httpOnly": False, + "name": "__gads", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1718886709.646173, + "hostOnly": False, + "httpOnly": True, + "name": "__cf_bm", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": False, + "storeId": None, + "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1752567981, + "hostOnly": False, + "httpOnly": False, + "name": "__gpi", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg", + }, + { + "domain": ".researchgate.net", + "hostOnly": False, + "httpOnly": True, + "name": "_cfuvid", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": True, + "storeId": None, + "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1753445177.271667, + "hostOnly": False, + "httpOnly": False, + "name": "_ga", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.1.1525244793.1718885177", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1753445177.271482, + "hostOnly": False, + "httpOnly": False, + "name": "_ga_4P31SJ70EJ", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GS1.1.1718885177.1.0.1718885177.0.0.0", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1718971576, + "hostOnly": False, + "httpOnly": False, + "name": "_gid", + 
"path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.2.854907463.1718885177", + }, + { + "domain": ".www.researchgate.net", + "expirationDate": 1750407982.506505, + "hostOnly": False, + "httpOnly": True, + "name": "did", + "path": "/", + "sameSite": None, + "secure": True, + "session": False, + "storeId": None, + "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1750507578, + "hostOnly": False, + "httpOnly": False, + "name": "didomi_token", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkF
qMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9", + }, + { + "domain": ".www.researchgate.net", + "hostOnly": False, + "httpOnly": True, + "name": "hasPdpNext", + "path": "/", + "sameSite": None, + "secure": True, + "session": True, + "storeId": None, + "value": "False", + }, + { + "domain": ".researchgate.net", + "expirationDate": 1750421183, + "hostOnly": False, + "httpOnly": False, + "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D", + }, + { + "domain": ".www.researchgate.net", + "hostOnly": False, + "httpOnly": True, + "name": "sid", + "path": "/", + "sameSite": None, + "secure": True, + "session": True, + "storeId": None, + "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ", + }, +] + +COOKIES_LIST += [ + { + "domain": "github.com", + "hostOnly": True, + "httpOnly": True, + "name": "_gh_sess", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": True, + "storeId": None, + "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D", + }, + { + "domain": ".github.com", + "expirationDate": 1750408875.763785, + "hostOnly": False, + "httpOnly": False, + "name": "_octo", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, 
+ "value": "GH1.1.728652011.1718872875", + }, + { + "domain": ".github.com", + "expirationDate": 1750408875.763926, + "hostOnly": False, + "httpOnly": True, + "name": "logged_in", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": False, + "storeId": None, + "value": "no", + }, + { + "domain": ".github.com", + "hostOnly": False, + "httpOnly": False, + "name": "preferred_color_mode", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": True, + "storeId": None, + "value": "dark", + }, + { + "domain": ".github.com", + "hostOnly": False, + "httpOnly": False, + "name": "tz", + "path": "/", + "sameSite": "lax", + "secure": True, + "session": True, + "storeId": None, + "value": "Europe%2FParis", + }, +] + +COOKIES_LIST += [ + { + "domain": ".web.archive.org", + "expirationDate": 1718886430, + "hostOnly": False, + "httpOnly": False, + "name": "_gat", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "1", + }, + { + "domain": ".web.archive.org", + "expirationDate": 1718972770, + "hostOnly": False, + "httpOnly": False, + "name": "_gid", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.2.402246368.1606169825", + }, + { + "domain": ".web.archive.org", + "expirationDate": 1753446370.315621, + "hostOnly": False, + "httpOnly": False, + "name": "_ga", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.2.1301409987.1606169825", + }, + { + "domain": ".web.archive.org", + "expirationDate": 1750422367, + "hostOnly": False, + "httpOnly": False, + "name": "_hjid", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": "lax", + "secure": False, + "session": False, + "storeId": None, + "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2", + }, + { + "domain": ".web.archive.org", 
+ "expirationDate": 1718888167, + "hostOnly": False, + "httpOnly": False, + "name": "_hjFirstSeen", + "path": "/web/20201123221659/http://orcid.org/", + "sameSite": "lax", + "secure": False, + "session": False, + "storeId": None, + "value": "1", + }, +] +COOKIES_LIST += [ + { + "domain": "orcid.org", + "hostOnly": True, + "httpOnly": False, + "name": "AWSELBCORS", + "path": "/", + "sameSite": "no_restriction", + "secure": True, + "session": True, + "storeId": None, + "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F", + }, + { + "domain": ".orcid.org", + "expirationDate": 1753452454.637671, + "hostOnly": False, + "httpOnly": False, + "name": "_ga_9R61FWK9H5", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GS1.1.1718892454.1.0.1718892454.0.0.0", + }, + { + "domain": ".orcid.org", + "expirationDate": 1753452454.63421, + "hostOnly": False, + "httpOnly": False, + "name": "_ga", + "path": "/", + "sameSite": None, + "secure": False, + "session": False, + "storeId": None, + "value": "GA1.1.2021310691.1718892455", + }, + { + "domain": "orcid.org", + "hostOnly": True, + "httpOnly": False, + "name": "AWSELB", + "path": "/", + "sameSite": None, + "secure": False, + "session": True, + "storeId": None, + "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F", + }, + { + "domain": ".orcid.org", + "expirationDate": 1750428454, + "hostOnly": False, + "httpOnly": False, + "name": "OptanonAlertBoxClosed", + "path": "/", + "sameSite": "lax", + "secure": False, + "session": False, + "storeId": None, + "value": "2024-06-20T14:07:34.583Z", + }, + { + "domain": ".orcid.org", + "expirationDate": 1750428454, + "hostOnly": False, + "httpOnly": False, + "name": "OptanonConsent", + "path": "/", + "sameSite": "lax", + "secure": False, + 
"session": False, + "storeId": None, + "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1", + }, + { + "domain": "orcid.org", + "hostOnly": True, + "httpOnly": False, + "name": "XSRF-TOKEN", + "path": "/", + "sameSite": None, + "secure": True, + "session": True, + "storeId": None, + "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9", + }, +] + +# Create a RequestsCookieJar instance +COOKIES = RequestsCookieJar() + +# Add cookies to the jar +for cookie in COOKIES_LIST: + COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"]) diff --git a/examples/open_deep_research/scripts/gaia_scorer.py b/examples/open_deep_research/scripts/gaia_scorer.py new file mode 100644 index 000000000..532e0c380 --- /dev/null +++ b/examples/open_deep_research/scripts/gaia_scorer.py @@ -0,0 +1,124 @@ +import re +import string +import warnings + + +def normalize_number_str(number_str: str) -> float: + # we replace these common units and commas to allow + # conversion to float + for char in ["$", "%", ","]: + number_str = number_str.replace(char, "") + try: + return float(number_str) + except ValueError: + print(f"String {number_str} cannot be normalized to number str.") + return float("inf") + + +def split_string( + s: str, + char_list: list[str] = [",", ";"], +) -> list[str]: + pattern = f"[{''.join(char_list)}]" + return re.split(pattern, s) + + +def is_float(element: any) -> bool: + try: + float(element) + return True + except ValueError: + return False + + +def question_scorer( + model_answer: str, + ground_truth: str, +) -> bool: + # if gt is a number + if is_float(ground_truth): + normalized_answer = normalize_number_str(str(model_answer)) + return normalized_answer == float(ground_truth) + + # if gt is a list + elif any(char in 
ground_truth for char in [",", ";"]): + # question with the fish: normalization removes punct + + gt_elems = split_string(ground_truth) + ma_elems = split_string(model_answer) + + # check length is the same + if len(gt_elems) != len(ma_elems): + warnings.warn("Answer lists have different lengths, returning False.", UserWarning) + return False + + # compare each element as float or str + comparisons = [] + for ma_elem, gt_elem in zip(ma_elems, gt_elems): + if is_float(gt_elem): + normalized_ma_elem = normalize_number_str(ma_elem) + comparisons.append(normalized_ma_elem == float(gt_elem)) + else: + # we do not remove punct since comparisons can include punct + comparisons.append( + normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False) + ) + return all(comparisons) + + # if gt is a str + else: + return normalize_str(model_answer) == normalize_str(ground_truth) + + +def check_prediction_contains_answer_letters_in_order(prediction, true_answer): + prediction = prediction.lower() + true_answer = true_answer.lower() + if len(prediction) > len(true_answer) * 3: + return False + i = 0 + for letter in true_answer: + if letter in prediction[i:]: + i += prediction[i:].index(letter) + else: + return False + return True + + +def check_close_call(prediction, true_answer, is_correct): + if is_correct: + return True + else: + if is_float(true_answer): + return is_correct + else: + if ( + check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer)) + and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2 + ): + print(f"Close call: {prediction} vs {true_answer}") + return True + else: + return False + + +def normalize_str(input_str, remove_punct=True) -> str: + """ + Normalize a string by: + - Removing all white spaces + - Optionally removing punctuation (if remove_punct is True) + - Converting to lowercase + Parameters: + - input_str: str, the string to normalize + - remove_punct: bool, whether 
to remove punctuation (default: True) + Returns: + - str, the normalized string + """ + # Remove all white spaces. Required e.g for seagull vs. sea gull + no_spaces = re.sub(r"\s", "", input_str) + + # Remove punctuation, if specified. + if remove_punct: + translator = str.maketrans("", "", string.punctuation) + return no_spaces.lower().translate(translator) + else: + return no_spaces.lower() diff --git a/examples/open_deep_research/scripts/mdconvert.py b/examples/open_deep_research/scripts/mdconvert.py new file mode 100644 index 000000000..68f13a28b --- /dev/null +++ b/examples/open_deep_research/scripts/mdconvert.py @@ -0,0 +1,1002 @@ +# This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py +# Thanks to Microsoft researchers for open-sourcing this! +# type: ignore +import base64 +import copy +import html +import json +import mimetypes +import os +import re +import shutil +import subprocess +import sys +import tempfile +import traceback +import zipfile +from typing import Any, Dict, List, Optional, Union +from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse + +import mammoth +import markdownify +import pandas as pd +import pdfminer +import pdfminer.high_level +import pptx + +# File-format detection +import puremagic +import pydub +import requests +import speech_recognition as sr +from bs4 import BeautifulSoup +from youtube_transcript_api import YouTubeTranscriptApi +from youtube_transcript_api.formatters import SRTFormatter + + +class _CustomMarkdownify(markdownify.MarkdownConverter): + """ + A custom version of markdownify's MarkdownConverter. Changes include: + + - Altering the default heading style to use '#', '##', etc. + - Removing javascript hyperlinks. + - Truncating images with large data:uri sources. 
+ - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax + """ + + def __init__(self, **options: Any): + options["heading_style"] = options.get("heading_style", markdownify.ATX) + # Explicitly cast options to the expected type if necessary + super().__init__(**options) + + def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: + """Same as usual, but be sure to start with a new line""" + if not convert_as_inline: + if not re.search(r"^\n", text): + return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore + + return super().convert_hn(n, el, text, convert_as_inline) # type: ignore + + def convert_a(self, el: Any, text: str, convert_as_inline: bool): + """Same as usual converter, but removes Javascript links and escapes URIs.""" + prefix, suffix, text = markdownify.chomp(text) # type: ignore + if not text: + return "" + href = el.get("href") + title = el.get("title") + + # Escape URIs and skip non-http or file schemes + if href: + try: + parsed_url = urlparse(href) # type: ignore + if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore + return "%s%s%s" % (prefix, text, suffix) + href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore + except ValueError: # It's not clear if this ever gets thrown + return "%s%s%s" % (prefix, text, suffix) + + # For the replacement see #29: text nodes underscores are escaped + if ( + self.options["autolinks"] + and text.replace(r"\_", "_") == href + and not title + and not self.options["default_title"] + ): + # Shortcut syntax + return "<%s>" % href + if self.options["default_title"] and not title: + title = href + title_part = ' "%s"' % title.replace('"', r"\"") if title else "" + return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text + + def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: + """Same as usual converter, but 
removes data URIs""" + + alt = el.attrs.get("alt", None) or "" + src = el.attrs.get("src", None) or "" + title = el.attrs.get("title", None) or "" + title_part = ' "%s"' % title.replace('"', r"\"") if title else "" + if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]: + return alt + + # Remove dataURIs + if src.startswith("data:"): + src = src.split(",")[0] + "..." + + return "![%s](%s%s)" % (alt, src, title_part) + + def convert_soup(self, soup: Any) -> str: + return super().convert_soup(soup) # type: ignore + + +class DocumentConverterResult: + """The result of converting a document to text.""" + + def __init__(self, title: Union[str, None] = None, text_content: str = ""): + self.title: Union[str, None] = title + self.text_content: str = text_content + + +class DocumentConverter: + """Abstract superclass of all DocumentConverters.""" + + def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: + raise NotImplementedError() + + +class PlainTextConverter(DocumentConverter): + """Anything with content type text/plain""" + + def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: + # Guess the content type from any file extension that might be around + content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", "")) + + # Only accept text files + if content_type is None: + return None + # elif "text/" not in content_type.lower(): + # return None + + text_content = "" + with open(local_path, "rt", encoding="utf-8") as fh: + text_content = fh.read() + return DocumentConverterResult( + title=None, + text_content=text_content, + ) + + +class HtmlConverter(DocumentConverter): + """Anything with content type text/html""" + + def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: + # Bail if not html + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".html", ".htm"]: + return None + + 
result = None + with open(local_path, "rt", encoding="utf-8") as fh: + result = self._convert(fh.read()) + + return result + + def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]: + """Helper function that converts and HTML string.""" + + # Parse the string + soup = BeautifulSoup(html_content, "html.parser") + + # Remove javascript and style blocks + for script in soup(["script", "style"]): + script.extract() + + # Print only the main content + body_elm = soup.find("body") + webpage_text = "" + if body_elm: + webpage_text = _CustomMarkdownify().convert_soup(body_elm) + else: + webpage_text = _CustomMarkdownify().convert_soup(soup) + + assert isinstance(webpage_text, str) + + return DocumentConverterResult( + title=None if soup.title is None else soup.title.string, text_content=webpage_text + ) + + +class WikipediaConverter(DocumentConverter): + """Handle Wikipedia pages separately, focusing only on the main document content.""" + + def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: + # Bail if not Wikipedia + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".html", ".htm"]: + return None + url = kwargs.get("url", "") + if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url): + return None + + # Parse the file + soup = None + with open(local_path, "rt", encoding="utf-8") as fh: + soup = BeautifulSoup(fh.read(), "html.parser") + + # Remove javascript and style blocks + for script in soup(["script", "style"]): + script.extract() + + # Print only the main content + body_elm = soup.find("div", {"id": "mw-content-text"}) + title_elm = soup.find("span", {"class": "mw-page-title-main"}) + + webpage_text = "" + main_title = None if soup.title is None else soup.title.string + + if body_elm: + # What's the title + if title_elm and len(title_elm) > 0: + main_title = title_elm.string # type: ignore + assert isinstance(main_title, str) + + # Convert the page + webpage_text 
= f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm) + else: + webpage_text = _CustomMarkdownify().convert_soup(soup) + + return DocumentConverterResult( + title=main_title, + text_content=webpage_text, + ) + + +class YouTubeConverter(DocumentConverter): + """Handle YouTube specially, focusing on the video title, description, and transcript.""" + + def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: + # Bail if not YouTube + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".html", ".htm"]: + return None + url = kwargs.get("url", "") + if not url.startswith("https://www.youtube.com/watch?"): + return None + + # Parse the file + soup = None + with open(local_path, "rt", encoding="utf-8") as fh: + soup = BeautifulSoup(fh.read(), "html.parser") + + # Read the meta tags + assert soup.title is not None and soup.title.string is not None + metadata: Dict[str, str] = {"title": soup.title.string} + for meta in soup(["meta"]): + for a in meta.attrs: + if a in ["itemprop", "property", "name"]: + metadata[meta[a]] = meta.get("content", "") + break + + # We can also try to read the full description. 
This is more prone to breaking, since it reaches into the page implementation + try: + for script in soup(["script"]): + content = script.text + if "ytInitialData" in content: + lines = re.split(r"\r?\n", content) + obj_start = lines[0].find("{") + obj_end = lines[0].rfind("}") + if obj_start >= 0 and obj_end >= 0: + data = json.loads(lines[0][obj_start : obj_end + 1]) + attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore + if attrdesc: + metadata["description"] = str(attrdesc["content"]) + break + except Exception: + pass + + # Start preparing the page + webpage_text = "# YouTube\n" + + title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore + assert isinstance(title, str) + + if title: + webpage_text += f"\n## {title}\n" + + stats = "" + views = self._get(metadata, ["interactionCount"]) # type: ignore + if views: + stats += f"- **Views:** {views}\n" + + keywords = self._get(metadata, ["keywords"]) # type: ignore + if keywords: + stats += f"- **Keywords:** {keywords}\n" + + runtime = self._get(metadata, ["duration"]) # type: ignore + if runtime: + stats += f"- **Runtime:** {runtime}\n" + + if len(stats) > 0: + webpage_text += f"\n### Video Metadata\n{stats}\n" + + description = self._get(metadata, ["description", "og:description"]) # type: ignore + if description: + webpage_text += f"\n### Description\n{description}\n" + + transcript_text = "" + parsed_url = urlparse(url) # type: ignore + params = parse_qs(parsed_url.query) # type: ignore + if "v" in params: + assert isinstance(params["v"][0], str) + video_id = str(params["v"][0]) + try: + # Must be a single transcript. 
+ transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore + # transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore + # Alternative formatting: + transcript_text = SRTFormatter().format_transcript(transcript) + except Exception: + pass + if transcript_text: + webpage_text += f"\n### Transcript\n{transcript_text}\n" + + title = title if title else soup.title.string + assert isinstance(title, str) + + return DocumentConverterResult( + title=title, + text_content=webpage_text, + ) + + def _get(self, metadata: Dict[str, str], keys: List[str], default: Union[str, None] = None) -> Union[str, None]: + for k in keys: + if k in metadata: + return metadata[k] + return default + + def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type + if isinstance(json, list): + for elm in json: + ret = self._findKey(elm, key) + if ret is not None: + return ret + elif isinstance(json, dict): + for k in json: + if k == key: + return json[k] + else: + ret = self._findKey(json[k], key) + if ret is not None: + return ret + return None + + +class PdfConverter(DocumentConverter): + """ + Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. + """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a PDF + extension = kwargs.get("file_extension", "") + if extension.lower() != ".pdf": + return None + + return DocumentConverterResult( + title=None, + text_content=pdfminer.high_level.extract_text(local_path), + ) + + +class DocxConverter(HtmlConverter): + """ + Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. 
+ """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a DOCX + extension = kwargs.get("file_extension", "") + if extension.lower() != ".docx": + return None + + result = None + with open(local_path, "rb") as docx_file: + result = mammoth.convert_to_html(docx_file) + html_content = result.value + result = self._convert(html_content) + + return result + + +class XlsxConverter(HtmlConverter): + """ + Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table. + """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a XLSX + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".xlsx", ".xls"]: + return None + + sheets = pd.read_excel(local_path, sheet_name=None) + md_content = "" + for s in sheets: + md_content += f"## {s}\n" + html_content = sheets[s].to_html(index=False) + md_content += self._convert(html_content).text_content.strip() + "\n\n" + + return DocumentConverterResult( + title=None, + text_content=md_content.strip(), + ) + + +class PptxConverter(HtmlConverter): + """ + Converts PPTX files to Markdown. Supports heading, tables and images with alt text. 
+ """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a PPTX + extension = kwargs.get("file_extension", "") + if extension.lower() != ".pptx": + return None + + md_content = "" + + presentation = pptx.Presentation(local_path) + slide_num = 0 + for slide in presentation.slides: + slide_num += 1 + + md_content += f"\n\n\n" + + title = slide.shapes.title + for shape in slide.shapes: + # Pictures + if self._is_picture(shape): + # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 + alt_text = "" + try: + alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") + except Exception: + pass + + # A placeholder name + filename = re.sub(r"\W", "", shape.name) + ".jpg" + md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n" + + # Tables + if self._is_table(shape): + html_table = "" + first_row = True + for row in shape.table.rows: + html_table += "" + for cell in row.cells: + if first_row: + html_table += "" + else: + html_table += "" + html_table += "" + first_row = False + html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" + md_content += "\n" + self._convert(html_table).text_content.strip() + "\n" + + # Text areas + elif shape.has_text_frame: + if shape == title: + md_content += "# " + shape.text.lstrip() + "\n" + else: + md_content += shape.text + "\n" + + md_content = md_content.strip() + + if slide.has_notes_slide: + md_content += "\n\n### Notes:\n" + notes_frame = slide.notes_slide.notes_text_frame + if notes_frame is not None: + md_content += notes_frame.text + md_content = md_content.strip() + + return DocumentConverterResult( + title=None, + text_content=md_content.strip(), + ) + + def _is_picture(self, shape): + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: + return True + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER: + if hasattr(shape, "image"): + return True + return False + + def _is_table(self, shape): + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE: + return True + return False + + +class MediaConverter(DocumentConverter): + """ + Abstract class for multi-modal media (e.g., images and audio) + """ + + def _get_metadata(self, local_path): + exiftool = shutil.which("exiftool") + if not exiftool: + return None + else: + try: + result = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True).stdout + return json.loads(result)[0] + except Exception: + return None + + +class WavConverter(MediaConverter): + """ + Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed). 
+ """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a XLSX + extension = kwargs.get("file_extension", "") + if extension.lower() != ".wav": + return None + + md_content = "" + + # Add metadata + metadata = self._get_metadata(local_path) + if metadata: + for f in [ + "Title", + "Artist", + "Author", + "Band", + "Album", + "Genre", + "Track", + "DateTimeOriginal", + "CreateDate", + "Duration", + ]: + if f in metadata: + md_content += f"{f}: {metadata[f]}\n" + + # Transcribe + try: + transcript = self._transcribe_audio(local_path) + md_content += "\n\n### Audio Transcript:\n" + ("[No speech detected]" if transcript == "" else transcript) + except Exception: + md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." + + return DocumentConverterResult( + title=None, + text_content=md_content.strip(), + ) + + def _transcribe_audio(self, local_path) -> str: + recognizer = sr.Recognizer() + with sr.AudioFile(local_path) as source: + audio = recognizer.record(source) + return recognizer.recognize_google(audio).strip() + + +class Mp3Converter(WavConverter): + """ + Converts MP3 and M4A files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed). 
+ """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a MP3 + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".mp3", ".m4a"]: + return None + + md_content = "" + + # Add metadata + metadata = self._get_metadata(local_path) + if metadata: + for f in [ + "Title", + "Artist", + "Author", + "Band", + "Album", + "Genre", + "Track", + "DateTimeOriginal", + "CreateDate", + "Duration", + ]: + if f in metadata: + md_content += f"{f}: {metadata[f]}\n" + + # Transcribe + handle, temp_path = tempfile.mkstemp(suffix=".wav") + os.close(handle) + try: + if extension.lower() == ".mp3": + sound = pydub.AudioSegment.from_mp3(local_path) + else: + sound = pydub.AudioSegment.from_file(local_path, format="m4a") + sound.export(temp_path, format="wav") + + _args = dict() + _args.update(kwargs) + _args["file_extension"] = ".wav" + + try: + transcript = super()._transcribe_audio(temp_path).strip() + md_content += "\n\n### Audio Transcript:\n" + ( + "[No speech detected]" if transcript == "" else transcript + ) + except Exception: + md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio." + + finally: + os.unlink(temp_path) + + # Return the result + return DocumentConverterResult( + title=None, + text_content=md_content.strip(), + ) + + +class ZipConverter(DocumentConverter): + """ + Extracts ZIP files to a permanent local directory and returns a listing of extracted files. + """ + + def __init__(self, extract_dir: str = "downloads"): + """ + Initialize with path to extraction directory. + + Args: + extract_dir: The directory where files will be extracted. 
Defaults to "downloads" + """ + self.extract_dir = extract_dir + # Create the extraction directory if it doesn't exist + os.makedirs(self.extract_dir, exist_ok=True) + + def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]: + # Bail if not a ZIP file + extension = kwargs.get("file_extension", "") + if extension.lower() != ".zip": + return None + + # Verify it's actually a ZIP file + if not zipfile.is_zipfile(local_path): + return None + + # Extract all files and build list + extracted_files = [] + with zipfile.ZipFile(local_path, "r") as zip_ref: + # Extract all files + zip_ref.extractall(self.extract_dir) + # Get list of all files + for file_path in zip_ref.namelist(): + # Skip directories + if not file_path.endswith("/"): + extracted_files.append(self.extract_dir + "/" + file_path) + + # Sort files for consistent output + extracted_files.sort() + + # Build the markdown content + md_content = "Downloaded the following files:\n" + for file in extracted_files: + md_content += f"* {file}\n" + + return DocumentConverterResult(title="Extracted Files", text_content=md_content.strip()) + + +class ImageConverter(MediaConverter): + """ + Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured). 
+ """ + + def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: + # Bail if not a XLSX + extension = kwargs.get("file_extension", "") + if extension.lower() not in [".jpg", ".jpeg", ".png"]: + return None + + md_content = "" + + # Add metadata + metadata = self._get_metadata(local_path) + if metadata: + for f in [ + "ImageSize", + "Title", + "Caption", + "Description", + "Keywords", + "Artist", + "Author", + "DateTimeOriginal", + "CreateDate", + "GPSPosition", + ]: + if f in metadata: + md_content += f"{f}: {metadata[f]}\n" + + # Try describing the image with GPTV + mlm_client = kwargs.get("mlm_client") + mlm_model = kwargs.get("mlm_model") + if mlm_client is not None and mlm_model is not None: + md_content += ( + "\n# Description:\n" + + self._get_mlm_description( + local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt") + ).strip() + + "\n" + ) + + return DocumentConverterResult( + title=None, + text_content=md_content, + ) + + def _get_mlm_description(self, local_path, extension, client, model, prompt=None): + if prompt is None or prompt.strip() == "": + prompt = "Write a detailed caption for this image." 
+ + sys.stderr.write(f"MLM Prompt:\n{prompt}\n") + + data_uri = "" + with open(local_path, "rb") as image_file: + content_type, encoding = mimetypes.guess_type("_dummy" + extension) + if content_type is None: + content_type = "image/jpeg" + image_base64 = base64.b64encode(image_file.read()).decode("utf-8") + data_uri = f"data:{content_type};base64,{image_base64}" + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": data_uri, + }, + }, + ], + } + ] + + response = client.chat.completions.create(model=model, messages=messages) + return response.choices[0].message.content + + +class FileConversionException(Exception): + pass + + +class UnsupportedFormatException(Exception): + pass + + +class MarkdownConverter: + """(In preview) An extremely simple text-based document reader, suitable for LLM use. + This reader will convert common file-types or webpages to Markdown.""" + + def __init__( + self, + requests_session: Optional[requests.Session] = None, + mlm_client: Optional[Any] = None, + mlm_model: Optional[Any] = None, + ): + if requests_session is None: + self._requests_session = requests.Session() + else: + self._requests_session = requests_session + + self._mlm_client = mlm_client + self._mlm_model = mlm_model + + self._page_converters: List[DocumentConverter] = [] + + # Register converters for successful browsing operations + # Later registrations are tried first / take higher priority than earlier registrations + # To this end, the most specific converters should appear below the most generic converters + self.register_page_converter(PlainTextConverter()) + self.register_page_converter(HtmlConverter()) + self.register_page_converter(WikipediaConverter()) + self.register_page_converter(YouTubeConverter()) + self.register_page_converter(DocxConverter()) + self.register_page_converter(XlsxConverter()) + self.register_page_converter(PptxConverter()) + 
self.register_page_converter(WavConverter()) + self.register_page_converter(Mp3Converter()) + self.register_page_converter(ImageConverter()) + self.register_page_converter(ZipConverter()) + self.register_page_converter(PdfConverter()) + + def convert( + self, source: Union[str, requests.Response], **kwargs: Any + ) -> DocumentConverterResult: # TODO: deal with kwargs + """ + Args: + - source: can be a string representing a path or url, or a requests.response object + - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.) + """ + + # Local path or url + if isinstance(source, str): + if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"): + return self.convert_url(source, **kwargs) + else: + return self.convert_local(source, **kwargs) + # Request response + elif isinstance(source, requests.Response): + return self.convert_response(source, **kwargs) + + def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs + # Prepare a list of extensions to try (in order of priority) + ext = kwargs.get("file_extension") + extensions = [ext] if ext is not None else [] + + # Get extension alternatives from the path and puremagic + base, ext = os.path.splitext(path) + self._append_ext(extensions, ext) + self._append_ext(extensions, self._guess_ext_magic(path)) + + # Convert + return self._convert(path, extensions, **kwargs) + + # TODO what should stream's type be? + def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs + # Prepare a list of extensions to try (in order of priority) + ext = kwargs.get("file_extension") + extensions = [ext] if ext is not None else [] + + # Save the file locally to a temporary file. 
It will be deleted before this method exits + handle, temp_path = tempfile.mkstemp() + fh = os.fdopen(handle, "wb") + result = None + try: + # Write to the temporary file + content = stream.read() + if isinstance(content, str): + fh.write(content.encode("utf-8")) + else: + fh.write(content) + fh.close() + + # Use puremagic to check for more extension options + self._append_ext(extensions, self._guess_ext_magic(temp_path)) + + # Convert + result = self._convert(temp_path, extensions, **kwargs) + # Clean up + finally: + try: + fh.close() + except Exception: + pass + os.unlink(temp_path) + + return result + + def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult: # TODO: fix kwargs type + # Send a HTTP request to the URL + user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0" + response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent}) + response.raise_for_status() + return self.convert_response(response, **kwargs) + + def convert_response( + self, response: requests.Response, **kwargs: Any + ) -> DocumentConverterResult: # TODO fix kwargs type + # Prepare a list of extensions to try (in order of priority) + ext = kwargs.get("file_extension") + extensions = [ext] if ext is not None else [] + + # Guess from the mimetype + content_type = response.headers.get("content-type", "").split(";")[0] + self._append_ext(extensions, mimetypes.guess_extension(content_type)) + + # Read the content disposition if there is one + content_disposition = response.headers.get("content-disposition", "") + m = re.search(r"filename=([^;]+)", content_disposition) + if m: + base, ext = os.path.splitext(m.group(1).strip("\"'")) + self._append_ext(extensions, ext) + + # Read from the extension from the path + base, ext = os.path.splitext(urlparse(response.url).path) + self._append_ext(extensions, ext) + + # Save the file locally to a temporary file. 
It will be deleted before this method exits + handle, temp_path = tempfile.mkstemp() + fh = os.fdopen(handle, "wb") + result = None + try: + # Download the file + for chunk in response.iter_content(chunk_size=512): + fh.write(chunk) + fh.close() + + # Use puremagic to check for more extension options + self._append_ext(extensions, self._guess_ext_magic(temp_path)) + + # Convert + result = self._convert(temp_path, extensions, url=response.url) + except Exception as e: + print(f"Error in converting: {e}") + + # Clean up + finally: + try: + fh.close() + except Exception: + pass + os.unlink(temp_path) + + return result + + def _convert(self, local_path: str, extensions: List[Union[str, None]], **kwargs) -> DocumentConverterResult: + error_trace = "" + for ext in extensions + [None]: # Try last with no extension + for converter in self._page_converters: + _kwargs = copy.deepcopy(kwargs) + + # Overwrite file_extension appropriately + if ext is None: + if "file_extension" in _kwargs: + del _kwargs["file_extension"] + else: + _kwargs.update({"file_extension": ext}) + + # Copy any additional global options + if "mlm_client" not in _kwargs and self._mlm_client is not None: + _kwargs["mlm_client"] = self._mlm_client + + if "mlm_model" not in _kwargs and self._mlm_model is not None: + _kwargs["mlm_model"] = self._mlm_model + + # If we hit an error log it and keep trying + try: + res = converter.convert(local_path, **_kwargs) + except Exception: + error_trace = ("\n\n" + traceback.format_exc()).strip() + + if res is not None: + # Normalize the content + res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)]) + res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content) + + # Todo + return res + + # If we got this far without success, report any exceptions + if len(error_trace) > 0: + raise FileConversionException( + f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. 
While converting the file, the following error was encountered:\n\n{error_trace}" + ) + + # Nothing can handle it! + raise UnsupportedFormatException( + f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported." + ) + + def _append_ext(self, extensions, ext): + """Append a unique non-None, non-empty extension to a list of extensions.""" + if ext is None: + return + ext = ext.strip() + if ext == "": + return + # if ext not in extensions: + if True: + extensions.append(ext) + + def _guess_ext_magic(self, path): + """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes.""" + # Use puremagic to guess + try: + guesses = puremagic.magic_file(path) + if len(guesses) > 0: + ext = guesses[0].extension.strip() + if len(ext) > 0: + return ext + except FileNotFoundError: + pass + except IsADirectoryError: + pass + except PermissionError: + pass + return None + + def register_page_converter(self, converter: DocumentConverter) -> None: + """Register a page text converter.""" + self._page_converters.insert(0, converter) diff --git a/examples/open_deep_research/scripts/reformulator.py b/examples/open_deep_research/scripts/reformulator.py new file mode 100644 index 000000000..db41704d8 --- /dev/null +++ b/examples/open_deep_research/scripts/reformulator.py @@ -0,0 +1,86 @@ +# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource! +# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py +import copy + +from smolagents.models import MessageRole, Model + + +def prepare_response(original_task: str, inner_messages, reformulation_model: Model) -> str: + messages = [ + { + "role": MessageRole.SYSTEM, + "content": [ + { + "type": "text", + "text": f"""Earlier you were asked the following: + +{original_task} + +Your team then worked diligently to address that request. 
Read below a transcript of that conversation:""", + } + ], + } + ] + + # The first message just repeats the question, so remove it + # if len(inner_messages) > 1: + # del inner_messages[0] + + # copy them to this context + try: + for message in inner_messages: + if not message.get("content"): + continue + message = copy.deepcopy(message) + message["role"] = MessageRole.USER + messages.append(message) + except Exception: + messages += [{"role": MessageRole.ASSISTANT, "content": str(inner_messages)}] + + # ask for the final answer + messages.append( + { + "role": MessageRole.USER, + "content": [ + { + "type": "text", + "text": f""" +Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience: + +{original_task} + +To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER] +Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. +ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. +If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. 
+If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine' +""", + } + ], + } + ) + + response = reformulation_model(messages).content + + final_answer = response.split("FINAL ANSWER: ")[-1].strip() + print("> Reformulated answer: ", final_answer) + + # if "unable to determine" in final_answer.lower(): + # messages.append({"role": MessageRole.ASSISTANT, "content": response }) + # messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": """ + # I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation. + + # To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS] + # Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc. + # ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) + # If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise. + # If you are asked for a string, don't use articles or abbreviations (e.g. cit for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. + # If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. 
+ # """.strip()}]}) + + # response = model(messages).content + # print("\n>>>Making an educated guess.\n", response) + # final_answer = response.split("EDUCATED GUESS: ")[-1].strip() + return final_answer diff --git a/examples/open_deep_research/scripts/run_agents.py b/examples/open_deep_research/scripts/run_agents.py new file mode 100644 index 000000000..37da8a40e --- /dev/null +++ b/examples/open_deep_research/scripts/run_agents.py @@ -0,0 +1,87 @@ +import json +import os +import shutil +import textwrap +from pathlib import Path + +# import tqdm.asyncio +from smolagents.utils import AgentError + + +def serialize_agent_error(obj): + if isinstance(obj, AgentError): + return {"error_type": obj.__class__.__name__, "message": obj.message} + else: + return str(obj) + + +def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str: + prompt = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question: +{question}. But do not try to answer the question directly! +Do not add any information that is not present in the image.""" + return visual_inspection_tool(image_path=file_name, question=prompt) + + +def get_document_description(file_path: str, question: str, document_inspection_tool) -> str: + prompt = f"""Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question: +{question}. But do not try to answer the question directly! 
+Do not add any information that is not present in the document.""" + return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=prompt) + + +def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool): + file_extension = file_path.split(".")[-1] + if file_extension in ["png", "jpg", "jpeg"]: + file_description = f" - Attached image: {file_path}" + file_description += ( + f"\n -> Image description: {get_image_description(file_path, question, visual_inspection_tool)}" + ) + return file_description + elif file_extension in ["pdf", "xls", "xlsx", "docx", "doc", "xml"]: + file_description = f" - Attached document: {file_path}" + image_path = file_path.split(".")[0] + ".png" + if os.path.exists(image_path): + description = get_image_description(image_path, question, visual_inspection_tool) + else: + description = get_document_description(file_path, question, document_inspection_tool) + file_description += f"\n -> File description: {description}" + return file_description + elif file_extension in ["mp3", "m4a", "wav"]: + return f" - Attached audio: {file_path}" + else: + return f" - Attached file: {file_path}" + + +def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool): + folder_path = file_path.replace(".zip", "") + os.makedirs(folder_path, exist_ok=True) + shutil.unpack_archive(file_path, folder_path) + + prompt_use_files = "" + for root, dirs, files in os.walk(folder_path): + for file in files: + file_path = os.path.join(root, file) + prompt_use_files += "\n" + textwrap.indent( + get_single_file_description(file_path, question, visual_inspection_tool, document_inspection_tool), + prefix=" ", + ) + return prompt_use_files + + +def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]): + f = base_filename.parent / f"{base_filename.stem}_answers.jsonl" + done = set() + if f.exists(): + with open(f, encoding="utf-8") 
as fh: + done = {json.loads(line)["task_id"] for line in fh if line.strip()} + + tasks = [] + for i in range(total): + task_id = int(data[i]["task_id"]) + if task_id not in done: + if tasks_ids is not None: + if task_id in tasks_ids: + tasks.append(data[i]) + else: + tasks.append(data[i]) + return tasks diff --git a/examples/open_deep_research/scripts/text_inspector_tool.py b/examples/open_deep_research/scripts/text_inspector_tool.py new file mode 100644 index 000000000..056168cee --- /dev/null +++ b/examples/open_deep_research/scripts/text_inspector_tool.py @@ -0,0 +1,122 @@ +from typing import Optional + +from smolagents import Tool +from smolagents.models import MessageRole, Model + +from .mdconvert import MarkdownConverter + + +class TextInspectorTool(Tool): + name = "inspect_file_as_text" + description = """ +You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it. +This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".m4a", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES.""" + + inputs = { + "file_path": { + "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!", + "type": "string", + }, + "question": { + "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. 
Do not pass this parameter if you just want to directly return the content of the file.", + "type": "string", + "nullable": True, + }, + } + output_type = "string" + md_converter = MarkdownConverter() + + def __init__(self, model: Model, text_limit: int): + super().__init__() + self.model = model + self.text_limit = text_limit + + def forward_initial_exam_mode(self, file_path, question): + result = self.md_converter.convert(file_path) + + if file_path[-4:] in [".png", ".jpg"]: + raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!") + + if ".zip" in file_path: + return result.text_content + + if not question: + return result.text_content + + if len(result.text_content) < 4000: + return "Document content: " + result.text_content + + messages = [ + { + "role": MessageRole.SYSTEM, + "content": [ + { + "type": "text", + "text": "Here is a file:\n### " + + str(result.title) + + "\n\n" + + result.text_content[: self.text_limit], + } + ], + }, + { + "role": MessageRole.USER, + "content": [ + { + "type": "text", + "text": "Now please write a short, 5 sentence caption for this document, that could help someone asking this question: " + + question + + "\n\nDon't answer the question yourself! 
Just provide useful notes on the document", + } + ], + }, + ] + return self.model(messages).content + + def forward(self, file_path, question: Optional[str] = None) -> str: + result = self.md_converter.convert(file_path) + + if file_path[-4:] in [".png", ".jpg"]: + raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!") + + if ".zip" in file_path: + return result.text_content + + if not question: + return result.text_content + + messages = [ + { + "role": MessageRole.SYSTEM, + "content": [ + { + "type": "text", + "text": "You will have to write a short caption for this file, then answer this question:" + + question, + } + ], + }, + { + "role": MessageRole.USER, + "content": [ + { + "type": "text", + "text": "Here is the complete file:\n### " + + str(result.title) + + "\n\n" + + result.text_content[: self.text_limit], + } + ], + }, + { + "role": MessageRole.USER, + "content": [ + { + "type": "text", + "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'." + + question, + } + ], + }, + ] + return self.model(messages).content diff --git a/examples/open_deep_research/scripts/text_web_browser.py b/examples/open_deep_research/scripts/text_web_browser.py new file mode 100644 index 000000000..ef40f8551 --- /dev/null +++ b/examples/open_deep_research/scripts/text_web_browser.py @@ -0,0 +1,563 @@ +# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource! 
+# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py +import mimetypes +import os +import pathlib +import re +import time +import uuid +from typing import Any, Dict, List, Optional, Tuple, Union +from urllib.parse import unquote, urljoin, urlparse + +import pathvalidate +import requests +from serpapi import GoogleSearch + +from smolagents import Tool + +from .cookies import COOKIES +from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException + + +class SimpleTextBrowser: + """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use.""" + + def __init__( + self, + start_page: Optional[str] = None, + viewport_size: Optional[int] = 1024 * 8, + downloads_folder: Optional[Union[str, None]] = None, + serpapi_key: Optional[Union[str, None]] = None, + request_kwargs: Optional[Union[Dict[str, Any], None]] = None, + ): + self.start_page: str = start_page if start_page else "about:blank" + self.viewport_size = viewport_size # Applies only to the standard uri types + self.downloads_folder = downloads_folder + self.history: List[Tuple[str, float]] = list() + self.page_title: Optional[str] = None + self.viewport_current_page = 0 + self.viewport_pages: List[Tuple[int, int]] = list() + self.set_address(self.start_page) + self.serpapi_key = serpapi_key + self.request_kwargs = request_kwargs + self.request_kwargs["cookies"] = COOKIES + self._mdconvert = MarkdownConverter() + self._page_content: str = "" + + self._find_on_page_query: Union[str, None] = None + self._find_on_page_last_result: Union[int, None] = None # Location of the last result + + @property + def address(self) -> str: + """Return the address of the current page.""" + return self.history[-1][0] + + def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None: + # TODO: Handle anchors + self.history.append((uri_or_path, time.time())) + + # Handle special URIs + if 
uri_or_path == "about:blank": + self._set_page_content("") + elif uri_or_path.startswith("google:"): + self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year) + else: + if ( + not uri_or_path.startswith("http:") + and not uri_or_path.startswith("https:") + and not uri_or_path.startswith("file:") + ): + if len(self.history) > 1: + prior_address = self.history[-2][0] + uri_or_path = urljoin(prior_address, uri_or_path) + # Update the address with the fully-qualified path + self.history[-1] = (uri_or_path, self.history[-1][1]) + self._fetch_page(uri_or_path) + + self.viewport_current_page = 0 + self.find_on_page_query = None + self.find_on_page_viewport = None + + @property + def viewport(self) -> str: + """Return the content of the current viewport.""" + bounds = self.viewport_pages[self.viewport_current_page] + return self.page_content[bounds[0] : bounds[1]] + + @property + def page_content(self) -> str: + """Return the full contents of the current page.""" + return self._page_content + + def _set_page_content(self, content: str) -> None: + """Sets the text content of the current page.""" + self._page_content = content + self._split_pages() + if self.viewport_current_page >= len(self.viewport_pages): + self.viewport_current_page = len(self.viewport_pages) - 1 + + def page_down(self) -> None: + self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1) + + def page_up(self) -> None: + self.viewport_current_page = max(self.viewport_current_page - 1, 0) + + def find_on_page(self, query: str) -> Union[str, None]: + """Searches for the query from the current viewport forward, looping back to the start if necessary.""" + + # Did we get here via a previous find_on_page search with the same query? 
+ # If so, map to find_next + if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result: + return self.find_next() + + # Ok it's a new search start from the current viewport + self._find_on_page_query = query + viewport_match = self._find_next_viewport(query, self.viewport_current_page) + if viewport_match is None: + self._find_on_page_last_result = None + return None + else: + self.viewport_current_page = viewport_match + self._find_on_page_last_result = viewport_match + return self.viewport + + def find_next(self) -> Union[str, None]: + """Scroll to the next viewport that matches the query""" + + if self._find_on_page_query is None: + return None + + starting_viewport = self._find_on_page_last_result + if starting_viewport is None: + starting_viewport = 0 + else: + starting_viewport += 1 + if starting_viewport >= len(self.viewport_pages): + starting_viewport = 0 + + viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport) + if viewport_match is None: + self._find_on_page_last_result = None + return None + else: + self.viewport_current_page = viewport_match + self._find_on_page_last_result = viewport_match + return self.viewport + + def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]: + """Search for matches between the starting viewport looping when reaching the end.""" + + if query is None: + return None + + # Normalize the query, and convert to a regular expression + nquery = re.sub(r"\*", "__STAR__", query) + nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " " + nquery = nquery.replace(" __STAR__ ", "__STAR__ ") # Merge isolated stars with prior word + nquery = nquery.replace("__STAR__", ".*").lower() + + if nquery.strip() == "": + return None + + idxs = list() + idxs.extend(range(starting_viewport, len(self.viewport_pages))) + idxs.extend(range(0, starting_viewport)) + + for i in idxs: + bounds = self.viewport_pages[i] + content = 
self.page_content[bounds[0] : bounds[1]] + + # TODO: Remove markdown links and images + ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " " + if re.search(nquery, ncontent): + return i + + return None + + def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str: + """Update the address, visit the page, and return the content of the viewport.""" + self.set_address(path_or_uri, filter_year=filter_year) + return self.viewport + + def _split_pages(self) -> None: + # Do not split search results + if self.address.startswith("google:"): + self.viewport_pages = [(0, len(self._page_content))] + return + + # Handle empty pages + if len(self._page_content) == 0: + self.viewport_pages = [(0, 0)] + return + + # Break the viewport into pages + self.viewport_pages = [] + start_idx = 0 + while start_idx < len(self._page_content): + end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator] + # Adjust to end on a space + while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]: + end_idx += 1 + self.viewport_pages.append((start_idx, end_idx)) + start_idx = end_idx + + def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None: + if self.serpapi_key is None: + raise ValueError("Missing SerpAPI key.") + + params = { + "engine": "google", + "q": query, + "api_key": self.serpapi_key, + } + if filter_year is not None: + params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}" + + search = GoogleSearch(params) + results = search.get_dict() + self.page_title = f"{query} - Search" + if "organic_results" not in results.keys(): + raise Exception(f"No results found for query: '{query}'. 
Use a less specific query.") + if len(results["organic_results"]) == 0: + year_filter_message = f" with filter year={filter_year}" if filter_year is not None else "" + self._set_page_content( + f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter." + ) + return + + def _prev_visit(url): + for i in range(len(self.history) - 1, -1, -1): + if self.history[i][0] == url: + return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n" + return "" + + web_snippets: List[str] = list() + idx = 0 + if "organic_results" in results: + for page in results["organic_results"]: + idx += 1 + date_published = "" + if "date" in page: + date_published = "\nDate published: " + page["date"] + + source = "" + if "source" in page: + source = "\nSource: " + page["source"] + + snippet = "" + if "snippet" in page: + snippet = "\n" + page["snippet"] + + redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}" + + redacted_version = redacted_version.replace("Your browser can't play this video.", "") + web_snippets.append(redacted_version) + + content = ( + f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n" + + "\n\n".join(web_snippets) + ) + + self._set_page_content(content) + + def _fetch_page(self, url: str) -> None: + download_path = "" + try: + if url.startswith("file://"): + download_path = os.path.normcase(os.path.normpath(unquote(url[7:]))) + res = self._mdconvert.convert_local(download_path) + self.page_title = res.title + self._set_page_content(res.text_content) + else: + # Prepare the request parameters + request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {} + request_kwargs["stream"] = True + + # Send a HTTP request to the URL + response = requests.get(url, **request_kwargs) + response.raise_for_status() + + # If the HTTP request was successful + 
content_type = response.headers.get("content-type", "") + + # Text or HTML + if "text/" in content_type.lower(): + res = self._mdconvert.convert_response(response) + self.page_title = res.title + self._set_page_content(res.text_content) + # A download + else: + # Try producing a safe filename + fname = None + download_path = None + try: + fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip() + download_path = os.path.abspath(os.path.join(self.downloads_folder, fname)) + + suffix = 0 + while os.path.exists(download_path) and suffix < 1000: + suffix += 1 + base, ext = os.path.splitext(fname) + new_fname = f"{base}__{suffix}{ext}" + download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname)) + + except NameError: + pass + + # No suitable name, so make one + if fname is None: + extension = mimetypes.guess_extension(content_type) + if extension is None: + extension = ".download" + fname = str(uuid.uuid4()) + extension + download_path = os.path.abspath(os.path.join(self.downloads_folder, fname)) + + # Open a file for writing + with open(download_path, "wb") as fh: + for chunk in response.iter_content(chunk_size=512): + fh.write(chunk) + + # Render it + local_uri = pathlib.Path(download_path).as_uri() + self.set_address(local_uri) + + except UnsupportedFormatException as e: + print(e) + self.page_title = ("Download complete.",) + self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'") + except FileConversionException as e: + print(e) + self.page_title = ("Download complete.",) + self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'") + except FileNotFoundError: + self.page_title = "Error 404" + self._set_page_content(f"## Error 404\n\nFile not found: {download_path}") + except requests.exceptions.RequestException as request_exception: + try: + self.page_title = f"Error {response.status_code}" + + # If the error was rendered in HTML we might as well render it + 
content_type = response.headers.get("content-type", "") + if content_type is not None and "text/html" in content_type.lower(): + res = self._mdconvert.convert(response) + self.page_title = f"Error {response.status_code}" + self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}") + else: + text = "" + for chunk in response.iter_content(chunk_size=512, decode_unicode=True): + text += chunk + self.page_title = f"Error {response.status_code}" + self._set_page_content(f"## Error {response.status_code}\n\n{text}") + except NameError: + self.page_title = "Error" + self._set_page_content(f"## Error\n\n{str(request_exception)}") + + def _state(self) -> Tuple[str, str]: + header = f"Address: {self.address}\n" + if self.page_title is not None: + header += f"Title: {self.page_title}\n" + + current_page = self.viewport_current_page + total_pages = len(self.viewport_pages) + + address = self.address + for i in range(len(self.history) - 2, -1, -1): # Start from the second last + if self.history[i][0] == address: + header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n" + break + + header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n" + return (header, self.viewport) + + +class SearchInformationTool(Tool): + name = "web_search" + description = "Perform a web search query (think a google search) and returns the search results." + inputs = {"query": {"type": "string", "description": "The web search query to perform."}} + inputs["filter_year"] = { + "type": "string", + "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. 
Make sure to use this parameter if you're trying to search for articles from a specific date!", + "nullable": True, + } + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self, query: str, filter_year: Optional[int] = None) -> str: + self.browser.visit_page(f"google: {query}", filter_year=filter_year) + header, content = self.browser._state() + return header.strip() + "\n=======================\n" + content + + +class VisitTool(Tool): + name = "visit_page" + description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript." + inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}} + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self, url: str) -> str: + self.browser.visit_page(url) + header, content = self.browser._state() + return header.strip() + "\n=======================\n" + content + + +class DownloadTool(Tool): + name = "download_file" + description = """ +Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".m4a", ".png", ".docx"] +After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it. 
+DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead.""" + inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}} + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self, url: str) -> str: + if "arxiv" in url: + url = url.replace("abs", "pdf") + response = requests.get(url) + content_type = response.headers.get("content-type", "") + extension = mimetypes.guess_extension(content_type) + if extension and isinstance(extension, str): + new_path = f"./downloads/file{extension}" + else: + new_path = "./downloads/file.object" + + with open(new_path, "wb") as f: + f.write(response.content) + + if "pdf" in extension or "txt" in extension or "htm" in extension: + raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.") + + return f"File was downloaded and saved under path {new_path}." + + +class ArchiveSearchTool(Tool): + name = "find_archived_url" + description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date." + inputs = { + "url": {"type": "string", "description": "The url you need the archive for."}, + "date": { + "type": "string", + "description": "The date that you want to find the archive for. 
Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.", + }, + } + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self, url, date) -> str: + no_timestamp_url = f"https://archive.org/wayback/available?url={url}" + archive_url = no_timestamp_url + f"×tamp={date}" + response = requests.get(archive_url).json() + response_notimestamp = requests.get(no_timestamp_url).json() + if "archived_snapshots" in response and "closest" in response["archived_snapshots"]: + closest = response["archived_snapshots"]["closest"] + print("Archive found!", closest) + + elif "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]: + closest = response_notimestamp["archived_snapshots"]["closest"] + print("Archive found!", closest) + else: + raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.") + target_url = closest["url"] + self.browser.visit_page(target_url) + header, content = self.browser._state() + return ( + f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n" + + header.strip() + + "\n=======================\n" + + content + ) + + +class PageUpTool(Tool): + name = "page_up" + description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content." + inputs = {} + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self) -> str: + self.browser.page_up() + header, content = self.browser._state() + return header.strip() + "\n=======================\n" + content + + +class PageDownTool(Tool): + name = "page_down" + description = ( + "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content." 
+ ) + inputs = {} + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self) -> str: + self.browser.page_down() + header, content = self.browser._state() + return header.strip() + "\n=======================\n" + content + + +class FinderTool(Tool): + name = "find_on_page_ctrl_f" + description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F." + inputs = { + "search_string": { + "type": "string", + "description": "The string to search for on the page. This search string supports wildcards like '*'", + } + } + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self, search_string: str) -> str: + find_result = self.browser.find_on_page(search_string) + header, content = self.browser._state() + + if find_result is None: + return ( + header.strip() + + f"\n=======================\nThe search string '{search_string}' was not found on this page." + ) + else: + return header.strip() + "\n=======================\n" + content + + +class FindNextTool(Tool): + name = "find_next" + description = "Scroll the viewport to next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search." + inputs = {} + output_type = "string" + + def __init__(self, browser): + super().__init__() + self.browser = browser + + def forward(self) -> str: + find_result = self.browser.find_next() + header, content = self.browser._state() + + if find_result is None: + return header.strip() + "\n=======================\nThe search string was not found on this page." 
+ else: + return header.strip() + "\n=======================\n" + content diff --git a/examples/open_deep_research/scripts/visual_qa.py b/examples/open_deep_research/scripts/visual_qa.py new file mode 100644 index 000000000..84d240b66 --- /dev/null +++ b/examples/open_deep_research/scripts/visual_qa.py @@ -0,0 +1,187 @@ +import base64 +import json +import mimetypes +import os +import uuid +from io import BytesIO +from typing import Optional + +import requests +from dotenv import load_dotenv +from huggingface_hub import InferenceClient +from PIL import Image +from transformers import AutoProcessor + +from smolagents import Tool, tool + + +load_dotenv(override=True) + +idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty") + + +def process_images_and_text(image_path, query, client): + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": query}, + ], + }, + ] + + prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True) + + # load images from local directory + + # encode images to strings which can be sent to the endpoint + def encode_local_image(image_path): + # load image + image = Image.open(image_path).convert("RGB") + + # Convert the image to a base64 string + buffer = BytesIO() + image.save(buffer, format="JPEG") # Use the appropriate format (e.g., JPEG, PNG) + base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8") + + # add string formatting required by the endpoint + image_string = f"data:image/jpeg;base64,{base64_image}" + + return image_string + + image_string = encode_local_image(image_path) + prompt_with_images = prompt_with_template.replace("", "![]({}) ").format(image_string) + + payload = { + "inputs": prompt_with_images, + "parameters": { + "return_full_text": False, + "max_new_tokens": 200, + }, + } + + return json.loads(client.post(json=payload).decode())[0] + + +# Function to encode the image +def encode_image(image_path): 
+ if image_path.startswith("http"): + user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0" + request_kwargs = { + "headers": {"User-Agent": user_agent}, + "stream": True, + } + + # Send a HTTP request to the URL + response = requests.get(image_path, **request_kwargs) + response.raise_for_status() + content_type = response.headers.get("content-type", "") + + extension = mimetypes.guess_extension(content_type) + if extension is None: + extension = ".download" + + fname = str(uuid.uuid4()) + extension + download_path = os.path.abspath(os.path.join("downloads", fname)) + + with open(download_path, "wb") as fh: + for chunk in response.iter_content(chunk_size=512): + fh.write(chunk) + + image_path = download_path + + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + + +headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"} + + +def resize_image(image_path): + img = Image.open(image_path) + width, height = img.size + img = img.resize((int(width / 2), int(height / 2))) + new_image_path = f"resized_{image_path}" + img.save(new_image_path) + return new_image_path + + +class VisualQATool(Tool): + name = "visualizer" + description = "A tool that can answer questions about attached images." + inputs = { + "image_path": { + "description": "The path to the image on which to answer the question", + "type": "string", + }, + "question": {"description": "the question to answer", "type": "string", "nullable": True}, + } + output_type = "string" + + client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty") + + def forward(self, image_path: str, question: Optional[str] = None) -> str: + output = "" + add_note = False + if not question: + add_note = True + question = "Please write a detailed caption for this image." 
+ try: + output = process_images_and_text(image_path, question, self.client) + except Exception as e: + print(e) + if "Payload Too Large" in str(e): + new_image_path = resize_image(image_path) + output = process_images_and_text(new_image_path, question, self.client) + + if add_note: + output = ( + f"You did not provide a particular question, so here is a detailed caption for the image: {output}" + ) + + return output + + +@tool +def visualizer(image_path: str, question: Optional[str] = None) -> str: + """A tool that can answer questions about attached images. + + Args: + image_path: The path to the image on which to answer the question. This should be a local path to downloaded image. + question: The question to answer. + """ + + add_note = False + if not question: + add_note = True + question = "Please write a detailed caption for this image." + if not isinstance(image_path, str): + raise Exception("You should provide at least `image_path` string argument to this tool!") + + mime_type, _ = mimetypes.guess_type(image_path) + base64_image = encode_image(image_path) + + payload = { + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": question}, + {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}, + ], + } + ], + "max_tokens": 1000, + } + response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload) + try: + output = response.json()["choices"][0]["message"]["content"] + except Exception: + raise Exception(f"Response format unexpected: {response.json()}") + + if add_note: + output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}" + + return output diff --git a/examples/open_deep_research/visual_vs_text_browser.ipynb b/examples/open_deep_research/visual_vs_text_browser.ipynb new file mode 100644 index 000000000..9bb4ee8dc --- /dev/null +++ 
b/examples/open_deep_research/visual_vs_text_browser.ipynb @@ -0,0 +1,359 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compare a text-based vs a vision-based browser\n", + "\n", + "Warning: this notebook is experimental, it probably won't work out of the box!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install \"smolagents[litellm]\" -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datasets\n", + "\n", + "\n", + "eval_ds = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")[\"validation\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "to_keep = [\n", + " \"What's the last line of the rhyme under the flavor\",\n", + " 'Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus',\n", + " \"In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.\",\n", + " \"Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?\",\n", + " \"The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? Answer without using articles.\",\n", + " \"I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! 
Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu.\",\n", + " \"In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's \",\n", + " \"Under DDC 633 on Bielefeld University Library's BASE, as of 2020\",\n", + " \"In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?\",\n", + " \"The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. Of the consecrators and co-consecrators\",\n", + " \"In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied?\",\n", + " 'In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content',\n", + " \"Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?\",\n", + "]\n", + "eval_ds = eval_ds.filter(lambda row: any([el in row[\"Question\"] for el in to_keep]))\n", + "eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", + "from huggingface_hub import login\n", + "\n", + "\n", + "load_dotenv(override=True)\n", + "\n", + "login(os.getenv(\"HF_TOKEN\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Text browser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.run_agents import answer_questions\n", + "from scripts.text_inspector_tool import TextInspectorTool\n", + "from scripts.text_web_browser import (\n", + " ArchiveSearchTool,\n", + " FinderTool,\n", + " FindNextTool,\n", + " 
NavigationalSearchTool,\n", + " PageDownTool,\n", + " PageUpTool,\n", + " SearchInformationTool,\n", + " VisitTool,\n", + ")\n", + "from scripts.visual_qa import VisualQAGPT4Tool\n", + "\n", + "from smolagents import CodeAgent, LiteLLMModel\n", + "\n", + "\n", + "proprietary_model = LiteLLMModel(\"gpt-4o\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### BUILD AGENTS & TOOLS\n", + "\n", + "WEB_TOOLS = [\n", + " SearchInformationTool(),\n", + " NavigationalSearchTool(),\n", + " VisitTool(),\n", + " PageUpTool(),\n", + " PageDownTool(),\n", + " FinderTool(),\n", + " FindNextTool(),\n", + " ArchiveSearchTool(),\n", + "]\n", + "\n", + "\n", + "surfer_agent = CodeAgent(\n", + " model=proprietary_model,\n", + " tools=WEB_TOOLS,\n", + " max_steps=20,\n", + " verbosity_level=2,\n", + ")\n", + "\n", + "results_text = answer_questions(\n", + " eval_ds,\n", + " surfer_agent,\n", + " \"code_gpt4o_27-01_text\",\n", + " reformulation_model=proprietary_model,\n", + " output_folder=\"output_browsers\",\n", + " visual_inspection_tool=VisualQAGPT4Tool(),\n", + " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Vision browser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install helium -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scripts.visual_qa import VisualQAGPT4Tool\n", + "\n", + "from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel\n", + "from smolagents.vision_web_browser import (\n", + " close_popups,\n", + " go_back,\n", + " helium_instructions,\n", + " initialize_agent,\n", + " save_screenshot,\n", + " search_item_ctrl_f,\n", + ")\n", + "\n", + "\n", + "proprietary_model = LiteLLMModel(\"gpt-4o\")\n", + "vision_browser_agent = 
initialize_agent(proprietary_model)\n", + "### BUILD AGENTS & TOOLS\n", + "\n", + "CodeAgent(\n", + " tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],\n", + " model=proprietary_model,\n", + " additional_authorized_imports=[\"helium\"],\n", + " step_callbacks=[save_screenshot],\n", + " max_steps=20,\n", + " verbosity_level=2,\n", + ")\n", + "\n", + "results_vision = answer_questions(\n", + " eval_ds,\n", + " vision_browser_agent,\n", + " \"code_gpt4o_27-01_vision\",\n", + " reformulation_model=proprietary_model,\n", + " output_folder=\"output_browsers\",\n", + " visual_inspection_tool=VisualQAGPT4Tool(),\n", + " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n", + " postprompt=helium_instructions\n", + " + \"Any web browser controls won't work on .pdf urls, rather use the tool 'inspect_file_as_text' to read them\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Browser-use browser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install browser-use lxml_html_clean -q\n", + "!playwright install" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import asyncio\n", + "\n", + "import nest_asyncio\n", + "\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "from browser_use import Agent\n", + "from dotenv import load_dotenv\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "\n", + "load_dotenv()\n", + "\n", + "\n", + "class BrowserUseAgent:\n", + " logs = []\n", + "\n", + " def write_inner_memory_from_logs(self, summary_mode):\n", + " return self.results\n", + "\n", + " def run(self, task, **kwargs):\n", + " agent = Agent(\n", + " task=task,\n", + " llm=ChatOpenAI(model=\"gpt-4o\"),\n", + " )\n", + " self.results = asyncio.get_event_loop().run_until_complete(agent.run())\n", + " return self.results.history[-1].result[0].extracted_content\n", + 
"\n", + "\n", + "browser_use_agent = BrowserUseAgent()\n", + "\n", + "results_browseruse = answer_questions(\n", + " eval_ds,\n", + " browser_use_agent,\n", + " \"gpt-4o_27-01_browseruse\",\n", + " reformulation_model=proprietary_model,\n", + " output_folder=\"output_browsers\",\n", + " visual_inspection_tool=VisualQAGPT4Tool(),\n", + " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n", + " postprompt=\"\",\n", + " run_simple=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from scripts.gaia_scorer import question_scorer\n", + "\n", + "\n", + "results_vision, results_text, results_browseruse = (\n", + " pd.DataFrame(results_vision),\n", + " pd.DataFrame(results_text),\n", + " pd.DataFrame(results_browseruse),\n", + ")\n", + "\n", + "results_vision[\"is_correct\"] = results_vision.apply(\n", + " lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n", + ")\n", + "results_text[\"is_correct\"] = results_text.apply(lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1)\n", + "results_browseruse[\"is_correct\"] = results_browseruse.apply(\n", + " lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = pd.concat([results_vision, results_text, results_browseruse])\n", + "results.groupby(\"agent_name\")[\"is_correct\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "correct_vision_results = results_vision.loc[results_vision[\"is_correct\"]]\n", + "correct_vision_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"false_text_results = results_text.loc[~results_text[\"is_correct\"]]\n", + "false_text_results" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gaia", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyproject.toml b/pyproject.toml index 22c1252e4..ab323f8a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "smolagents" -version = "1.6.0.dev" +version = "1.10.0.dev0" description = "🤗 smolagents: a barebones library for agents. Agents write python code to call tools or orchestrate other agents." authors = [ { name="Aymeric Roucher", email="aymeric@hf.co" }, { name="Thomas Wolf"}, @@ -12,7 +12,7 @@ authors = [ readme = "README.md" requires-python = ">=3.10" dependencies = [ - "huggingface-hub>=0.24.0", + "huggingface-hub>=0.28.0", "requests>=2.32.3", "rich>=13.9.4", "pandas>=2.2.3", @@ -20,6 +20,7 @@ dependencies = [ "pillow>=11.0.0", "markdownify>=0.14.1", "duckduckgo-search>=6.3.7", + "python-dotenv" ] [project.optional-dependencies] @@ -31,33 +32,42 @@ audio = [ "soundfile", "smolagents[torch]", ] -transformers = [ - "accelerate", - "transformers>=4.0.0", - "smolagents[torch]", -] e2b = [ "e2b-code-interpreter>=1.0.3", "python-dotenv>=1.0.1", ] gradio = [ - "gradio>=5.8.0", + "gradio>=5.13.2", ] litellm = [ - "litellm>=1.55.10", + "litellm>=1.60.2", ] mcp = [ "mcpadapt>=0.0.6", "mcp", ] +mlx-lm = [ + "mlx-lm" +] openai = [ "openai>=1.58.1" ] -quality = [ - "ruff>=0.9.0", +telemetry = [ + "arize-phoenix", + "opentelemetry-sdk", + "opentelemetry-exporter-otlp", + "openinference-instrumentation-smolagents>=0.1.4" +] +transformers = [ + "accelerate", + 
"transformers>=4.0.0,<4.49.0", + "smolagents[torch]", ] all = [ - "smolagents[accelerate,audio,e2b,gradio,litellm,mcp,openai,transformers]", + "smolagents[audio,e2b,gradio,litellm,mcp,openai,telemetry,transformers]", +] +quality = [ + "ruff>=0.9.0", ] test = [ "ipython>=8.31.0", # for interactive environment tests @@ -91,3 +101,10 @@ lint.select = ["E", "F", "I", "W"] [tool.ruff.lint.isort] known-first-party = ["smolagents"] lines-after-imports = 2 + +[tool.setuptools.package-data] +"smolagents.prompts" = ["*.yaml"] + +[project.scripts] +smolagent = "smolagents.cli:main" +webagent = "smolagents.vision_web_browser:main" \ No newline at end of file diff --git a/src/smolagents/__init__.py b/src/smolagents/__init__.py index 8b417d5b7..a1321eb1b 100644 --- a/src/smolagents/__init__.py +++ b/src/smolagents/__init__.py @@ -14,16 +14,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.6.0.dev" +__version__ = "1.10.0.dev0" -from .agents import * +from .agent_types import * # noqa: I001 +from .agents import * # Above noqa avoids a circular dependency due to cli.py from .default_tools import * from .e2b_executor import * from .gradio_ui import * from .local_python_executor import * +from .memory import * from .models import * from .monitoring import * -from .prompts import * from .tools import * -from .types import * from .utils import * +from .cli import * diff --git a/src/smolagents/_function_type_hints_utils.py b/src/smolagents/_function_type_hints_utils.py index 5eb950280..dddd90d0c 100644 --- a/src/smolagents/_function_type_hints_utils.py +++ b/src/smolagents/_function_type_hints_utils.py @@ -24,7 +24,6 @@ import inspect import json -import os import re import types from copy import copy @@ -46,34 +45,31 @@ from .utils import _is_pillow_available -def get_imports(filename: Union[str, os.PathLike]) -> List[str]: +def get_imports(code: str) -> List[str]: """ - Extracts all the libraries (not relative imports this time) that are imported in a file. + Extracts all the libraries (not relative imports) that are imported in a code. Args: - filename (`str` or `os.PathLike`): The module file to inspect. + code (`str`): Code text to inspect. Returns: - `List[str]`: The list of all packages required to use the input module. + `list[str]`: List of all packages required to use the input code. 
""" - with open(filename, "r", encoding="utf-8") as f: - content = f.read() - # filter out try/except block so in custom code we can have try/except imports - content = re.sub(r"\s*try\s*:.*?except.*?:", "", content, flags=re.DOTALL) + code = re.sub(r"\s*try\s*:.*?except.*?:", "", code, flags=re.DOTALL) # filter out imports under is_flash_attn_2_available block for avoid import issues in cpu only environment - content = re.sub( + code = re.sub( r"if is_flash_attn[a-zA-Z0-9_]+available\(\):\s*(from flash_attn\s*.*\s*)+", "", - content, + code, flags=re.MULTILINE, ) - # Imports of the form `import xxx` - imports = re.findall(r"^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE) + # Imports of the form `import xxx` or `import xxx as yyy` + imports = re.findall(r"^\s*import\s+(\S+?)(?:\s+as\s+\S+)?\s*$", code, flags=re.MULTILINE) # Imports of the form `from xxx import yyy` - imports += re.findall(r"^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) + imports += re.findall(r"^\s*from\s+(\S+)\s+import", code, flags=re.MULTILINE) # Only keep the top-level module imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")] return list(set(imports)) @@ -232,7 +228,7 @@ def get_json_schema(func: Callable) -> Dict: args_split_re = re.compile( r""" (?:^|\n) # Match the start of the args block, or a newline -\s*(\w+):\s* # Capture the argument name and strip spacing +\s*(\w+)\s*(?:\([^)]*\))?:\s* # Capture the argument name (ignore the type) and strip spacing (.*?)\s* # Capture the argument description, which can span multiple lines, and strip trailing spacing (?=\n\s*\w+:|\Z) # Stop when you hit the next argument or the end of the block """, diff --git a/src/smolagents/types.py b/src/smolagents/agent_types.py similarity index 99% rename from src/smolagents/types.py rename to src/smolagents/agent_types.py index 7077daa59..b0d4ee1d1 100644 --- a/src/smolagents/types.py +++ b/src/smolagents/agent_types.py @@ -172,7 +172,7 @@ class AgentAudio(AgentType, 
str): """ def __init__(self, value, samplerate=16_000): - if not _is_package_available("soundfile") or not is_torch_available: + if not _is_package_available("soundfile") or not is_torch_available(): raise ModuleNotFoundError( "Please install 'audio' extra to use AgentAudio: `pip install 'smolagents[audio]'`" ) diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index b7111e824..a4d1b08f8 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -14,19 +14,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import importlib import inspect +import json +import os +import re +import tempfile +import textwrap import time from collections import deque -from dataclasses import dataclass -from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union - -from rich import box +from logging import getLogger +from pathlib import Path +from typing import Any, Callable, Dict, Generator, List, Optional, Set, Tuple, TypedDict, Union + +import jinja2 +import yaml +from huggingface_hub import create_repo, metadata_update, snapshot_download, upload_folder +from jinja2 import StrictUndefined, Template from rich.console import Group from rich.panel import Panel from rich.rule import Rule -from rich.syntax import Syntax from rich.text import Text +from .agent_types import AgentAudio, AgentImage, AgentType, handle_agent_output_types from .default_tools import TOOL_MAPPING, FinalAnswerTool from .e2b_executor import E2BExecutor from .local_python_executor import ( @@ -34,131 +44,125 @@ LocalPythonInterpreter, fix_final_answer_code, ) +from .memory import ActionStep, AgentMemory, PlanningStep, SystemPromptStep, TaskStep, ToolCall from .models import ( ChatMessage, MessageRole, + Model, ) -from .monitoring import Monitor -from .prompts import ( - CODE_SYSTEM_PROMPT, - MANAGED_AGENT_PROMPT, - 
PLAN_UPDATE_FINAL_PLAN_REDACTION, - SYSTEM_PROMPT_FACTS, - SYSTEM_PROMPT_FACTS_UPDATE, - SYSTEM_PROMPT_PLAN, - SYSTEM_PROMPT_PLAN_UPDATE, - TOOL_CALLING_SYSTEM_PROMPT, - USER_PROMPT_FACTS_UPDATE, - USER_PROMPT_PLAN, - USER_PROMPT_PLAN_UPDATE, -) -from .tools import ( - DEFAULT_TOOL_DESCRIPTION_TEMPLATE, - Tool, - get_tool_description_with_args, +from .monitoring import ( + YELLOW_HEX, + AgentLogger, + LogLevel, + Monitor, ) -from .types import AgentAudio, AgentImage, handle_agent_output_types +from .tools import Tool from .utils import ( AgentError, AgentExecutionError, AgentGenerationError, - AgentLogger, AgentMaxStepsError, AgentParsingError, - LogLevel, + make_init_file, parse_code_blobs, parse_json_tool_call, truncate_content, ) -@dataclass -class ToolCall: - name: str - arguments: Any - id: str +logger = getLogger(__name__) -class AgentStepLog: - pass +def get_variable_names(self, template: str) -> Set[str]: + pattern = re.compile(r"\{\{([^{}]+)\}\}") + return {match.group(1).strip() for match in pattern.finditer(template)} -@dataclass -class ActionStep(AgentStepLog): - agent_memory: List[Dict[str, str]] | None = None - tool_calls: List[ToolCall] | None = None - start_time: float | None = None - end_time: float | None = None - step_number: int | None = None - error: AgentError | None = None - duration: float | None = None - llm_output: str | None = None - observations: str | None = None - observations_images: List[str] | None = None - action_output: Any = None +def populate_template(template: str, variables: Dict[str, Any]) -> str: + compiled_template = Template(template, undefined=StrictUndefined) + try: + return compiled_template.render(**variables) + except Exception as e: + raise Exception(f"Error during jinja template rendering: {type(e).__name__}: {e}") -@dataclass -class PlanningStep(AgentStepLog): - plan: str - facts: str +class PlanningPromptTemplate(TypedDict): + """ + Prompt templates for the planning step. 
+ + Args: + initial_facts (`str`): Initial facts prompt. + initial_plan (`str`): Initial plan prompt. + update_facts_pre_messages (`str`): Update facts pre-messages prompt. + update_facts_post_messages (`str`): Update facts post-messages prompt. + update_plan_pre_messages (`str`): Update plan pre-messages prompt. + update_plan_post_messages (`str`): Update plan post-messages prompt. + """ + + initial_facts: str + initial_plan: str + update_facts_pre_messages: str + update_facts_post_messages: str + update_plan_pre_messages: str + update_plan_post_messages: str -@dataclass -class TaskStep(AgentStepLog): +class ManagedAgentPromptTemplate(TypedDict): + """ + Prompt templates for the managed agent. + + Args: + task (`str`): Task prompt. + report (`str`): Report prompt. + """ + task: str - task_images: List[str] | None = None + report: str -@dataclass -class SystemPromptStep(AgentStepLog): - system_prompt: str +class FinalAnswerPromptTemplate(TypedDict): + """ + Prompt templates for the final answer. + Args: + pre_messages (`str`): Pre-messages prompt. + post_messages (`str`): Post-messages prompt. + """ -def get_tool_descriptions(tools: Dict[str, Tool], tool_description_template: str) -> str: - return "\n".join([get_tool_description_with_args(tool, tool_description_template) for tool in tools.values()]) + pre_messages: str + post_messages: str -def format_prompt_with_tools(tools: Dict[str, Tool], prompt_template: str, tool_description_template: str) -> str: - tool_descriptions = get_tool_descriptions(tools, tool_description_template) - prompt = prompt_template.replace("{{tool_descriptions}}", tool_descriptions) - if "{{tool_names}}" in prompt: - prompt = prompt.replace( - "{{tool_names}}", - ", ".join([f"'{tool.name}'" for tool in tools.values()]), - ) - return prompt - - -def show_agents_descriptions(managed_agents: Dict): - managed_agents_descriptions = """ -You can also give requests to team members. 
-Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'request', a long string explaining your request. -Given that this team member is a real human, you should be very verbose in your request. -Here is a list of the team members that you can call:""" - for agent in managed_agents.values(): - managed_agents_descriptions += f"\n- {agent.name}: {agent.description}" - return managed_agents_descriptions - - -def format_prompt_with_managed_agents_descriptions( - prompt_template, - managed_agents, - agent_descriptions_placeholder: Optional[str] = None, -) -> str: - if agent_descriptions_placeholder is None: - agent_descriptions_placeholder = "{{managed_agents_descriptions}}" - if agent_descriptions_placeholder not in prompt_template: - raise ValueError( - f"Provided prompt template does not contain the managed agents descriptions placeholder '{agent_descriptions_placeholder}'" - ) - if len(managed_agents.keys()) > 0: - return prompt_template.replace(agent_descriptions_placeholder, show_agents_descriptions(managed_agents)) - else: - return prompt_template.replace(agent_descriptions_placeholder, "") +class PromptTemplates(TypedDict): + """ + Prompt templates for the agent. + Args: + system_prompt (`str`): System prompt. + planning ([`~agents.PlanningPromptTemplate`]): Planning prompt templates. + managed_agent ([`~agents.ManagedAgentPromptTemplate`]): Managed agent prompt templates. + final_answer ([`~agents.FinalAnswerPromptTemplate`]): Final answer prompt templates. 
+ """ -YELLOW_HEX = "#d4b702" + system_prompt: str + planning: PlanningPromptTemplate + managed_agent: ManagedAgentPromptTemplate + final_answer: FinalAnswerPromptTemplate + + +EMPTY_PROMPT_TEMPLATES = PromptTemplates( + system_prompt="", + planning=PlanningPromptTemplate( + initial_facts="", + initial_plan="", + update_facts_pre_messages="", + update_facts_post_messages="", + update_plan_pre_messages="", + update_plan_post_messages="", + ), + managed_agent=ManagedAgentPromptTemplate(task="", report=""), + final_answer=FinalAnswerPromptTemplate(pre_messages="", post_messages=""), +) class MultiStepAgent: @@ -169,215 +173,384 @@ class MultiStepAgent: Args: tools (`list[Tool]`): [`Tool`]s that the agent can use. model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions. - system_prompt (`str`, *optional*): System prompt that will be used to generate the agent's actions. - tool_description_template (`str`, *optional*): Template used to describe the tools in the system prompt. + prompt_templates ([`~agents.PromptTemplates`], *optional*): Prompt templates. max_steps (`int`, default `6`): Maximum number of steps the agent can take to solve the task. tool_parser (`Callable`, *optional*): Function used to parse the tool calls from the LLM output. add_base_tools (`bool`, default `False`): Whether to add the base tools to the agent's tools. - verbosity_level (`int`, default `1`): Level of verbosity of the agent's logs. + verbosity_level (`LogLevel`, default `LogLevel.INFO`): Level of verbosity of the agent's logs. grammar (`dict[str, str]`, *optional*): Grammar used to parse the LLM output. managed_agents (`list`, *optional*): Managed agents that the agent can call. step_callbacks (`list[Callable]`, *optional*): Callbacks that will be called at each step. planning_interval (`int`, *optional*): Interval at which the agent will run a planning step. 
+ name (`str`, *optional*): Necessary for a managed agent only - the name by which this agent can be called. + description (`str`, *optional*): Necessary for a managed agent only - the description of this agent. + provide_run_summary (`bool`, *optional*): Whether to provide a run summary when called as a managed agent. + final_answer_checks (`list`, *optional*): List of Callables to run before returning a final answer for checking validity. """ def __init__( self, tools: List[Tool], model: Callable[[List[Dict[str, str]]], ChatMessage], - system_prompt: Optional[str] = None, - tool_description_template: Optional[str] = None, + prompt_templates: Optional[PromptTemplates] = None, max_steps: int = 6, tool_parser: Optional[Callable] = None, add_base_tools: bool = False, - verbosity_level: int = 1, + verbosity_level: LogLevel = LogLevel.INFO, grammar: Optional[Dict[str, str]] = None, managed_agents: Optional[List] = None, step_callbacks: Optional[List[Callable]] = None, planning_interval: Optional[int] = None, + name: Optional[str] = None, + description: Optional[str] = None, + provide_run_summary: bool = False, + final_answer_checks: Optional[List[Callable]] = None, ): - if system_prompt is None: - system_prompt = CODE_SYSTEM_PROMPT - if tool_parser is None: - tool_parser = parse_json_tool_call self.agent_name = self.__class__.__name__ self.model = model - self.system_prompt_template = system_prompt - self.tool_description_template = ( - tool_description_template if tool_description_template else DEFAULT_TOOL_DESCRIPTION_TEMPLATE - ) + self.prompt_templates = prompt_templates or EMPTY_PROMPT_TEMPLATES self.max_steps = max_steps - self.tool_parser = tool_parser + self.step_number = 0 + self.tool_parser = tool_parser or parse_json_tool_call self.grammar = grammar self.planning_interval = planning_interval self.state = {} + self.name = name + self.description = description + self.provide_run_summary = provide_run_summary + self.final_answer_checks = final_answer_checks - 
self.managed_agents = {} - if managed_agents is not None: - self.managed_agents = {agent.name: agent for agent in managed_agents} - - for tool in tools: - assert isinstance(tool, Tool), f"This element is not of class Tool: {str(tool)}" - self.tools = {tool.name: tool for tool in tools} - if add_base_tools: - for tool_name, tool_class in TOOL_MAPPING.items(): - if tool_name != "python_interpreter" or self.__class__.__name__ == "ToolCallingAgent": - self.tools[tool_name] = tool_class() - self.tools["final_answer"] = FinalAnswerTool() + self._setup_managed_agents(managed_agents) + self._setup_tools(tools, add_base_tools) + self._validate_tools_and_managed_agents(tools, managed_agents) self.system_prompt = self.initialize_system_prompt() self.input_messages = None - self.logs = [] self.task = None + self.memory = AgentMemory(self.system_prompt) self.logger = AgentLogger(level=verbosity_level) self.monitor = Monitor(self.model, self.logger) self.step_callbacks = step_callbacks if step_callbacks is not None else [] self.step_callbacks.append(self.monitor.update_metrics) - def initialize_system_prompt(self): - self.system_prompt = format_prompt_with_tools( - self.tools, - self.system_prompt_template, - self.tool_description_template, - ) - self.system_prompt = format_prompt_with_managed_agents_descriptions(self.system_prompt, self.managed_agents) + def _setup_managed_agents(self, managed_agents): + self.managed_agents = {} + if managed_agents: + assert all(agent.name and agent.description for agent in managed_agents), ( + "All managed agents need both a name and a description!" 
+ ) + self.managed_agents = {agent.name: agent for agent in managed_agents} - return self.system_prompt + def _setup_tools(self, tools, add_base_tools): + assert all(isinstance(tool, Tool) for tool in tools), "All elements must be instance of Tool (or a subclass)" + self.tools = {tool.name: tool for tool in tools} + if add_base_tools: + self.tools.update( + { + name: cls() + for name, cls in TOOL_MAPPING.items() + if name != "python_interpreter" or self.__class__.__name__ == "ToolCallingAgent" + } + ) + self.tools["final_answer"] = FinalAnswerTool() - def write_inner_memory_from_logs(self, summary_mode: bool = False) -> List[Dict[str, str]]: + def _validate_tools_and_managed_agents(self, tools, managed_agents): + tool_and_managed_agent_names = [tool.name for tool in tools] + if managed_agents is not None: + for agent in managed_agents: + tool_and_managed_agent_names.append(agent.name) + for tool in agent.tools.values(): + if tool.name != "final_answer": + tool_and_managed_agent_names.append(tool.name) + if len(tool_and_managed_agent_names) != len(set(tool_and_managed_agent_names)): + raise ValueError( + "Each tool or managed_agent should have a unique name! You passed these duplicate names: " + f"{[name for name in tool_and_managed_agent_names if tool_and_managed_agent_names.count(name) > 1]}" + ) + + def run( + self, + task: str, + stream: bool = False, + reset: bool = True, + images: Optional[List[str]] = None, + additional_args: Optional[Dict] = None, + ): """ - Reads past llm_outputs, actions, and observations or errors from the logs into a series of messages - that can be used as input to the LLM. + Run the agent for the given task. Args: - summary_mode (`bool`): Whether to write a summary of the logs or the full logs. + task (`str`): Task to perform. + stream (`bool`): Whether to run in a streaming way. + reset (`bool`): Whether to reset the conversation or keep it going from previous run. + images (`list[str]`, *optional*): Paths to image(s). 
+ additional_args (`dict`): Any other variables that you want to pass to the agent run, for instance images or dataframes. Give them clear names! + + Example: + ```py + from smolagents import CodeAgent + agent = CodeAgent(tools=[]) + agent.run("What is the result of 2 power 3.7384?") + ``` """ - memory = [] - for i, step_log in enumerate(self.logs): - if isinstance(step_log, SystemPromptStep): - if not summary_mode: - thought_message = { - "role": MessageRole.SYSTEM, - "content": [{"type": "text", "text": step_log.system_prompt.strip()}], - } - memory.append(thought_message) - elif isinstance(step_log, PlanningStep): - thought_message = { - "role": MessageRole.ASSISTANT, - "content": "[FACTS LIST]:\n" + step_log.facts.strip(), - } - memory.append(thought_message) + self.task = task + if additional_args is not None: + self.state.update(additional_args) + self.task += f""" +You have been provided with these additional arguments, that you can access using the keys as variables in your python code: +{str(additional_args)}.""" - if not summary_mode: - thought_message = { - "role": MessageRole.ASSISTANT, - "content": "[PLAN]:\n" + step_log.plan.strip(), + self.system_prompt = self.initialize_system_prompt() + self.memory.system_prompt = SystemPromptStep(system_prompt=self.system_prompt) + if reset: + self.memory.reset() + self.monitor.reset() + + self.logger.log_task( + content=self.task.strip(), + subtitle=f"{type(self.model).__name__} - {(self.model.model_id if hasattr(self.model, 'model_id') else '')}", + level=LogLevel.INFO, + title=self.name if hasattr(self, "name") else None, + ) + + self.memory.steps.append(TaskStep(task=self.task, task_images=images)) + + if stream: + # The steps are returned as they are executed through a generator to iterate on. + return self._run(task=self.task, images=images) + # Outputs are returned only at the end as a string. 
We only look at the last step + return deque(self._run(task=self.task, images=images), maxlen=1)[0] + + def _run(self, task: str, images: List[str] | None = None) -> Generator[ActionStep | AgentType, None, None]: + final_answer = None + self.step_number = 1 + while final_answer is None and self.step_number <= self.max_steps: + step_start_time = time.time() + memory_step = self._create_memory_step(step_start_time, images) + try: + final_answer = self._execute_step(task, memory_step) + except AgentError as e: + memory_step.error = e + finally: + self._finalize_step(memory_step, step_start_time) + yield memory_step + self.step_number += 1 + + if final_answer is None and self.step_number == self.max_steps + 1: + final_answer = self._handle_max_steps_reached(task, images, step_start_time) + yield memory_step + yield handle_agent_output_types(final_answer) + + def _create_memory_step(self, step_start_time: float, images: List[str] | None) -> ActionStep: + return ActionStep(step_number=self.step_number, start_time=step_start_time, observations_images=images) + + def _execute_step(self, task: str, memory_step: ActionStep) -> Union[None, Any]: + if self.planning_interval is not None and self.step_number % self.planning_interval == 1: + self.planning_step(task, is_first_step=(self.step_number == 1), step=self.step_number) + self.logger.log_rule(f"Step {self.step_number}", level=LogLevel.INFO) + final_answer = self.step(memory_step) + if final_answer is not None and self.final_answer_checks: + self._validate_final_answer(final_answer) + return final_answer + + def _validate_final_answer(self, final_answer: Any): + for check_function in self.final_answer_checks: + try: + assert check_function(final_answer, self.memory) + except Exception as e: + raise AgentError(f"Check {check_function.__name__} failed with error: {e}", self.logger) + + def _finalize_step(self, memory_step: ActionStep, step_start_time: float): + memory_step.end_time = time.time() + memory_step.duration = 
memory_step.end_time - step_start_time + self.memory.steps.append(memory_step) + for callback in self.step_callbacks: + # For compatibility with old callbacks that don't take the agent as an argument + callback(memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( + memory_step, agent=self + ) + + def _handle_max_steps_reached(self, task: str, images: List[str], step_start_time: float) -> Any: + final_answer = self.provide_final_answer(task, images) + final_memory_step = ActionStep( + step_number=self.step_number, error=AgentMaxStepsError("Reached max steps.", self.logger) + ) + final_memory_step.action_output = final_answer + final_memory_step.end_time = time.time() + final_memory_step.duration = final_memory_step.end_time - step_start_time + self.memory.steps.append(final_memory_step) + for callback in self.step_callbacks: + callback(final_memory_step) if len(inspect.signature(callback).parameters) == 1 else callback( + final_memory_step, agent=self + ) + return final_answer + + def planning_step(self, task, is_first_step: bool, step: int) -> None: + input_messages, facts_message, plan_message = ( + self._generate_initial_plan(task) if is_first_step else self._generate_updated_plan(task, step) + ) + self._record_planning_step(input_messages, facts_message, plan_message, is_first_step) + + def _generate_initial_plan(self, task: str) -> Tuple[ChatMessage, ChatMessage]: + input_messages = [ + { + "role": MessageRole.USER, + "content": [ + { + "type": "text", + "text": populate_template( + self.prompt_templates["planning"]["initial_facts"], variables={"task": task} + ), } - memory.append(thought_message) + ], + }, + ] + facts_message = self.model(input_messages) - elif isinstance(step_log, TaskStep): - task_message = { - "role": MessageRole.USER, - "content": [{"type": "text", "text": f"New task:\n{step_log.task}"}], + message_prompt_plan = { + "role": MessageRole.USER, + "content": [ + { + "type": "text", + "text": populate_template( + 
self.prompt_templates["planning"]["initial_plan"], + variables={ + "task": task, + "tools": self.tools, + "managed_agents": self.managed_agents, + "answer_facts": facts_message.content, + }, + ), } - if step_log.task_images: - for image in step_log.task_images: - task_message["content"].append({"type": "image", "image": image}) - memory.append(task_message) - - elif isinstance(step_log, ActionStep): - if step_log.llm_output is not None and not summary_mode: - thought_message = { - "role": MessageRole.ASSISTANT, - "content": [{"type": "text", "text": step_log.llm_output.strip()}], - } - memory.append(thought_message) - if step_log.tool_calls is not None: - tool_call_message = { - "role": MessageRole.ASSISTANT, - "content": [ - { - "type": "text", - "text": str( - [ - { - "id": tool_call.id, - "type": "function", - "function": { - "name": tool_call.name, - "arguments": tool_call.arguments, - }, - } - for tool_call in step_log.tool_calls - ] - ), - } - ], - } - memory.append(tool_call_message) - if step_log.error is not None: - error_message = { - "role": MessageRole.ASSISTANT, - "content": [ - { - "type": "text", - "text": ( - "Error:\n" - + str(step_log.error) - + "\nNow let's retry: take care not to repeat previous errors! 
If you have retried several times, try a completely different approach.\n" - ), - } - ], - } - memory.append(error_message) - if step_log.observations is not None: - if step_log.tool_calls: - tool_call_reference = f"Call id: {(step_log.tool_calls[0].id if getattr(step_log.tool_calls[0], 'id') else 'call_0')}\n" - else: - tool_call_reference = "" - text_observations = f"Observation:\n{step_log.observations}" - tool_response_message = { - "role": MessageRole.TOOL_RESPONSE, - "content": [{"type": "text", "text": tool_call_reference + text_observations}], - } - memory.append(tool_response_message) - if step_log.observations_images: - thought_message_image = { - "role": MessageRole.USER, - "content": [{"type": "text", "text": "Here are the observed images:"}] - + [ - { - "type": "image", - "image": image, - } - for image in step_log.observations_images - ], - } - memory.append(thought_message_image) + ], + } + plan_message = self.model([message_prompt_plan], stop_sequences=[""]) + return input_messages, facts_message, plan_message + + def _generate_updated_plan(self, task: str, step: int) -> Tuple[ChatMessage, ChatMessage]: + # Do not take the system prompt message from the memory + # summary_mode=False: Do not take previous plan steps to avoid influencing the new plan + memory_messages = self.write_memory_to_messages()[1:] + facts_update_pre = { + "role": MessageRole.SYSTEM, + "content": [{"type": "text", "text": self.prompt_templates["planning"]["update_facts_pre_messages"]}], + } + facts_update_post = { + "role": MessageRole.USER, + "content": [{"type": "text", "text": self.prompt_templates["planning"]["update_facts_post_messages"]}], + } + input_messages = [facts_update_pre] + memory_messages + [facts_update_post] + facts_message = self.model(input_messages) + + update_plan_pre = { + "role": MessageRole.SYSTEM, + "content": [ + { + "type": "text", + "text": populate_template( + self.prompt_templates["planning"]["update_plan_pre_messages"], variables={"task": task} + 
), + } + ], + } + update_plan_post = { + "role": MessageRole.USER, + "content": [ + { + "type": "text", + "text": populate_template( + self.prompt_templates["planning"]["update_plan_post_messages"], + variables={ + "task": task, + "tools": self.tools, + "managed_agents": self.managed_agents, + "facts_update": facts_message.content, + "remaining_steps": (self.max_steps - step), + }, + ), + } + ], + } + plan_message = self.model( + [update_plan_pre] + memory_messages + [update_plan_post], stop_sequences=[""] + ) + return input_messages, facts_message, plan_message - return memory + def _record_planning_step( + self, input_messages: list, facts_message: ChatMessage, plan_message: ChatMessage, is_first_step: bool + ) -> None: + if is_first_step: + facts = textwrap.dedent(f"""Here are the facts that I know so far:\n```\n{facts_message.content}\n```""") + plan = textwrap.dedent( + f"""Here is the plan of action that I will follow to solve the task:\n```\n{plan_message.content}\n```""" + ) + log_message = "Initial plan" + else: + facts = textwrap.dedent( + f"""Here is the updated list of the facts that I know:\n```\n{facts_message.content}\n```""" + ) + plan = textwrap.dedent( + f"""I still need to solve the task I was given:\n```\n{self.task}\n```\n\nHere is my new/updated plan of action to solve the task:\n```\n{plan_message.content}\n```""" + ) + log_message = "Updated plan" + self.memory.steps.append( + PlanningStep( + model_input_messages=input_messages, + facts=facts, + plan=plan, + model_output_message_plan=plan_message, + model_output_message_facts=facts_message, + ) + ) + self.logger.log(Rule(f"[bold]{log_message}", style="orange"), Text(plan), level=LogLevel.INFO) - def get_succinct_logs(self): - return [{key: value for key, value in log.items() if key != "agent_memory"} for log in self.logs] + @property + def logs(self): + logger.warning( + "The 'logs' attribute is deprecated and will soon be removed. Please use 'self.memory.steps' instead." 
+ ) + return [self.memory.system_prompt] + self.memory.steps - def extract_action(self, llm_output: str, split_token: str) -> Tuple[str, str]: + def initialize_system_prompt(self): + """To be implemented in child classes""" + pass + + def write_memory_to_messages( + self, + summary_mode: Optional[bool] = False, + ) -> List[Dict[str, str]]: + """ + Reads past llm_outputs, actions, and observations or errors from the memory into a series of messages + that can be used as input to the LLM. Adds a number of keywords (such as PLAN, error, etc) to help + the LLM. + """ + messages = self.memory.system_prompt.to_messages(summary_mode=summary_mode) + for memory_step in self.memory.steps: + messages.extend(memory_step.to_messages(summary_mode=summary_mode)) + return messages + + def visualize(self): + """Creates a rich tree visualization of the agent's structure.""" + self.logger.visualize_agent_tree(self) + + def extract_action(self, model_output: str, split_token: str) -> Tuple[str, str]: """ Parse action from the LLM output Args: - llm_output (`str`): Output of the LLM + model_output (`str`): Output of the LLM split_token (`str`): Separator for the action. Should match the example in the system prompt. """ try: - split = llm_output.split(split_token) + split = model_output.split(split_token) rationale, action = ( split[-2], split[-1], ) # NOTE: using indexes starting from the end solves for when you have more than one split_token in the output except Exception: raise AgentParsingError( - f"No '{split_token}' token provided in your output.\nYour output:\n{llm_output}\n. Be sure to include an action, prefaced with '{split_token}'!", + f"No '{split_token}' token provided in your output.\nYour output:\n{model_output}\n. 
Be sure to include an action, prefaced with '{split_token}'!", self.logger, ) return rationale.strip(), action.strip() @@ -393,47 +566,36 @@ def provide_final_answer(self, task: str, images: Optional[list[str]]) -> str: Returns: `str`: Final answer to the task. """ + messages = [ + { + "role": MessageRole.SYSTEM, + "content": [ + { + "type": "text", + "text": self.prompt_templates["final_answer"]["pre_messages"], + } + ], + } + ] if images: - self.input_messages[0]["content"] = [ - { - "type": "text", - "text": "An agent tried to answer a user query but it got stuck and failed to do so. You are tasked with providing an answer instead. Here is the agent's memory:", - } - ] - self.input_messages[0]["content"].append({"type": "image"}) - self.input_messages += self.write_inner_memory_from_logs()[1:] - self.input_messages += [ - { - "role": MessageRole.USER, - "content": [ - { - "type": "text", - "text": f"Based on the above, please provide an answer to the following user request:\n{task}", - } - ], - } - ] - else: - self.input_messages[0]["content"] = [ - { - "type": "text", - "text": "An agent tried to answer a user query but it got stuck and failed to do so. You are tasked with providing an answer instead. 
Here is the agent's memory:", - } - ] - self.input_messages += self.write_inner_memory_from_logs()[1:] - self.input_messages += [ - { - "role": MessageRole.USER, - "content": [ - { - "type": "text", - "text": f"Based on the above, please provide an answer to the following user request:\n{task}", - } - ], - } - ] + messages[0]["content"].append({"type": "image"}) + messages += self.write_memory_to_messages()[1:] + messages += [ + { + "role": MessageRole.USER, + "content": [ + { + "type": "text", + "text": populate_template( + self.prompt_templates["final_answer"]["post_messages"], variables={"task": task} + ), + } + ], + } + ] try: - return self.model(self.input_messages).content + chat_message: ChatMessage = self.model(messages) + return chat_message.content except Exception as e: return f"Error in generating final LLM output:\n{e}" @@ -471,10 +633,10 @@ def execute_tool_call(self, tool_name: str, arguments: Union[Dict[str, str], str return observation except Exception as e: if tool_name in self.tools: - tool_description = get_tool_description_with_args(available_tools[tool_name]) + tool = self.tools[tool_name] error_msg = ( - f"Error in tool call execution: {e}\nYou should only use this tool with a correct input.\n" - f"As a reminder, this tool's description is the following:\n{tool_description}" + f"Error when executing tool {tool_name} with arguments {arguments}: {type(e).__name__}: {e}\nYou should only use this tool with a correct input.\n" + f"As a reminder, this tool's description is the following: '{tool.description}'.\nIt takes inputs: {tool.inputs} and returns output type {tool.output_type}" ) raise AgentExecutionError(error_msg, self.logger) elif tool_name in self.managed_agents: @@ -484,261 +646,351 @@ def execute_tool_call(self, tool_name: str, arguments: Union[Dict[str, str], str ) raise AgentExecutionError(error_msg, self.logger) - def step(self, log_entry: ActionStep) -> Union[None, Any]: + def step(self, memory_step: ActionStep) -> Union[None, Any]: 
"""To be implemented in children classes. Should return either None if the step is not final.""" pass - def run( - self, - task: str, - stream: bool = False, - reset: bool = True, - single_step: bool = False, - images: Optional[List[str]] = None, - additional_args: Optional[Dict] = None, - ): - """ - Run the agent for the given task. + def replay(self, detailed: bool = False): + """Prints a pretty replay of the agent's steps. Args: - task (`str`): Task to perform. - stream (`bool`): Whether to run in a streaming way. - reset (`bool`): Whether to reset the conversation or keep it going from previous run. - single_step (`bool`): Whether to run the agent in one-shot fashion. - images (`list[str]`, *optional*): Paths to image(s). - additional_args (`dict`): Any other variables that you want to pass to the agent run, for instance images or dataframes. Give them clear names! + detailed (bool, optional): If True, also displays the memory at each step. Defaults to False. + Careful: will increase log length exponentially. Use only for debugging. + """ + self.memory.replay(self.logger, detailed=detailed) - Example: - ```py - from smolagents import CodeAgent - agent = CodeAgent(tools=[]) - agent.run("What is the result of 2 power 3.7384?") - ``` + def __call__(self, task: str, **kwargs): + """Adds additional prompting for the managed agent, runs it, and wraps the output. + + This method is called only by a managed agent. 
""" + full_task = populate_template( + self.prompt_templates["managed_agent"]["task"], + variables=dict(name=self.name, task=task), + ) + report = self.run(full_task, **kwargs) + answer = populate_template( + self.prompt_templates["managed_agent"]["report"], variables=dict(name=self.name, final_answer=report) + ) + if self.provide_run_summary: + answer += "\n\nFor more detail, find below a summary of this agent's work:\n\n" + for message in self.write_memory_to_messages(summary_mode=True): + content = message["content"] + answer += "\n" + truncate_content(str(content)) + "\n---" + answer += "\n" + return answer - self.task = task - if additional_args is not None: - self.state.update(additional_args) - self.task += f""" -You have been provided with these additional arguments, that you can access using the keys as variables in your python code: -{str(additional_args)}.""" + def save(self, output_dir: str, relative_path: Optional[str] = None): + """ + Saves the relevant code files for your agent. This will copy the code of your agent in `output_dir` as well as autogenerate: - self.initialize_system_prompt() - system_prompt_step = SystemPromptStep(system_prompt=self.system_prompt) + - a `tools` folder containing the logic for each of the tools under `tools/{tool_name}.py`. + - a `managed_agents` folder containing the logic for each of the managed agents. + - an `agent.json` file containing a dictionary representing your agent. + - a `prompt.yaml` file containing the prompt templates used by your agent. 
+ - an `app.py` file providing a UI for your agent when it is exported to a Space with `agent.push_to_hub()` + - a `requirements.txt` containing the names of the modules used by your tool (as detected when inspecting its + code) - if reset: - self.logs = [] - self.logs.append(system_prompt_step) - self.monitor.reset() - else: - if len(self.logs) > 0: - self.logs[0] = system_prompt_step - else: - self.logs.append(system_prompt_step) + Args: + output_dir (`str`): The folder in which you want to save your tool. + """ + make_init_file(output_dir) + + # Recursively save managed agents + if self.managed_agents: + make_init_file(os.path.join(output_dir, "managed_agents")) + for agent_name, agent in self.managed_agents.items(): + agent_suffix = f"managed_agents.{agent_name}" + if relative_path: + agent_suffix = relative_path + "." + agent_suffix + agent.save(os.path.join(output_dir, "managed_agents", agent_name), relative_path=agent_suffix) + + class_name = self.__class__.__name__ + + # Save tools to different .py files + for tool in self.tools.values(): + make_init_file(os.path.join(output_dir, "tools")) + tool.save(os.path.join(output_dir, "tools"), tool_file_name=tool.name, make_gradio_app=False) + + # Save prompts to yaml + yaml_prompts = yaml.safe_dump( + self.prompt_templates, + default_style="|", # This forces block literals for all strings + default_flow_style=False, + width=float("inf"), + sort_keys=False, + allow_unicode=True, + indent=2, + ) - self.logger.log( - Panel( - f"\n[bold]{self.task.strip()}\n", - title="[bold]New run", - subtitle=f"{type(self.model).__name__} - {(self.model.model_id if hasattr(self.model, 'model_id') else '')}", - border_style=YELLOW_HEX, - subtitle_align="left", - ), - level=LogLevel.INFO, + with open(os.path.join(output_dir, "prompts.yaml"), "w", encoding="utf-8") as f: + f.write(yaml_prompts) + + # Save agent dictionary to json + agent_dict = self.to_dict() + agent_dict["tools"] = [tool.name for tool in self.tools.values()] + with 
open(os.path.join(output_dir, "agent.json"), "w", encoding="utf-8") as f: + json.dump(agent_dict, f, indent=4) + + # Save requirements + with open(os.path.join(output_dir, "requirements.txt"), "w", encoding="utf-8") as f: + f.writelines(f"{r}\n" for r in agent_dict["requirements"]) + + # Make agent.py file with Gradio UI + agent_name = f"agent_{self.name}" if getattr(self, "name", None) else "agent" + managed_agent_relative_path = relative_path + "." if relative_path is not None else "" + app_template = textwrap.dedent(""" + import yaml + import os + from smolagents import GradioUI, {{ class_name }}, {{ agent_dict['model']['class'] }} + + # Get current directory path + CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) + + {% for tool in tools.values() -%} + from {{managed_agent_relative_path}}tools.{{ tool.name }} import {{ tool.__class__.__name__ }} as {{ tool.name | camelcase }} + {% endfor %} + {% for managed_agent in managed_agents.values() -%} + from {{managed_agent_relative_path}}managed_agents.{{ managed_agent.name }}.app import agent_{{ managed_agent.name }} + {% endfor %} + + model = {{ agent_dict['model']['class'] }}( + {% for key in agent_dict['model']['data'] if key not in ['class', 'last_input_token_count', 'last_output_token_count'] -%} + {{ key }}={{ agent_dict['model']['data'][key]|repr }}, + {% endfor %}) + + {% for tool in tools.values() -%} + {{ tool.name }} = {{ tool.name | camelcase }}() + {% endfor %} + + with open(os.path.join(CURRENT_DIR, "prompts.yaml"), 'r') as stream: + prompt_templates = yaml.safe_load(stream) + + {{ agent_name }} = {{ class_name }}( + model=model, + tools=[{% for tool_name in tools.keys() if tool_name != "final_answer" %}{{ tool_name }}{% if not loop.last %}, {% endif %}{% endfor %}], + managed_agents=[{% for subagent_name in managed_agents.keys() %}agent_{{ subagent_name }}{% if not loop.last %}, {% endif %}{% endfor %}], + {% for attribute_name, value in agent_dict.items() if attribute_name not in ["model", 
"tools", "prompt_templates", "authorized_imports", "managed_agents", "requirements"] -%} + {{ attribute_name }}={{ value|repr }}, + {% endfor %}prompt_templates=prompt_templates + ) + if __name__ == "__main__": + GradioUI({{ agent_name }}).launch() + """).strip() + template_env = jinja2.Environment(loader=jinja2.BaseLoader(), undefined=jinja2.StrictUndefined) + template_env.filters["repr"] = repr + template_env.filters["camelcase"] = lambda value: "".join(word.capitalize() for word in value.split("_")) + template = template_env.from_string(app_template) + + # Render the app.py file from Jinja2 template + app_text = template.render( + { + "agent_name": agent_name, + "class_name": class_name, + "agent_dict": agent_dict, + "tools": self.tools, + "managed_agents": self.managed_agents, + "managed_agent_relative_path": managed_agent_relative_path, + } ) - self.logs.append(TaskStep(task=self.task, task_images=images)) - if single_step: - step_start_time = time.time() - step_log = ActionStep(start_time=step_start_time, observations_images=images) - step_log.end_time = time.time() - step_log.duration = step_log.end_time - step_start_time + with open(os.path.join(output_dir, "app.py"), "w", encoding="utf-8") as f: + f.write(app_text + "\n") # Append newline at the end + + def to_dict(self) -> Dict[str, Any]: + """Converts agent into a dictionary.""" + # TODO: handle serializing step_callbacks and final_answer_checks + for attr in ["final_answer_checks", "step_callbacks"]: + if getattr(self, attr, None): + self.logger.log(f"This agent has {attr}: they will be ignored by this method.", LogLevel.INFO) + + tool_dicts = [tool.to_dict() for tool in self.tools.values()] + tool_requirements = {req for tool in self.tools.values() for req in tool.to_dict()["requirements"]} + managed_agents_requirements = { + req for managed_agent in self.managed_agents.values() for req in managed_agent.to_dict()["requirements"] + } + requirements = tool_requirements | managed_agents_requirements + if 
hasattr(self, "authorized_imports"): + requirements.update( + {package.split(".")[0] for package in self.authorized_imports if package not in BASE_BUILTIN_MODULES} + ) - # Run the agent's step - result = self.step(step_log) - return result + agent_dict = { + "tools": tool_dicts, + "model": { + "class": self.model.__class__.__name__, + "data": self.model.to_dict(), + }, + "managed_agents": { + managed_agent.name: managed_agent.__class__.__name__ for managed_agent in self.managed_agents.values() + }, + "prompt_templates": self.prompt_templates, + "max_steps": self.max_steps, + "verbosity_level": int(self.logger.level), + "grammar": self.grammar, + "planning_interval": self.planning_interval, + "name": self.name, + "description": self.description, + "requirements": list(requirements), + } + if hasattr(self, "authorized_imports"): + agent_dict["authorized_imports"] = self.authorized_imports + if hasattr(self, "use_e2b_executor"): + agent_dict["use_e2b_executor"] = self.use_e2b_executor + if hasattr(self, "max_print_outputs_length"): + agent_dict["max_print_outputs_length"] = self.max_print_outputs_length + return agent_dict + + @classmethod + def from_hub( + cls, + repo_id: str, + token: Optional[str] = None, + trust_remote_code: bool = False, + **kwargs, + ): + """ + Loads an agent defined on the Hub. - if stream: - # The steps are returned as they are executed through a generator to iterate on. - return self._run(task=self.task, images=images) - # Outputs are returned only at the end as a string. We only look at the last step - return deque(self._run(task=self.task, images=images), maxlen=1)[0] + - def _run(self, task: str, images: List[str] | None = None) -> Generator[str, None, None]: - """ - Run the agent in streaming mode and returns a generator of all the steps. + Loading a tool from the Hub means that you'll download the tool and execute it locally. 
+ ALWAYS inspect the tool you're downloading before loading it within your runtime, as you would do when + installing a package using pip/npm/apt. + + Args: - task (`str`): Task to perform. - images (`list[str]`): Paths to image(s). + repo_id (`str`): + The name of the repo on the Hub where your tool is defined. + token (`str`, *optional*): + The token to identify you on hf.co. If unset, will use the token generated when running + `huggingface-cli login` (stored in `~/.huggingface`). + trust_remote_code(`bool`, *optional*, defaults to False): + This flags marks that you understand the risk of running remote code and that you trust this tool. + If not setting this to True, loading the tool from Hub will fail. + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments that will be split in two: all arguments relevant to the Hub (such as + `cache_dir`, `revision`, `subfolder`) will be used when downloading the files for your agent, and the + others will be passed along to its init. """ - final_answer = None - self.step_number = 0 - while final_answer is None and self.step_number < self.max_steps: - step_start_time = time.time() - step_log = ActionStep( - step_number=self.step_number, - start_time=step_start_time, - observations_images=images, + if not trust_remote_code: + raise ValueError( + "Loading an agent from Hub requires to acknowledge you trust its code: to do so, pass `trust_remote_code=True`." ) - try: - if self.planning_interval is not None and self.step_number % self.planning_interval == 0: - self.planning_step( - task, - is_first_step=(self.step_number == 0), - step=self.step_number, - ) - self.logger.log( - Rule( - f"[bold]Step {self.step_number}", - characters="━", - style=YELLOW_HEX, - ), - level=LogLevel.INFO, - ) - - # Run one step! 
- final_answer = self.step(step_log) - except AgentError as e: - step_log.error = e - finally: - step_log.end_time = time.time() - step_log.duration = step_log.end_time - step_start_time - self.logs.append(step_log) - for callback in self.step_callbacks: - # For compatibility with old callbacks that don't take the agent as an argument - if len(inspect.signature(callback).parameters) == 1: - callback(step_log) - else: - callback(step_log=step_log, agent=self) - self.step_number += 1 - yield step_log - if final_answer is None and self.step_number == self.max_steps: - error_message = "Reached max steps." - final_step_log = ActionStep( - step_number=self.step_number, error=AgentMaxStepsError(error_message, self.logger) - ) - self.logs.append(final_step_log) - final_answer = self.provide_final_answer(task, images) - self.logger.log(Text(f"Final answer: {final_answer}"), level=LogLevel.INFO) - final_step_log.action_output = final_answer - final_step_log.end_time = time.time() - final_step_log.duration = step_log.end_time - step_start_time - for callback in self.step_callbacks: - # For compatibility with old callbacks that don't take the agent as an argument - if len(inspect.signature(callback).parameters) == 1: - callback(final_step_log) - else: - callback(step_log=final_step_log, agent=self) - yield final_step_log + # Get the agent's Hub folder. + download_kwargs = {"token": token, "repo_type": "space"} | { + key: kwargs.pop(key) + for key in [ + "cache_dir", + "force_download", + "proxies", + "revision", + "local_files_only", + ] + if key in kwargs + } - yield handle_agent_output_types(final_answer) + download_folder = Path(snapshot_download(repo_id=repo_id, **download_kwargs)) + return cls.from_folder(download_folder, **kwargs) - def planning_step(self, task, is_first_step: bool, step: int) -> None: - """ - Used periodically by the agent to plan the next steps to reach the objective. 
+ @classmethod + def from_folder(cls, folder: Union[str, Path], **kwargs): + """Loads an agent from a local folder. Args: - task (`str`): Task to perform. - is_first_step (`bool`): If this step is not the first one, the plan should be an update over a previous plan. - step (`int`): The number of the current step, used as an indication for the LLM. + folder (`str` or `Path`): The folder where the agent is saved. + **kwargs: Additional keyword arguments that will be passed to the agent's init. """ - if is_first_step: - message_prompt_facts = { - "role": MessageRole.SYSTEM, - "content": SYSTEM_PROMPT_FACTS, - } - message_prompt_task = { - "role": MessageRole.USER, - "content": f"""Here is the task: -``` -{task} -``` -Now begin!""", - } + folder = Path(folder) + agent_dict = json.loads((folder / "agent.json").read_text()) - answer_facts = self.model([message_prompt_facts, message_prompt_task]).content + # Recursively get managed agents + managed_agents = [] + for managed_agent_name, managed_agent_class in agent_dict["managed_agents"].items(): + agent_cls = getattr(importlib.import_module("smolagents.agents"), managed_agent_class) + managed_agents.append(agent_cls.from_folder(folder / "managed_agents" / managed_agent_name)) - message_system_prompt_plan = { - "role": MessageRole.SYSTEM, - "content": SYSTEM_PROMPT_PLAN, - } - message_user_prompt_plan = { - "role": MessageRole.USER, - "content": USER_PROMPT_PLAN.format( - task=task, - tool_descriptions=get_tool_descriptions(self.tools, self.tool_description_template), - managed_agents_descriptions=(show_agents_descriptions(self.managed_agents)), - answer_facts=answer_facts, - ), - } - answer_plan = self.model( - [message_system_prompt_plan, message_user_prompt_plan], - stop_sequences=[""], - ).content - - final_plan_redaction = f"""Here is the plan of action that I will follow to solve the task: -``` -{answer_plan} -```""" - final_facts_redaction = f"""Here are the facts that I know so far: -``` -{answer_facts} 
-```""".strip() - self.logs.append(PlanningStep(plan=final_plan_redaction, facts=final_facts_redaction)) - self.logger.log( - Rule("[bold]Initial plan", style="orange"), - Text(final_plan_redaction), - level=LogLevel.INFO, - ) - else: # update plan - agent_memory = self.write_inner_memory_from_logs( - summary_mode=False - ) # This will not log the plan but will log facts + tools = [] + for tool_name in agent_dict["tools"]: + tool_code = (folder / "tools" / f"{tool_name}.py").read_text() + tools.append(Tool.from_code(tool_code)) - # Redact updated facts - facts_update_system_prompt = { - "role": MessageRole.SYSTEM, - "content": SYSTEM_PROMPT_FACTS_UPDATE, - } - facts_update_message = { - "role": MessageRole.USER, - "content": USER_PROMPT_FACTS_UPDATE, - } - facts_update = self.model([facts_update_system_prompt] + agent_memory + [facts_update_message]).content + model_class: Model = getattr(importlib.import_module("smolagents.models"), agent_dict["model"]["class"]) + model = model_class.from_dict(agent_dict["model"]["data"]) - # Redact updated plan - plan_update_message = { - "role": MessageRole.SYSTEM, - "content": SYSTEM_PROMPT_PLAN_UPDATE.format(task=task), - } - plan_update_message_user = { - "role": MessageRole.USER, - "content": USER_PROMPT_PLAN_UPDATE.format( - task=task, - tool_descriptions=get_tool_descriptions(self.tools, self.tool_description_template), - managed_agents_descriptions=(show_agents_descriptions(self.managed_agents)), - facts_update=facts_update, - remaining_steps=(self.max_steps - step), - ), - } - plan_update = self.model( - [plan_update_message] + agent_memory + [plan_update_message_user], - stop_sequences=[""], - ).content - - # Log final facts and plan - final_plan_redaction = PLAN_UPDATE_FINAL_PLAN_REDACTION.format(task=task, plan_update=plan_update) - final_facts_redaction = f"""Here is the updated list of the facts that I know: -``` -{facts_update} -```""" - self.logs.append(PlanningStep(plan=final_plan_redaction, 
facts=final_facts_redaction)) - self.logger.log( - Rule("[bold]Updated plan", style="orange"), - Text(final_plan_redaction), - level=LogLevel.INFO, + args = dict( + model=model, + tools=tools, + managed_agents=managed_agents, + name=agent_dict["name"], + description=agent_dict["description"], + max_steps=agent_dict["max_steps"], + planning_interval=agent_dict["planning_interval"], + grammar=agent_dict["grammar"], + verbosity_level=agent_dict["verbosity_level"], + ) + if cls.__name__ == "CodeAgent": + args["additional_authorized_imports"] = agent_dict["authorized_imports"] + args["use_e2b_executor"] = agent_dict["use_e2b_executor"] + args["max_print_outputs_length"] = agent_dict["max_print_outputs_length"] + args.update(kwargs) + return cls(**args) + + def push_to_hub( + self, + repo_id: str, + commit_message: str = "Upload agent", + private: Optional[bool] = None, + token: Optional[Union[bool, str]] = None, + create_pr: bool = False, + ) -> str: + """ + Upload the agent to the Hub. + + Parameters: + repo_id (`str`): + The name of the repository you want to push to. It should contain your organization name when + pushing to a given organization. + commit_message (`str`, *optional*, defaults to `"Upload agent"`): + Message to commit while pushing. + private (`bool`, *optional*, defaults to `None`): + Whether to make the repo private. If `None`, the repo will be public unless the organization's default is private. This value is ignored if the repo already exists. + token (`bool` or `str`, *optional*): + The token to use as HTTP bearer authorization for remote files. If unset, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + create_pr (`bool`, *optional*, defaults to `False`): + Whether to create a PR with the uploaded files or directly commit. 
+ """ + repo_url = create_repo( + repo_id=repo_id, + token=token, + private=private, + exist_ok=True, + repo_type="space", + space_sdk="gradio", + ) + repo_id = repo_url.repo_id + metadata_update( + repo_id, + {"tags": ["smolagents", "agent"]}, + repo_type="space", + token=token, + overwrite=True, + ) + + with tempfile.TemporaryDirectory() as work_dir: + self.save(work_dir) + logger.info(f"Uploading the following files to {repo_id}: {','.join(os.listdir(work_dir))}") + return upload_folder( + repo_id=repo_id, + commit_message=commit_message, + folder_path=work_dir, + token=token, + create_pr=create_pr, + repo_type="space", ) @@ -749,48 +1001,56 @@ class ToolCallingAgent(MultiStepAgent): Args: tools (`list[Tool]`): [`Tool`]s that the agent can use. model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions. - system_prompt (`str`, *optional*): System prompt that will be used to generate the agent's actions. + prompt_templates ([`~agents.PromptTemplates`], *optional*): Prompt templates. planning_interval (`int`, *optional*): Interval at which the agent will run a planning step. **kwargs: Additional keyword arguments. 
- """ def __init__( self, tools: List[Tool], model: Callable[[List[Dict[str, str]]], ChatMessage], - system_prompt: Optional[str] = None, + prompt_templates: Optional[PromptTemplates] = None, planning_interval: Optional[int] = None, **kwargs, ): - if system_prompt is None: - system_prompt = TOOL_CALLING_SYSTEM_PROMPT + prompt_templates = prompt_templates or yaml.safe_load( + importlib.resources.files("smolagents.prompts").joinpath("toolcalling_agent.yaml").read_text() + ) super().__init__( tools=tools, model=model, - system_prompt=system_prompt, + prompt_templates=prompt_templates, planning_interval=planning_interval, **kwargs, ) - def step(self, log_entry: ActionStep) -> Union[None, Any]: + def initialize_system_prompt(self) -> str: + system_prompt = populate_template( + self.prompt_templates["system_prompt"], + variables={"tools": self.tools, "managed_agents": self.managed_agents}, + ) + return system_prompt + + def step(self, memory_step: ActionStep) -> Union[None, Any]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. Returns None if the step is not final. """ - agent_memory = self.write_inner_memory_from_logs() + memory_messages = self.write_memory_to_messages() - self.input_messages = agent_memory + self.input_messages = memory_messages # Add new step in logs - log_entry.agent_memory = agent_memory.copy() + memory_step.model_input_messages = memory_messages.copy() try: - model_message = self.model( - self.input_messages, + model_message: ChatMessage = self.model( + memory_messages, tools_to_call_from=list(self.tools.values()), stop_sequences=["Observation:"], ) + memory_step.model_output_message = model_message if model_message.tool_calls is None or len(model_message.tool_calls) == 0: raise Exception("Model did not call any tools. 
Call `final_answer` tool to return a final answer.") tool_call = model_message.tool_calls[0] @@ -798,9 +1058,9 @@ def step(self, log_entry: ActionStep) -> Union[None, Any]: tool_arguments = tool_call.function.arguments except Exception as e: - raise AgentGenerationError(f"Error in generating tool call with model:\n{e}", self.logger) + raise AgentGenerationError(f"Error in generating tool call with model:\n{e}", self.logger) from e - log_entry.tool_calls = [ToolCall(name=tool_name, arguments=tool_arguments, id=tool_call_id)] + memory_step.tool_calls = [ToolCall(name=tool_name, arguments=tool_arguments, id=tool_call_id)] # Execute self.logger.log( @@ -830,7 +1090,7 @@ def step(self, log_entry: ActionStep) -> Union[None, Any]: level=LogLevel.INFO, ) - log_entry.action_output = final_answer + memory_step.action_output = final_answer return final_answer else: if tool_arguments is None: @@ -852,7 +1112,7 @@ def step(self, log_entry: ActionStep) -> Union[None, Any]: f"Observations: {updated_information.replace('[', '|')}", # escape potential rich-tag-like components level=LogLevel.INFO, ) - log_entry.observations = updated_information + memory_step.observations = updated_information return None @@ -863,7 +1123,7 @@ class CodeAgent(MultiStepAgent): Args: tools (`list[Tool]`): [`Tool`]s that the agent can use. model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions. - system_prompt (`str`, *optional*): System prompt that will be used to generate the agent's actions. + prompt_templates ([`~agents.PromptTemplates`], *optional*): Prompt templates. grammar (`dict[str, str]`, *optional*): Grammar used to parse the LLM output. additional_authorized_imports (`list[str]`, *optional*): Additional authorized imports for the agent. planning_interval (`int`, *optional*): Interval at which the agent will run a planning step. 
@@ -877,7 +1137,7 @@ def __init__( self, tools: List[Tool], model: Callable[[List[Dict[str, str]]], ChatMessage], - system_prompt: Optional[str] = None, + prompt_templates: Optional[PromptTemplates] = None, grammar: Optional[Dict[str, str]] = None, additional_authorized_imports: Optional[List[str]] = None, planning_interval: Optional[int] = None, @@ -885,17 +1145,17 @@ def __init__( max_print_outputs_length: Optional[int] = None, **kwargs, ): - if system_prompt is None: - system_prompt = CODE_SYSTEM_PROMPT - self.additional_authorized_imports = additional_authorized_imports if additional_authorized_imports else [] self.authorized_imports = list(set(BASE_BUILTIN_MODULES) | set(self.additional_authorized_imports)) - if "{{authorized_imports}}" not in system_prompt: - raise ValueError("Tag '{{authorized_imports}}' should be provided in the prompt.") + self.use_e2b_executor = use_e2b_executor + self.max_print_outputs_length = max_print_outputs_length + prompt_templates = prompt_templates or yaml.safe_load( + importlib.resources.files("smolagents.prompts").joinpath("code_agent.yaml").read_text() + ) super().__init__( tools=tools, model=model, - system_prompt=system_prompt, + prompt_templates=prompt_templates, grammar=grammar, planning_interval=planning_interval, **kwargs, @@ -925,88 +1185,68 @@ def __init__( max_print_outputs_length=max_print_outputs_length, ) - def initialize_system_prompt(self): - super().initialize_system_prompt() - self.system_prompt = self.system_prompt.replace( - "{{authorized_imports}}", - ( - "You can import from any package you want." - if "*" in self.authorized_imports - else str(self.authorized_imports) - ), + def initialize_system_prompt(self) -> str: + system_prompt = populate_template( + self.prompt_templates["system_prompt"], + variables={ + "tools": self.tools, + "managed_agents": self.managed_agents, + "authorized_imports": ( + "You can import from any package you want." 
+ if "*" in self.authorized_imports + else str(self.authorized_imports) + ), + }, ) - return self.system_prompt + return system_prompt - def step(self, log_entry: ActionStep) -> Union[None, Any]: + def step(self, memory_step: ActionStep) -> Union[None, Any]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. Returns None if the step is not final. """ - agent_memory = self.write_inner_memory_from_logs() + memory_messages = self.write_memory_to_messages() - self.input_messages = agent_memory.copy() + self.input_messages = memory_messages.copy() # Add new step in logs - log_entry.agent_memory = agent_memory.copy() + memory_step.model_input_messages = memory_messages.copy() try: additional_args = {"grammar": self.grammar} if self.grammar is not None else {} - llm_output = self.model( + chat_message: ChatMessage = self.model( self.input_messages, stop_sequences=["", "Observation:"], **additional_args, - ).content - log_entry.llm_output = llm_output + ) + memory_step.model_output_message = chat_message + model_output = chat_message.content + memory_step.model_output = model_output except Exception as e: raise AgentGenerationError(f"Error in generating model output:\n{e}", self.logger) from e - self.logger.log( - Group( - Rule( - "[italic]Output message of the LLM:", - align="left", - style="orange", - ), - Syntax( - llm_output, - lexer="markdown", - theme="github-dark", - word_wrap=True, - ), - ), + self.logger.log_markdown( + content=model_output, + title="Output message of the LLM:", level=LogLevel.DEBUG, ) # Parse try: - code_action = fix_final_answer_code(parse_code_blobs(llm_output)) + code_action = fix_final_answer_code(parse_code_blobs(model_output)) except Exception as e: error_msg = f"Error in code parsing:\n{e}\nMake sure to provide correct code blobs." 
raise AgentParsingError(error_msg, self.logger) - log_entry.tool_calls = [ + memory_step.tool_calls = [ ToolCall( name="python_interpreter", arguments=code_action, - id=f"call_{len(self.logs)}", + id=f"call_{len(self.memory.steps)}", ) ] # Execute - self.logger.log( - Panel( - Syntax( - code_action, - lexer="python", - theme="monokai", - word_wrap=True, - ), - title="[bold]Executing this code:", - title_align="left", - box=box.HORIZONTALS, - ), - level=LogLevel.INFO, - ) - observation = "" + self.logger.log_code(title="Executing parsed code:", content=code_action, level=LogLevel.INFO) is_final_answer = False try: output, execution_logs, is_final_answer = self.python_executor( @@ -1019,8 +1259,17 @@ def step(self, log_entry: ActionStep) -> Union[None, Any]: Text("Execution logs:", style="bold"), Text(execution_logs), ] - observation += "Execution logs:\n" + execution_logs + observation = "Execution logs:\n" + execution_logs except Exception as e: + if hasattr(self.python_executor, "state") and "_print_outputs" in self.python_executor.state: + execution_logs = str(self.python_executor.state["_print_outputs"]) + if len(execution_logs) > 0: + execution_outputs_console = [ + Text("Execution logs:", style="bold"), + Text(execution_logs), + ] + memory_step.observations = "Execution logs:\n" + execution_logs + self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO) error_msg = str(e) if "Import of " in error_msg and " is not allowed" in error_msg: self.logger.log( @@ -1031,7 +1280,7 @@ def step(self, log_entry: ActionStep) -> Union[None, Any]: truncated_output = truncate_content(str(output)) observation += "Last output from code snippet:\n" + truncated_output - log_entry.observations = observation + memory_step.observations = observation execution_outputs_console += [ Text( @@ -1040,68 +1289,5 @@ def step(self, log_entry: ActionStep) -> Union[None, Any]: ), ] self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO) - log_entry.action_output = 
output + memory_step.action_output = output return output if is_final_answer else None - - -class ManagedAgent: - """ - ManagedAgent class that manages an agent and provides additional prompting and run summaries. - - Args: - agent (`object`): The agent to be managed. - name (`str`): The name of the managed agent. - description (`str`): A description of the managed agent. - additional_prompting (`Optional[str]`, *optional*): Additional prompting for the managed agent. Defaults to None. - provide_run_summary (`bool`, *optional*): Whether to provide a run summary after the agent completes its task. Defaults to False. - managed_agent_prompt (`Optional[str]`, *optional*): Custom prompt for the managed agent. Defaults to None. - - """ - - def __init__( - self, - agent, - name, - description, - additional_prompting: Optional[str] = None, - provide_run_summary: bool = False, - managed_agent_prompt: Optional[str] = None, - ): - self.agent = agent - self.name = name - self.description = description - self.additional_prompting = additional_prompting - self.provide_run_summary = provide_run_summary - self.managed_agent_prompt = managed_agent_prompt if managed_agent_prompt else MANAGED_AGENT_PROMPT - - def write_full_task(self, task): - """Adds additional prompting for the managed agent, like 'add more detail in your answer'.""" - full_task = self.managed_agent_prompt.format(name=self.name, task=task) - if self.additional_prompting: - full_task = full_task.replace("\n{additional_prompting}", self.additional_prompting).strip() - else: - full_task = full_task.replace("\n{additional_prompting}", "").strip() - return full_task - - def __call__(self, request, **kwargs): - full_task = self.write_full_task(request) - output = self.agent.run(full_task, **kwargs) - if self.provide_run_summary: - answer = f"Here is the final answer from your managed agent '{self.name}':\n" - answer += str(output) - answer += f"\n\nFor more detail, find below a summary of this agent's work:\nSUMMARY OF 
WORK FROM AGENT '{self.name}':\n" - for message in self.agent.write_inner_memory_from_logs(summary_mode=True): - content = message["content"] - answer += "\n" + truncate_content(str(content)) + "\n---" - answer += f"\nEND OF SUMMARY OF WORK FROM AGENT '{self.name}'." - return answer - else: - return output - - -__all__ = [ - "ManagedAgent", - "MultiStepAgent", - "CodeAgent", - "ToolCallingAgent", -] diff --git a/src/smolagents/cli.py b/src/smolagents/cli.py new file mode 100644 index 000000000..bcf984532 --- /dev/null +++ b/src/smolagents/cli.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +# coding=utf-8 + +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os + +from dotenv import load_dotenv + +from smolagents import CodeAgent, HfApiModel, LiteLLMModel, Model, OpenAIServerModel, Tool, TransformersModel +from smolagents.default_tools import TOOL_MAPPING + + +leopard_prompt = "How many seconds would it take for a leopard at full speed to run through Pont des Arts?" 
+ + +def parse_arguments(description): + parser = argparse.ArgumentParser(description=description) + parser.add_argument( + "prompt", + type=str, + nargs="?", # Makes it optional + default=leopard_prompt, + help="The prompt to run with the agent", + ) + parser.add_argument( + "--model-type", + type=str, + default="HfApiModel", + help="The model type to use (e.g., HfApiModel, OpenAIServerModel, LiteLLMModel, TransformersModel)", + ) + parser.add_argument( + "--model-id", + type=str, + default="Qwen/Qwen2.5-Coder-32B-Instruct", + help="The model ID to use for the specified model type", + ) + parser.add_argument( + "--imports", + nargs="*", # accepts zero or more arguments + default=[], + help="Space-separated list of imports to authorize (e.g., 'numpy pandas')", + ) + parser.add_argument( + "--tools", + nargs="*", + default=["web_search"], + help="Space-separated list of tools that the agent can use (e.g., 'tool1 tool2 tool3')", + ) + parser.add_argument( + "--verbosity-level", + type=int, + default=1, + help="The verbosity level, as an int in [0, 1, 2].", + ) + group = parser.add_argument_group("api options", "Options for API-based model types") + group.add_argument( + "--api-base", + type=str, + help="The base URL for the model", + ) + group.add_argument( + "--api-key", + type=str, + help="The API key for the model", + ) + return parser.parse_args() + + +def load_model(model_type: str, model_id: str, api_base: str | None, api_key: str | None) -> Model: + if model_type == "OpenAIServerModel": + return OpenAIServerModel( + api_key=api_key or os.getenv("FIREWORKS_API_KEY"), + api_base=api_base or "https://api.fireworks.ai/inference/v1", + model_id=model_id, + ) + elif model_type == "LiteLLMModel": + return LiteLLMModel( + model_id=model_id, + api_key=api_key or os.getenv("OPENAI_API_KEY"), + api_base=api_base, + ) + elif model_type == "TransformersModel": + return TransformersModel(model_id=model_id, device_map="auto", flatten_messages_as_text=False) + elif model_type 
== "HfApiModel": + return HfApiModel( + token=api_key or os.getenv("HF_API_KEY"), + model_id=model_id, + ) + else: + raise ValueError(f"Unsupported model type: {model_type}") + + +def main(): + load_dotenv() + + args = parse_arguments(description="Run a CodeAgent with all specified parameters") + + model = load_model(args.model_type, args.model_id, args.api_base, args.api_key) + + available_tools = [] + for tool_name in args.tools: + if "/" in tool_name: + available_tools.append(Tool.from_space(tool_name)) + else: + if tool_name in TOOL_MAPPING: + available_tools.append(TOOL_MAPPING[tool_name]()) + else: + raise ValueError(f"Tool {tool_name} is not recognized either as a default tool or a Space.") + + print(f"Running agent with these tools: {args.tools}") + agent = CodeAgent(tools=available_tools, model=model, additional_authorized_imports=args.imports) + + agent.run(args.prompt) + + +if __name__ == "__main__": + main() diff --git a/src/smolagents/default_tools.py b/src/smolagents/default_tools.py index 3f3af93e7..2ea7834f6 100644 --- a/src/smolagents/default_tools.py +++ b/src/smolagents/default_tools.py @@ -24,7 +24,6 @@ evaluate_python_code, ) from .tools import PipelineTool, Tool -from .types import AgentAudio @dataclass @@ -76,7 +75,7 @@ def forward(self, code: str) -> str: authorized_imports=self.authorized_imports, )[0] # The second element is boolean is_final_answer ) - return f"Stdout:\n{state['print_outputs']}\nOutput: {output}" + return f"Stdout:\n{str(state['_print_outputs'])}\nOutput: {output}" class FinalAnswerTool(Tool): @@ -106,8 +105,8 @@ class DuckDuckGoSearchTool(Tool): inputs = {"query": {"type": "string", "description": "The search query to perform."}} output_type = "string" - def __init__(self, *args, max_results=10, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, max_results=10, **kwargs): + super().__init__() self.max_results = max_results try: from duckduckgo_search import DDGS @@ -115,7 +114,7 @@ def __init__(self, 
*args, max_results=10, **kwargs): raise ImportError( "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`." ) from e - self.ddgs = DDGS() + self.ddgs = DDGS(**kwargs) def forward(self, query: str) -> str: results = self.ddgs.text(query, max_results=self.max_results) @@ -138,48 +137,62 @@ class GoogleSearchTool(Tool): } output_type = "string" - def __init__(self): + def __init__(self, provider: str = "serpapi"): super().__init__(self) import os - self.serpapi_key = os.getenv("SERPAPI_API_KEY") + self.provider = provider + if provider == "serpapi": + self.organic_key = "organic_results" + api_key_env_name = "SERPAPI_API_KEY" + else: + self.organic_key = "organic" + api_key_env_name = "SERPER_API_KEY" + self.api_key = os.getenv(api_key_env_name) + if self.api_key is None: + raise ValueError(f"Missing API key. Make sure you have '{api_key_env_name}' in your env variables.") def forward(self, query: str, filter_year: Optional[int] = None) -> str: import requests - if self.serpapi_key is None: - raise ValueError("Missing SerpAPI key. 
Make sure you have 'SERPAPI_API_KEY' in your env variables.") - - params = { - "engine": "google", - "q": query, - "api_key": self.serpapi_key, - "google_domain": "google.com", - } + if self.provider == "serpapi": + params = { + "q": query, + "api_key": self.api_key, + "engine": "google", + "google_domain": "google.com", + } + base_url = "https://serpapi.com/search.json" + else: + params = { + "q": query, + "api_key": self.api_key, + } + base_url = "https://google.serper.dev/search" if filter_year is not None: params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}" - response = requests.get("https://serpapi.com/search.json", params=params) + response = requests.get(base_url, params=params) if response.status_code == 200: results = response.json() else: raise ValueError(response.json()) - if "organic_results" not in results.keys(): + if self.organic_key not in results.keys(): if filter_year is not None: raise Exception( - f"'organic_results' key not found for query: '{query}' with filtering on year={filter_year}. Use a less restrictive query or do not filter on year." + f"No results found for query: '{query}' with filtering on year={filter_year}. Use a less restrictive query or do not filter on year." ) else: - raise Exception(f"'organic_results' key not found for query: '{query}'. Use a less restrictive query.") - if len(results["organic_results"]) == 0: + raise Exception(f"No results found for query: '{query}'. Use a less restrictive query.") + if len(results[self.organic_key]) == 0: year_filter_message = f" with filter year={filter_year}" if filter_year is not None else "" return f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter." 
web_snippets = [] - if "organic_results" in results: - for idx, page in enumerate(results["organic_results"]): + if self.organic_key in results: + for idx, page in enumerate(results[self.organic_key]): date_published = "" if "date" in page: date_published = "\nDate published: " + page["date"] @@ -193,8 +206,6 @@ def forward(self, query: str, filter_year: Optional[int] = None) -> str: snippet = "\n" + page["snippet"] redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{snippet}" - - redacted_version = redacted_version.replace("Your browser can't play this video.", "") web_snippets.append(redacted_version) return "## Search Results\n" + "\n\n".join(web_snippets) @@ -257,19 +268,19 @@ class SpeechToTextTool(PipelineTool): } output_type = "string" - def __new__(cls): + def __new__(cls, *args, **kwargs): from transformers.models.whisper import ( WhisperForConditionalGeneration, WhisperProcessor, ) - if not hasattr(cls, "pre_processor_class"): - cls.pre_processor_class = WhisperProcessor - if not hasattr(cls, "model_class"): - cls.model_class = WhisperForConditionalGeneration - return super().__new__() + cls.pre_processor_class = WhisperProcessor + cls.model_class = WhisperForConditionalGeneration + return super().__new__(cls, *args, **kwargs) def encode(self, audio): + from .agent_types import AgentAudio + audio = AgentAudio(audio).to_raw() return self.pre_processor(audio, return_tensors="pt") diff --git a/src/smolagents/e2b_executor.py b/src/smolagents/e2b_executor.py index 404a8e26e..10b0170ee 100644 --- a/src/smolagents/e2b_executor.py +++ b/src/smolagents/e2b_executor.py @@ -16,6 +16,7 @@ # limitations under the License. 
import base64 import pickle +import re import textwrap from io import BytesIO from typing import Any, List, Tuple @@ -37,14 +38,19 @@ class E2BExecutor: def __init__(self, additional_imports: List[str], tools: List[Tool], logger): + self.logger = logger try: from e2b_code_interpreter import Sandbox except ModuleNotFoundError: raise ModuleNotFoundError( """Please install 'e2b' extra to use E2BExecutor: `pip install "smolagents[e2b]"`""" ) + self.logger = logger + self.logger.log("Initializing E2B executor, hold on...") self.custom_tools = {} + self.final_answer = False + self.final_answer_pattern = re.compile(r"final_answer\((.*?)\)") self.sbx = Sandbox() # "qywp2ctmu2q7jzprcf4j") # TODO: validate installing agents package or not # print("Installing agents package on remote executor...") @@ -53,7 +59,6 @@ def __init__(self, additional_imports: List[str], tools: List[Tool], logger): # timeout=300 # ) # print("Installation of agents package finished.") - self.logger = logger additional_imports = additional_imports + ["smolagents"] if len(additional_imports) > 0: execution = self.sbx.commands.run("pip install " + " ".join(additional_imports)) @@ -71,20 +76,24 @@ def __init__(self, additional_imports: List[str], tools: List[Tool], logger): tool_codes.append(tool_code) tool_definition_code = "\n".join([f"import {module}" for module in BASE_BUILTIN_MODULES]) - tool_definition_code += textwrap.dedent(""" + tool_definition_code += textwrap.dedent( + """ class Tool: def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) def forward(self, *args, **kwargs): pass # to be implemented in child class - """) + """ + ) tool_definition_code += "\n\n".join(tool_codes) tool_definition_execution = self.run_code_raise_errors(tool_definition_code) self.logger.log(tool_definition_execution.logs) def run_code_raise_errors(self, code: str): + if self.final_answer_pattern.search(code) is not None: + self.final_answer = True execution = self.sbx.run_code( code, ) @@ -122,7 
+131,7 @@ def __call__(self, code_action: str, additional_args: dict) -> Tuple[Any, Any]: execution = self.run_code_raise_errors(code_action) execution_logs = "\n".join([str(log) for log in execution.logs.stdout]) if not execution.results: - return None, execution_logs + return None, execution_logs, self.final_answer else: for result in execution.results: if result.is_main_result: @@ -130,7 +139,7 @@ def __call__(self, code_action: str, additional_args: dict) -> Tuple[Any, Any]: if getattr(result, attribute_name) is not None: image_output = getattr(result, attribute_name) decoded_bytes = base64.b64decode(image_output.encode("utf-8")) - return Image.open(BytesIO(decoded_bytes)), execution_logs + return Image.open(BytesIO(decoded_bytes)), execution_logs, self.final_answer for attribute_name in [ "chart", "data", @@ -144,8 +153,10 @@ def __call__(self, code_action: str, additional_args: dict) -> Tuple[Any, Any]: "text", ]: if getattr(result, attribute_name) is not None: - return getattr(result, attribute_name), execution_logs - raise ValueError("No main result returned by executor!") + return getattr(result, attribute_name), execution_logs, self.final_answer + if self.final_answer: + raise ValueError("No main result returned by executor!") + return None, execution_logs, False __all__ = ["E2BExecutor"] diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index 52f952b75..11094a52c 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -13,42 +13,113 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import mimetypes import os import re import shutil from typing import Optional -from .agents import ActionStep, AgentStepLog, MultiStepAgent -from .types import AgentAudio, AgentImage, AgentText, handle_agent_output_types -from .utils import _is_package_available +from smolagents.agent_types import AgentAudio, AgentImage, AgentText, handle_agent_output_types +from smolagents.agents import ActionStep, MultiStepAgent +from smolagents.memory import MemoryStep +from smolagents.utils import _is_package_available -def pull_messages_from_step(step_log: AgentStepLog): - """Extract ChatMessage objects from agent steps""" +def pull_messages_from_step( + step_log: MemoryStep, +): + """Extract ChatMessage objects from agent steps with proper nesting""" import gradio as gr if isinstance(step_log, ActionStep): - yield gr.ChatMessage(role="assistant", content=step_log.llm_output or "") - if step_log.tool_calls is not None: + # Output the step number + step_number = f"Step {step_log.step_number}" if step_log.step_number is not None else "" + yield gr.ChatMessage(role="assistant", content=f"**{step_number}**") + + # First yield the thought/reasoning from the LLM + if hasattr(step_log, "model_output") and step_log.model_output is not None: + # Clean up the LLM output + model_output = step_log.model_output.strip() + # Remove any trailing and extra backticks, handling multiple possible formats + model_output = re.sub(r"```\s*", "```", model_output) # handles ``` + model_output = re.sub(r"\s*```", "```", model_output) # handles ``` + model_output = re.sub(r"```\s*\n\s*", "```", model_output) # handles ```\n + model_output = model_output.strip() + yield gr.ChatMessage(role="assistant", content=model_output) + + # For tool calls, create a parent message + if hasattr(step_log, "tool_calls") and step_log.tool_calls is not None: first_tool_call = step_log.tool_calls[0] - used_code = first_tool_call.name == "code interpreter" - content = first_tool_call.arguments + used_code = 
first_tool_call.name == "python_interpreter" + parent_id = f"call_{len(step_log.tool_calls)}" + + # Tool call becomes the parent message with timing info + # First we will handle arguments based on type + args = first_tool_call.arguments + if isinstance(args, dict): + content = str(args.get("answer", str(args))) + else: + content = str(args).strip() + if used_code: - content = f"```py\n{content}\n```" - yield gr.ChatMessage( + # Clean up the content by removing any end code tags + content = re.sub(r"```.*?\n", "", content) # Remove existing code blocks + content = re.sub(r"\s*\s*", "", content) # Remove end_code tags + content = content.strip() + if not content.startswith("```python"): + content = f"```python\n{content}\n```" + + parent_message_tool = gr.ChatMessage( role="assistant", - metadata={"title": f"🛠️ Used tool {first_tool_call.name}"}, - content=str(content), + content=content, + metadata={ + "title": f"🛠️ Used tool {first_tool_call.name}", + "id": parent_id, + "status": "pending", + }, ) - if step_log.observations is not None: - yield gr.ChatMessage(role="assistant", content=step_log.observations) - if step_log.error is not None: - yield gr.ChatMessage( - role="assistant", - content=str(step_log.error), - metadata={"title": "💥 Error"}, + yield parent_message_tool + + # Nesting execution logs under the tool call if they exist + if hasattr(step_log, "observations") and ( + step_log.observations is not None and step_log.observations.strip() + ): # Only yield execution logs if there's actual content + log_content = step_log.observations.strip() + if log_content: + log_content = re.sub(r"^Execution logs:\s*", "", log_content) + yield gr.ChatMessage( + role="assistant", + content=f"{log_content}", + metadata={"title": "📝 Execution Logs", "parent_id": parent_id, "status": "done"}, + ) + + # Nesting any errors under the tool call + if hasattr(step_log, "error") and step_log.error is not None: + yield gr.ChatMessage( + role="assistant", + 
content=str(step_log.error), + metadata={"title": "💥 Error", "parent_id": parent_id, "status": "done"}, + ) + + # Update parent message metadata to done status without yielding a new message + parent_message_tool.metadata["status"] = "done" + + # Handle standalone errors but not from tool calls + elif hasattr(step_log, "error") and step_log.error is not None: + yield gr.ChatMessage(role="assistant", content=str(step_log.error), metadata={"title": "💥 Error"}) + + # Calculate duration and token information + step_footnote = f"{step_number}" + if hasattr(step_log, "input_token_count") and hasattr(step_log, "output_token_count"): + token_str = ( + f" | Input-tokens:{step_log.input_token_count:,} | Output-tokens:{step_log.output_token_count:,}" ) + step_footnote += token_str + if hasattr(step_log, "duration"): + step_duration = f" | Duration: {round(float(step_log.duration), 2)}" if step_log.duration else None + step_footnote += step_duration + step_footnote = f"""{step_footnote} """ + yield gr.ChatMessage(role="assistant", content=f"{step_footnote}") + yield gr.ChatMessage(role="assistant", content="-----") def stream_to_gradio( @@ -60,12 +131,25 @@ def stream_to_gradio( """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages.""" if not _is_package_available("gradio"): raise ModuleNotFoundError( - "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[audio]'`" + "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" ) import gradio as gr + total_input_tokens = 0 + total_output_tokens = 0 + for step_log in agent.run(task, stream=True, reset=reset_agent_memory, additional_args=additional_args): - for message in pull_messages_from_step(step_log): + # Track tokens if model provides them + if getattr(agent.model, "last_input_token_count", None) is not None: + total_input_tokens += agent.model.last_input_token_count + total_output_tokens += agent.model.last_output_token_count 
+ if isinstance(step_log, ActionStep): + step_log.input_token_count = agent.model.last_input_token_count + step_log.output_token_count = agent.model.last_output_token_count + + for message in pull_messages_from_step( + step_log, + ): yield message final_answer = step_log # Last log is the run's final_answer @@ -87,7 +171,7 @@ def stream_to_gradio( content={"path": final_answer.to_string(), "mime_type": "audio/wav"}, ) else: - yield gr.ChatMessage(role="assistant", content=str(final_answer)) + yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}") class GradioUI: @@ -96,7 +180,7 @@ class GradioUI: def __init__(self, agent: MultiStepAgent, file_upload_folder: str | None = None): if not _is_package_available("gradio"): raise ModuleNotFoundError( - "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[audio]'`" + "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" ) self.agent = agent self.file_upload_folder = file_upload_folder @@ -114,30 +198,20 @@ def interact_with_agent(self, prompt, messages): yield messages yield messages - def upload_file( - self, - file, - file_uploads_log, - allowed_file_types=[ - "application/pdf", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "text/plain", - ], - ): + def upload_file(self, file, file_uploads_log, allowed_file_types=None): """ Handle file uploads, default allowed types are .pdf, .docx, and .txt """ import gradio as gr if file is None: - return gr.Textbox("No file uploaded", visible=True), file_uploads_log + return gr.Textbox(value="No file uploaded", visible=True), file_uploads_log - try: - mime_type, _ = mimetypes.guess_type(file.name) - except Exception as e: - return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log + if allowed_file_types is None: + allowed_file_types = [".pdf", ".docx", ".txt"] - if mime_type not in allowed_file_types: + file_ext = os.path.splitext(file.name)[1].lower() + 
if file_ext not in allowed_file_types: return gr.Textbox("File type disallowed", visible=True), file_uploads_log # Sanitize file name @@ -146,16 +220,6 @@ def upload_file( r"[^\w\-.]", "_", original_name ) # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores - type_to_ext = {} - for ext, t in mimetypes.types_map.items(): - if t not in type_to_ext: - type_to_ext[t] = ext - - # Ensure the extension correlates to the mime type - sanitized_name = sanitized_name.split(".")[:-1] - sanitized_name.append("" + type_to_ext[mime_type]) - sanitized_name = "".join(sanitized_name) - # Save the uploaded file to the specified folder file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name)) shutil.copy(file.name, file_path) @@ -173,10 +237,10 @@ def log_user_message(self, text_input, file_uploads_log): "", ) - def launch(self): + def launch(self, share: bool = False, **kwargs): import gradio as gr - with gr.Blocks() as demo: + with gr.Blocks(fill_height=True) as demo: stored_messages = gr.State([]) file_uploads_log = gr.State([]) chatbot = gr.Chatbot( @@ -187,6 +251,7 @@ def launch(self): "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png", ), resizeable=True, + scale=1, ) # If an upload folder is provided, enable the upload feature if self.file_upload_folder is not None: @@ -204,7 +269,7 @@ def launch(self): [stored_messages, text_input], ).then(self.interact_with_agent, [stored_messages, chatbot], [chatbot]) - demo.launch() + demo.launch(debug=True, share=share, **kwargs) __all__ = ["stream_to_gradio", "GradioUI"] diff --git a/src/smolagents/local_python_executor.py b/src/smolagents/local_python_executor.py index a4f046dca..a48e1e11e 100644 --- a/src/smolagents/local_python_executor.py +++ b/src/smolagents/local_python_executor.py @@ -18,6 +18,7 @@ import builtins import difflib import inspect +import logging import math import re from collections.abc import Mapping @@ 
-31,6 +32,9 @@ from .utils import BASE_BUILTIN_MODULES, truncate_content +logger = logging.getLogger(__name__) + + class InterpreterError(ValueError): """ An error raised when the interpreter cannot evaluate a Python expression, due to syntax error or unsupported @@ -46,8 +50,9 @@ class InterpreterError(ValueError): if isinstance(getattr(builtins, name), type) and issubclass(getattr(builtins, name), BaseException) } -PRINT_OUTPUTS, DEFAULT_MAX_LEN_OUTPUT = "", 50000 -OPERATIONS_COUNT, MAX_OPERATIONS = 0, 10000000 +DEFAULT_MAX_LEN_OUTPUT = 50000 +MAX_OPERATIONS = 10000000 +MAX_WHILE_ITERATIONS = 1000000 def custom_print(*args): @@ -80,7 +85,7 @@ def custom_print(*args): "atan2": math.atan2, "degrees": math.degrees, "radians": math.radians, - "pow": math.pow, + "pow": pow, "sqrt": math.sqrt, "len": len, "sum": sum, @@ -109,6 +114,52 @@ def custom_print(*args): "complex": complex, } +DANGEROUS_PATTERNS = ( + "_os", + "os", + "subprocess", + "_subprocess", + "pty", + "system", + "popen", + "spawn", + "shutil", + "sys", + "pathlib", + "io", + "socket", + "compile", + "eval", + "exec", + "multiprocessing", +) + + +class PrintContainer: + def __init__(self): + self.value = "" + + def append(self, text): + self.value += text + return self + + def __iadd__(self, other): + """Implements the += operator""" + self.value += str(other) + return self + + def __str__(self): + """String representation""" + return self.value + + def __repr__(self): + """Representation for debugging""" + return f"PrintContainer({self.value})" + + def __len__(self): + """Implements len() function support""" + return len(self.value) + class BreakException(Exception): pass @@ -211,7 +262,6 @@ def evaluate_while( custom_tools: Dict[str, Callable], authorized_imports: List[str], ) -> None: - max_iterations = 1000 iterations = 0 while evaluate_ast(while_loop.test, state, static_tools, custom_tools, authorized_imports): for node in while_loop.body: @@ -222,8 +272,8 @@ def evaluate_while( except 
ContinueException: break iterations += 1 - if iterations > max_iterations: - raise InterpreterError(f"Maximum number of {max_iterations} iterations in While loop exceeded") + if iterations > MAX_WHILE_ITERATIONS: + raise InterpreterError(f"Maximum number of {MAX_WHILE_ITERATIONS} iterations in While loop exceeded") return None @@ -599,10 +649,7 @@ def evaluate_call( raise InterpreterError("super() takes at most 2 arguments") else: if func_name == "print": - output = " ".join(map(str, args)) - global PRINT_OUTPUTS - PRINT_OUTPUTS += output + "\n" - # cap the number of lines + state["_print_outputs"] += " ".join(map(str, args)) + "\n" return None else: # Assume it's a callable object if ( @@ -686,47 +733,40 @@ def evaluate_condition( static_tools: Dict[str, Callable], custom_tools: Dict[str, Callable], authorized_imports: List[str], -) -> bool: - left = evaluate_ast(condition.left, state, static_tools, custom_tools, authorized_imports) - comparators = [ - evaluate_ast(c, state, static_tools, custom_tools, authorized_imports) for c in condition.comparators - ] - ops = [type(op) for op in condition.ops] - +) -> bool | object: result = True - current_left = left - - for op, comparator in zip(ops, comparators): + left = evaluate_ast(condition.left, state, static_tools, custom_tools, authorized_imports) + for i, (op, comparator) in enumerate(zip(condition.ops, condition.comparators)): + op = type(op) + right = evaluate_ast(comparator, state, static_tools, custom_tools, authorized_imports) if op == ast.Eq: - current_result = current_left == comparator + current_result = left == right elif op == ast.NotEq: - current_result = current_left != comparator + current_result = left != right elif op == ast.Lt: - current_result = current_left < comparator + current_result = left < right elif op == ast.LtE: - current_result = current_left <= comparator + current_result = left <= right elif op == ast.Gt: - current_result = current_left > comparator + current_result = left > right elif 
op == ast.GtE: - current_result = current_left >= comparator + current_result = left >= right elif op == ast.Is: - current_result = current_left is comparator + current_result = left is right elif op == ast.IsNot: - current_result = current_left is not comparator + current_result = left is not right elif op == ast.In: - current_result = current_left in comparator + current_result = left in right elif op == ast.NotIn: - current_result = current_left not in comparator + current_result = left not in right else: - raise InterpreterError(f"Operator not supported: {op}") - - result = result & current_result - current_left = comparator + raise InterpreterError(f"Unsupported comparison operator: {op}") - if isinstance(result, bool) and not result: - break - - return result if isinstance(result, (bool, pd.Series)) else result.all() + if current_result is False: + return False + result = current_result if i == 0 else (result and current_result) + left = right + return result def evaluate_if( @@ -934,86 +974,80 @@ def evaluate_with( context.__exit__(None, None, None) -def get_safe_module(unsafe_module, dangerous_patterns, visited=None): +def get_safe_module(raw_module, authorized_imports, visited=None): """Creates a safe copy of a module or returns the original if it's a function""" # If it's a function or non-module object, return it directly - if not isinstance(unsafe_module, ModuleType): - return unsafe_module + if not isinstance(raw_module, ModuleType): + return raw_module # Handle circular references: Initialize visited set for the first call if visited is None: visited = set() - module_id = id(unsafe_module) + module_id = id(raw_module) if module_id in visited: - return unsafe_module # Return original for circular refs + return raw_module # Return original for circular refs visited.add(module_id) # Create new module for actual modules - safe_module = ModuleType(unsafe_module.__name__) + safe_module = ModuleType(raw_module.__name__) # Copy all attributes by reference, 
recursively checking modules - for attr_name in dir(unsafe_module): + for attr_name in dir(raw_module): # Skip dangerous patterns at any level - if any(pattern in f"{unsafe_module.__name__}.{attr_name}" for pattern in dangerous_patterns): + if any( + pattern in raw_module.__name__.split(".") + [attr_name] + and not check_module_authorized(pattern, authorized_imports) + for pattern in DANGEROUS_PATTERNS + ): + logger.info(f"Skipping dangerous attribute {raw_module.__name__}.{attr_name}") continue - attr_value = getattr(unsafe_module, attr_name) - + try: + attr_value = getattr(raw_module, attr_name) + except ImportError as e: + # lazy / dynamic loading module -> INFO log and skip + logger.info( + f"Skipping import error while copying {raw_module.__name__}.{attr_name}: {type(e).__name__} - {e}" + ) + continue # Recursively process nested modules, passing visited set if isinstance(attr_value, ModuleType): - attr_value = get_safe_module(attr_value, dangerous_patterns, visited=visited) + attr_value = get_safe_module(attr_value, authorized_imports, visited=visited) setattr(safe_module, attr_name, attr_value) return safe_module -def import_modules(expression, state, authorized_imports): - dangerous_patterns = ( - "_os", - "os", - "subprocess", - "_subprocess", - "pty", - "system", - "popen", - "spawn", - "shutil", - "sys", - "pathlib", - "io", - "socket", - "compile", - "eval", - "exec", - "multiprocessing", - ) +def check_module_authorized(module_name, authorized_imports): + if "*" in authorized_imports: + return True + else: + module_path = module_name.split(".") + if any([module in DANGEROUS_PATTERNS and module not in authorized_imports for module in module_path]): + return False + # ["A", "B", "C"] -> ["A", "A.B", "A.B.C"] + module_subpaths = [".".join(module_path[:i]) for i in range(1, len(module_path) + 1)] + return any(subpath in authorized_imports for subpath in module_subpaths) - def check_module_authorized(module_name): - if "*" in authorized_imports: - return 
True - else: - module_path = module_name.split(".") - if any([module in dangerous_patterns for module in module_path]): - return False - module_subpaths = [".".join(module_path[:i]) for i in range(1, len(module_path) + 1)] - return any(subpath in authorized_imports for subpath in module_subpaths) +def import_modules(expression, state, authorized_imports): if isinstance(expression, ast.Import): for alias in expression.names: - if check_module_authorized(alias.name): + if check_module_authorized(alias.name, authorized_imports): raw_module = import_module(alias.name) - state[alias.asname or alias.name] = get_safe_module(raw_module, dangerous_patterns) + state[alias.asname or alias.name] = get_safe_module(raw_module, authorized_imports) else: raise InterpreterError( f"Import of {alias.name} is not allowed. Authorized imports are: {str(authorized_imports)}" ) return None elif isinstance(expression, ast.ImportFrom): - if check_module_authorized(expression.module): - module = __import__(expression.module, fromlist=[alias.name for alias in expression.names]) + if check_module_authorized(expression.module, authorized_imports): + raw_module = __import__(expression.module, fromlist=[alias.name for alias in expression.names]) + module = get_safe_module(raw_module, authorized_imports) if expression.names[0].name == "*": # Handle "from module import *" if hasattr(module, "__all__"): # If module has __all__, import only those names for name in module.__all__: @@ -1029,7 +1063,9 @@ def check_module_authorized(module_name): else: raise InterpreterError(f"Module {expression.module} has no attribute {alias.name}") else: - raise InterpreterError(f"Import from {expression.module} is not allowed.") + raise InterpreterError( + f"Import from {expression.module} is not allowed. 
Authorized imports are: {str(authorized_imports)}" + ) return None @@ -1075,6 +1111,42 @@ def evaluate_dictcomp( return result +def evaluate_delete( + delete_node: ast.Delete, + state: Dict[str, Any], + static_tools: Dict[str, Callable], + custom_tools: Dict[str, Callable], + authorized_imports: List[str], +) -> None: + """ + Evaluate a delete statement (del x, del x[y]). + + Args: + delete_node: The AST Delete node to evaluate + state: The current state dictionary + static_tools: Dictionary of static tools + custom_tools: Dictionary of custom tools + authorized_imports: List of authorized imports + """ + for target in delete_node.targets: + if isinstance(target, ast.Name): + # Handle simple variable deletion (del x) + if target.id in state: + del state[target.id] + else: + raise InterpreterError(f"Cannot delete name '{target.id}': name is not defined") + elif isinstance(target, ast.Subscript): + # Handle index/key deletion (del x[y]) + obj = evaluate_ast(target.value, state, static_tools, custom_tools, authorized_imports) + index = evaluate_ast(target.slice, state, static_tools, custom_tools, authorized_imports) + try: + del obj[index] + except (TypeError, KeyError, IndexError) as e: + raise InterpreterError(f"Cannot delete index/key: {str(e)}") + else: + raise InterpreterError(f"Deletion of {type(target).__name__} targets is not supported") + + def evaluate_ast( expression: ast.AST, state: Dict[str, Any], @@ -1102,130 +1174,124 @@ def evaluate_ast( The list of modules that can be imported by the code. By default, only a few safe modules are allowed. If it contains "*", it will authorize any import. Use this at your own risk! """ - global OPERATIONS_COUNT - if OPERATIONS_COUNT >= MAX_OPERATIONS: + if state.setdefault("_operations_count", 0) >= MAX_OPERATIONS: raise InterpreterError( f"Reached the max number of operations of {MAX_OPERATIONS}. Maybe there is an infinite loop somewhere in the code, or you're just asking too many calculations." 
) - OPERATIONS_COUNT += 1 + state["_operations_count"] += 1 + common_params = (state, static_tools, custom_tools, authorized_imports) if isinstance(expression, ast.Assign): # Assignment -> we evaluate the assignment which should update the state # We return the variable assigned as it may be used to determine the final result. - return evaluate_assign(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_assign(expression, *common_params) elif isinstance(expression, ast.AugAssign): - return evaluate_augassign(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_augassign(expression, *common_params) elif isinstance(expression, ast.Call): # Function call -> we return the value of the function call - return evaluate_call(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_call(expression, *common_params) elif isinstance(expression, ast.Constant): # Constant -> just return the value return expression.value elif isinstance(expression, ast.Tuple): - return tuple( - evaluate_ast(elt, state, static_tools, custom_tools, authorized_imports) for elt in expression.elts - ) + return tuple((evaluate_ast(elt, *common_params) for elt in expression.elts)) elif isinstance(expression, (ast.ListComp, ast.GeneratorExp)): - return evaluate_listcomp(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_listcomp(expression, *common_params) elif isinstance(expression, ast.UnaryOp): - return evaluate_unaryop(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_unaryop(expression, *common_params) elif isinstance(expression, ast.Starred): - return evaluate_ast(expression.value, state, static_tools, custom_tools, authorized_imports) + return evaluate_ast(expression.value, *common_params) elif isinstance(expression, ast.BoolOp): # Boolean operation -> evaluate the operation - return evaluate_boolop(expression, state, static_tools, 
custom_tools, authorized_imports) + return evaluate_boolop(expression, *common_params) elif isinstance(expression, ast.Break): raise BreakException() elif isinstance(expression, ast.Continue): raise ContinueException() elif isinstance(expression, ast.BinOp): # Binary operation -> execute operation - return evaluate_binop(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_binop(expression, *common_params) elif isinstance(expression, ast.Compare): # Comparison -> evaluate the comparison - return evaluate_condition(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_condition(expression, *common_params) elif isinstance(expression, ast.Lambda): - return evaluate_lambda(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_lambda(expression, *common_params) elif isinstance(expression, ast.FunctionDef): - return evaluate_function_def(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_function_def(expression, *common_params) elif isinstance(expression, ast.Dict): # Dict -> evaluate all keys and values - keys = [evaluate_ast(k, state, static_tools, custom_tools, authorized_imports) for k in expression.keys] - values = [evaluate_ast(v, state, static_tools, custom_tools, authorized_imports) for v in expression.values] + keys = (evaluate_ast(k, *common_params) for k in expression.keys) + values = (evaluate_ast(v, *common_params) for v in expression.values) return dict(zip(keys, values)) elif isinstance(expression, ast.Expr): # Expression -> evaluate the content - return evaluate_ast(expression.value, state, static_tools, custom_tools, authorized_imports) + return evaluate_ast(expression.value, *common_params) elif isinstance(expression, ast.For): # For loop -> execute the loop - return evaluate_for(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_for(expression, *common_params) elif isinstance(expression, 
ast.FormattedValue): - # Formatted value (part of f-string) -> evaluate the content and return - return evaluate_ast(expression.value, state, static_tools, custom_tools, authorized_imports) + # Formatted value (part of f-string) -> evaluate the content and format it + value = evaluate_ast(expression.value, *common_params) + # Early return if no format spec + if not expression.format_spec: + return value + # Apply format specification + format_spec = evaluate_ast(expression.format_spec, *common_params) + return format(value, format_spec) elif isinstance(expression, ast.If): # If -> execute the right branch - return evaluate_if(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_if(expression, *common_params) elif hasattr(ast, "Index") and isinstance(expression, ast.Index): - return evaluate_ast(expression.value, state, static_tools, custom_tools, authorized_imports) + return evaluate_ast(expression.value, *common_params) elif isinstance(expression, ast.JoinedStr): - return "".join( - [str(evaluate_ast(v, state, static_tools, custom_tools, authorized_imports)) for v in expression.values] - ) + return "".join([str(evaluate_ast(v, *common_params)) for v in expression.values]) elif isinstance(expression, ast.List): # List -> evaluate all elements - return [evaluate_ast(elt, state, static_tools, custom_tools, authorized_imports) for elt in expression.elts] + return [evaluate_ast(elt, *common_params) for elt in expression.elts] elif isinstance(expression, ast.Name): # Name -> pick up the value in the state - return evaluate_name(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_name(expression, *common_params) elif isinstance(expression, ast.Subscript): # Subscript -> return the value of the indexing - return evaluate_subscript(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_subscript(expression, *common_params) elif isinstance(expression, ast.IfExp): - test_val = 
evaluate_ast(expression.test, state, static_tools, custom_tools, authorized_imports) + test_val = evaluate_ast(expression.test, *common_params) if test_val: - return evaluate_ast(expression.body, state, static_tools, custom_tools, authorized_imports) + return evaluate_ast(expression.body, *common_params) else: - return evaluate_ast(expression.orelse, state, static_tools, custom_tools, authorized_imports) + return evaluate_ast(expression.orelse, *common_params) elif isinstance(expression, ast.Attribute): - value = evaluate_ast(expression.value, state, static_tools, custom_tools, authorized_imports) + value = evaluate_ast(expression.value, *common_params) return getattr(value, expression.attr) elif isinstance(expression, ast.Slice): return slice( - evaluate_ast(expression.lower, state, static_tools, custom_tools, authorized_imports) - if expression.lower is not None - else None, - evaluate_ast(expression.upper, state, static_tools, custom_tools, authorized_imports) - if expression.upper is not None - else None, - evaluate_ast(expression.step, state, static_tools, custom_tools, authorized_imports) - if expression.step is not None - else None, + evaluate_ast(expression.lower, *common_params) if expression.lower is not None else None, + evaluate_ast(expression.upper, *common_params) if expression.upper is not None else None, + evaluate_ast(expression.step, *common_params) if expression.step is not None else None, ) elif isinstance(expression, ast.DictComp): - return evaluate_dictcomp(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_dictcomp(expression, *common_params) elif isinstance(expression, ast.While): - return evaluate_while(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_while(expression, *common_params) elif isinstance(expression, (ast.Import, ast.ImportFrom)): return import_modules(expression, state, authorized_imports) elif isinstance(expression, ast.ClassDef): - return 
evaluate_class_def(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_class_def(expression, *common_params) elif isinstance(expression, ast.Try): - return evaluate_try(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_try(expression, *common_params) elif isinstance(expression, ast.Raise): - return evaluate_raise(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_raise(expression, *common_params) elif isinstance(expression, ast.Assert): - return evaluate_assert(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_assert(expression, *common_params) elif isinstance(expression, ast.With): - return evaluate_with(expression, state, static_tools, custom_tools, authorized_imports) + return evaluate_with(expression, *common_params) elif isinstance(expression, ast.Set): - return {evaluate_ast(elt, state, static_tools, custom_tools, authorized_imports) for elt in expression.elts} + return set((evaluate_ast(elt, *common_params) for elt in expression.elts)) elif isinstance(expression, ast.Return): - raise ReturnException( - evaluate_ast(expression.value, state, static_tools, custom_tools, authorized_imports) - if expression.value - else None - ) + raise ReturnException(evaluate_ast(expression.value, *common_params) if expression.value else None) elif isinstance(expression, ast.Pass): return None + elif isinstance(expression, ast.Delete): + return evaluate_delete(expression, *common_params) else: # For now we refuse anything else. Let's add things as we need them. raise InterpreterError(f"{expression.__class__.__name__} is not supported.") @@ -1262,13 +1328,13 @@ def evaluate_python_code( state (`Dict[str, Any]`): A dictionary mapping variable names to values. The `state` should contain the initial inputs but will be updated by this function to contain all variables as they are evaluated. 
- The print outputs will be stored in the state under the key 'print_outputs'. + The print outputs will be stored in the state under the key "_print_outputs". """ try: expression = ast.parse(code) except SyntaxError as e: raise InterpreterError( - f"Code execution failed on line {e.lineno} due to: {type(e).__name__}\n" + f"Code parsing failed on line {e.lineno} due to: {type(e).__name__}\n" f"{e.text}" f"{' ' * (e.offset or 0)}^\n" f"Error: {str(e)}" @@ -1279,10 +1345,7 @@ def evaluate_python_code( static_tools = static_tools.copy() if static_tools is not None else {} custom_tools = custom_tools if custom_tools is not None else {} result = None - global PRINT_OUTPUTS - PRINT_OUTPUTS = "" - global OPERATIONS_COUNT - OPERATIONS_COUNT = 0 + state["_print_outputs"] = PrintContainer() def final_answer(value): raise FinalAnswerException(value) @@ -1292,20 +1355,24 @@ def final_answer(value): try: for node in expression.body: result = evaluate_ast(node, state, static_tools, custom_tools, authorized_imports) - state["print_outputs"] = truncate_content(PRINT_OUTPUTS, max_length=max_print_outputs_length) + state["_print_outputs"].value = truncate_content( + str(state["_print_outputs"]), max_length=max_print_outputs_length + ) is_final_answer = False return result, is_final_answer except FinalAnswerException as e: - state["print_outputs"] = truncate_content(PRINT_OUTPUTS, max_length=max_print_outputs_length) + state["_print_outputs"].value = truncate_content( + str(state["_print_outputs"]), max_length=max_print_outputs_length + ) is_final_answer = True return e.value, is_final_answer except Exception as e: - exception_type = type(e).__name__ - error_msg = truncate_content(PRINT_OUTPUTS, max_length=max_print_outputs_length) - error_msg = ( - f"Code execution failed at line '{ast.get_source_segment(code, node)}' due to: {exception_type}:{str(e)}" + state["_print_outputs"].value = truncate_content( + str(state["_print_outputs"]), max_length=max_print_outputs_length + ) + raise 
InterpreterError( + f"Code execution failed at line '{ast.get_source_segment(code, node)}' due to: {type(e).__name__}: {e}" ) - raise InterpreterError(error_msg) class LocalPythonInterpreter: @@ -1339,7 +1406,7 @@ def __call__(self, code_action: str, additional_variables: Dict) -> Tuple[Any, s authorized_imports=self.authorized_imports, max_print_outputs_length=self.max_print_outputs_length, ) - logs = self.state["print_outputs"] + logs = str(self.state["_print_outputs"]) return output, logs, is_final_answer diff --git a/src/smolagents/memory.py b/src/smolagents/memory.py new file mode 100644 index 000000000..5875db596 --- /dev/null +++ b/src/smolagents/memory.py @@ -0,0 +1,234 @@ +from dataclasses import asdict, dataclass +from logging import getLogger +from typing import TYPE_CHECKING, Any, Dict, List, TypedDict, Union + +from smolagents.models import ChatMessage, MessageRole +from smolagents.monitoring import AgentLogger, LogLevel +from smolagents.utils import AgentError, make_json_serializable + + +if TYPE_CHECKING: + from smolagents.models import ChatMessage + from smolagents.monitoring import AgentLogger + + +logger = getLogger(__name__) + + +class Message(TypedDict): + role: MessageRole + content: str | list[dict] + + +@dataclass +class ToolCall: + name: str + arguments: Any + id: str + + def dict(self): + return { + "id": self.id, + "type": "function", + "function": { + "name": self.name, + "arguments": make_json_serializable(self.arguments), + }, + } + + +@dataclass +class MemoryStep: + def dict(self): + return asdict(self) + + def to_messages(self, **kwargs) -> List[Dict[str, Any]]: + raise NotImplementedError + + +@dataclass +class ActionStep(MemoryStep): + model_input_messages: List[Message] | None = None + tool_calls: List[ToolCall] | None = None + start_time: float | None = None + end_time: float | None = None + step_number: int | None = None + error: AgentError | None = None + duration: float | None = None + model_output_message: ChatMessage = None + 
model_output: str | None = None + observations: str | None = None + observations_images: List[str] | None = None + action_output: Any = None + + def dict(self): + # We overwrite the method to parse the tool_calls and action_output manually + return { + "model_input_messages": self.model_input_messages, + "tool_calls": [tc.dict() for tc in self.tool_calls] if self.tool_calls else [], + "start_time": self.start_time, + "end_time": self.end_time, + "step": self.step_number, + "error": self.error.dict() if self.error else None, + "duration": self.duration, + "model_output_message": self.model_output_message, + "model_output": self.model_output, + "observations": self.observations, + "action_output": make_json_serializable(self.action_output), + } + + def to_messages(self, summary_mode: bool = False, show_model_input_messages: bool = False) -> List[Message]: + messages = [] + if self.model_input_messages is not None and show_model_input_messages: + messages.append(Message(role=MessageRole.SYSTEM, content=self.model_input_messages)) + if self.model_output is not None and not summary_mode: + messages.append( + Message(role=MessageRole.ASSISTANT, content=[{"type": "text", "text": self.model_output.strip()}]) + ) + + if self.tool_calls is not None: + messages.append( + Message( + role=MessageRole.ASSISTANT, + content=[ + { + "type": "text", + "text": "Calling tools:\n" + str([tc.dict() for tc in self.tool_calls]), + } + ], + ) + ) + + if self.observations is not None: + messages.append( + Message( + role=MessageRole.TOOL_RESPONSE, + content=[ + { + "type": "text", + "text": f"Call id: {self.tool_calls[0].id}\nObservation:\n{self.observations}", + } + ], + ) + ) + if self.error is not None: + error_message = ( + "Error:\n" + + str(self.error) + + "\nNow let's retry: take care not to repeat previous errors! 
If you have retried several times, try a completely different approach.\n" + ) + message_content = f"Call id: {self.tool_calls[0].id}\n" if self.tool_calls else "" + message_content += error_message + messages.append( + Message(role=MessageRole.TOOL_RESPONSE, content=[{"type": "text", "text": message_content}]) + ) + + if self.observations_images: + messages.append( + Message( + role=MessageRole.USER, + content=[{"type": "text", "text": "Here are the observed images:"}] + + [ + { + "type": "image", + "image": image, + } + for image in self.observations_images + ], + ) + ) + return messages + + +@dataclass +class PlanningStep(MemoryStep): + model_input_messages: List[Message] + model_output_message_facts: ChatMessage + facts: str + model_output_message_plan: ChatMessage + plan: str + + def to_messages(self, summary_mode: bool, **kwargs) -> List[Message]: + messages = [] + messages.append( + Message( + role=MessageRole.ASSISTANT, content=[{"type": "text", "text": f"[FACTS LIST]:\n{self.facts.strip()}"}] + ) + ) + + if not summary_mode: # This step is not shown to a model writing a plan to avoid influencing the new plan + messages.append( + Message( + role=MessageRole.ASSISTANT, content=[{"type": "text", "text": f"[PLAN]:\n{self.plan.strip()}"}] + ) + ) + return messages + + +@dataclass +class TaskStep(MemoryStep): + task: str + task_images: List[str] | None = None + + def to_messages(self, summary_mode: bool = False, **kwargs) -> List[Message]: + content = [{"type": "text", "text": f"New task:\n{self.task}"}] + if self.task_images: + for image in self.task_images: + content.append({"type": "image", "image": image}) + + return [Message(role=MessageRole.USER, content=content)] + + +@dataclass +class SystemPromptStep(MemoryStep): + system_prompt: str + + def to_messages(self, summary_mode: bool = False, **kwargs) -> List[Message]: + if summary_mode: + return [] + return [Message(role=MessageRole.SYSTEM, content=[{"type": "text", "text": self.system_prompt}])] + + +class 
AgentMemory: + def __init__(self, system_prompt: str): + self.system_prompt = SystemPromptStep(system_prompt=system_prompt) + self.steps: List[Union[TaskStep, ActionStep, PlanningStep]] = [] + + def reset(self): + self.steps = [] + + def get_succinct_steps(self) -> list[dict]: + return [ + {key: value for key, value in step.dict().items() if key != "model_input_messages"} for step in self.steps + ] + + def get_full_steps(self) -> list[dict]: + return [step.dict() for step in self.steps] + + def replay(self, logger: AgentLogger, detailed: bool = False): + """Prints a pretty replay of the agent's steps. + + Args: + logger (AgentLogger): The logger to print replay logs to. + detailed (bool, optional): If True, also displays the memory at each step. Defaults to False. + Careful: will increase log length exponentially. Use only for debugging. + """ + logger.console.log("Replaying the agent's steps:") + for step in self.steps: + if isinstance(step, SystemPromptStep) and detailed: + logger.log_markdown(title="System prompt", content=step.system_prompt, level=LogLevel.ERROR) + elif isinstance(step, TaskStep): + logger.log_task(step.task, "", level=LogLevel.ERROR) + elif isinstance(step, ActionStep): + logger.log_rule(f"Step {step.step_number}", level=LogLevel.ERROR) + if detailed: + logger.log_messages(step.model_input_messages) + logger.log_markdown(title="Agent output:", content=step.model_output, level=LogLevel.ERROR) + elif isinstance(step, PlanningStep): + logger.log_rule("Planning step", level=LogLevel.ERROR) + if detailed: + logger.log_messages(step.model_input_messages, level=LogLevel.ERROR) + logger.log_markdown(title="Agent output:", content=step.facts + "\n" + step.plan, level=LogLevel.ERROR) + + +__all__ = ["AgentMemory"] diff --git a/src/smolagents/models.py b/src/smolagents/models.py index eb613dffc..2a586edfe 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -18,19 +18,15 @@ import logging import os import random +import uuid from copy 
import deepcopy from dataclasses import asdict, dataclass from enum import Enum from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from huggingface_hub import InferenceClient +from huggingface_hub.utils import is_torch_available from PIL import Image -from transformers import ( - AutoModelForImageTextToText, - AutoProcessor, - StoppingCriteriaList, - is_torch_available, -) from .tools import Tool from .utils import _is_package_available, encode_image_base64, make_image_url @@ -52,10 +48,10 @@ } -def get_dict_from_nested_dataclasses(obj): +def get_dict_from_nested_dataclasses(obj, ignore_key=None): def convert(obj): if hasattr(obj, "__dataclass_fields__"): - return {k: convert(v) for k, v in asdict(obj).items()} + return {k: convert(v) for k, v in asdict(obj).items() if k != ignore_key} return obj return convert(obj) @@ -96,16 +92,17 @@ class ChatMessage: role: str content: Optional[str] = None tool_calls: Optional[List[ChatMessageToolCall]] = None + raw: Optional[Any] = None # Stores the raw output from the API def model_dump_json(self): - return json.dumps(get_dict_from_nested_dataclasses(self)) + return json.dumps(get_dict_from_nested_dataclasses(self, ignore_key="raw")) @classmethod - def from_hf_api(cls, message) -> "ChatMessage": + def from_hf_api(cls, message, raw) -> "ChatMessage": tool_calls = None if getattr(message, "tool_calls", None) is not None: tool_calls = [ChatMessageToolCall.from_hf_api(tool_call) for tool_call in message.tool_calls] - return cls(role=message.role, content=message.content, tool_calls=tool_calls) + return cls(role=message.role, content=message.content, tool_calls=tool_calls, raw=raw) @classmethod def from_dict(cls, data: dict) -> "ChatMessage": @@ -119,6 +116,9 @@ def from_dict(cls, data: dict) -> "ChatMessage": data["tool_calls"] = tool_calls return cls(**data) + def dict(self): + return json.dumps(get_dict_from_nested_dataclasses(self)) + def parse_json_if_needed(arguments: Union[str, dict]) -> Union[str, dict]: if 
isinstance(arguments, dict): @@ -131,8 +131,9 @@ def parse_json_if_needed(arguments: Union[str, dict]) -> Union[str, dict]: def parse_tool_args_if_needed(message: ChatMessage) -> ChatMessage: - for tool_call in message.tool_calls: - tool_call.function.arguments = parse_json_if_needed(tool_call.function.arguments) + if message.tool_calls is not None: + for tool_call in message.tool_calls: + tool_call.function.arguments = parse_json_if_needed(tool_call.function.arguments) return message @@ -210,16 +211,18 @@ def get_clean_message_list( message["role"] = role_conversions[role] # encode images if needed if isinstance(message["content"], list): - for i, element in enumerate(message["content"]): + for element in message["content"]: if element["type"] == "image": assert not flatten_messages_as_text, f"Cannot use images with {flatten_messages_as_text=}" if convert_images_to_image_urls: - message["content"][i] = { - "type": "image_url", - "image_url": {"url": make_image_url(encode_image_base64(element["image"]))}, - } + element.update( + { + "type": "image_url", + "image_url": {"url": make_image_url(encode_image_base64(element.pop("image")))}, + } + ) else: - message["content"][i]["image"] = encode_image_base64(element["image"]) + element["image"] = encode_image_base64(element["image"]) if len(output_message_list) > 0 and message["role"] == output_message_list[-1]["role"]: assert isinstance(message["content"], list), "Error: wrong content:" + str(message["content"]) @@ -240,8 +243,6 @@ class Model: def __init__(self, **kwargs): self.last_input_token_count = None self.last_output_token_count = None - # Set default values for common parameters - kwargs.setdefault("max_tokens", 4096) self.kwargs = kwargs def _prepare_completion_kwargs( @@ -330,6 +331,53 @@ def __call__( """ pass # To be implemented in child classes! + def to_dict(self) -> Dict: + """ + Converts the model into a JSON-compatible dictionary. 
+ """ + model_dictionary = { + **self.kwargs, + "last_input_token_count": self.last_input_token_count, + "last_output_token_count": self.last_output_token_count, + "model_id": self.model_id, + } + for attribute in [ + "custom_role_conversion", + "temperature", + "max_tokens", + "provider", + "timeout", + "api_base", + "torch_dtype", + "device_map", + "organization", + "project", + "azure_endpoint", + ]: + if hasattr(self, attribute): + model_dictionary[attribute] = getattr(self, attribute) + + dangerous_attributes = ["token", "api_key"] + for attribute_name in dangerous_attributes: + if hasattr(self, attribute_name): + print( + f"For security reasons, we do not export the `{attribute_name}` attribute of your model. Please export it manually." + ) + return model_dictionary + + @classmethod + def from_dict(cls, model_dictionary: Dict[str, Any]) -> "Model": + model_instance = cls( + **{ + k: v + for k, v in model_dictionary.items() + if k not in ["last_input_token_count", "last_output_token_count"] + } + ) + model_instance.last_input_token_count = model_dictionary.pop("last_input_token_count", None) + model_instance.last_output_token_count = model_dictionary.pop("last_output_token_count", None) + return model_instance + class HfApiModel(Model): """A class to interact with Hugging Face's Inference API for language model interaction. @@ -339,12 +387,18 @@ class HfApiModel(Model): Parameters: model_id (`str`, *optional*, defaults to `"Qwen/Qwen2.5-Coder-32B-Instruct"`): The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. + provider (`str`, *optional*): + Name of the provider to use for inference. Can be `"replicate"`, `"together"`, `"fal-ai"`, `"sambanova"` or `"hf-inference"`. + defaults to hf-inference (HF Inference API). token (`str`, *optional*): Token used by the Hugging Face API for authentication. This token need to be authorized 'Make calls to the serverless Inference API'. 
If the model is gated (like Llama-3 models), the token also needs 'Read access to contents of all public gated repos you can access'. If not provided, the class will try to use environment variable 'HF_TOKEN', else use the token stored in the Hugging Face CLI configuration. timeout (`int`, *optional*, defaults to 120): Timeout for the API request, in seconds. + custom_role_conversions (`dict[str, str]`, *optional*): + Custom role conversion mapping to convert message roles in others. + Useful for specific models that do not support specific message roles like "system". **kwargs: Additional keyword arguments to pass to the Hugging Face API. @@ -369,15 +423,19 @@ class HfApiModel(Model): def __init__( self, model_id: str = "Qwen/Qwen2.5-Coder-32B-Instruct", + provider: Optional[str] = None, token: Optional[str] = None, timeout: Optional[int] = 120, + custom_role_conversions: Optional[Dict[str, str]] = None, **kwargs, ): super().__init__(**kwargs) self.model_id = model_id + self.provider = provider if token is None: token = os.getenv("HF_TOKEN") - self.client = InferenceClient(self.model_id, token=token, timeout=timeout) + self.client = InferenceClient(self.model_id, provider=provider, token=token, timeout=timeout) + self.custom_role_conversions = custom_role_conversions def __call__( self, @@ -393,23 +451,146 @@ def __call__( grammar=grammar, tools_to_call_from=tools_to_call_from, convert_images_to_image_urls=True, + custom_role_conversions=self.custom_role_conversions, **kwargs, ) - response = self.client.chat_completion(**completion_kwargs) self.last_input_token_count = response.usage.prompt_tokens self.last_output_token_count = response.usage.completion_tokens - message = ChatMessage.from_hf_api(response.choices[0].message) + message = ChatMessage.from_hf_api(response.choices[0].message, raw=response) if tools_to_call_from is not None: return parse_tool_args_if_needed(message) return message +class MLXModel(Model): + """A class to interact with models loaded using 
MLX on Apple silicon. + + > [!TIP] + > You must have `mlx-lm` installed on your machine. Please run `pip install smolagents[mlx-lm]` if it's not the case. + + Parameters: + model_id (str): + The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. + tool_name_key (str): + The key, which can usually be found in the model's chat template, for retrieving a tool name. + tool_arguments_key (str): + The key, which can usually be found in the model's chat template, for retrieving tool arguments. + trust_remote_code (bool): + Some models on the Hub require running remote code: for this model, you would have to set this flag to True. + kwargs (dict, *optional*): + Any additional keyword arguments that you want to use in model.generate(), for instance `max_tokens`. + + Example: + ```python + >>> engine = MLXModel( + ... model_id="mlx-community/Qwen2.5-Coder-32B-Instruct-4bit", + ... max_tokens=10000, + ... ) + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "text", "text": "Explain quantum mechanics in simple terms."} + ... ] + ... } + ... ] + >>> response = engine(messages, stop_sequences=["END"]) + >>> print(response) + "Quantum mechanics is the branch of physics that studies..." 
+ ``` + """ + + def __init__( + self, + model_id: str, + tool_name_key: str = "name", + tool_arguments_key: str = "arguments", + trust_remote_code: bool = False, + **kwargs, + ): + super().__init__(**kwargs) + if not _is_package_available("mlx_lm"): + raise ModuleNotFoundError( + "Please install 'mlx-lm' extra to use 'MLXModel': `pip install 'smolagents[mlx-lm]'`" + ) + import mlx_lm + + self.model_id = model_id + self.model, self.tokenizer = mlx_lm.load(model_id, tokenizer_config={"trust_remote_code": trust_remote_code}) + self.stream_generate = mlx_lm.stream_generate + self.tool_name_key = tool_name_key + self.tool_arguments_key = tool_arguments_key + + def _to_message(self, text, tools_to_call_from): + if tools_to_call_from: + # solution for extracting tool JSON without assuming a specific model output format + maybe_json = "{" + text.split("{", 1)[-1][::-1].split("}", 1)[-1][::-1] + "}" + parsed_text = json.loads(maybe_json) + tool_name = parsed_text.get(self.tool_name_key, None) + tool_arguments = parsed_text.get(self.tool_arguments_key, None) + if tool_name: + return ChatMessage( + role="assistant", + content="", + tool_calls=[ + ChatMessageToolCall( + id=uuid.uuid4(), + type="function", + function=ChatMessageToolCallDefinition(name=tool_name, arguments=tool_arguments), + ) + ], + ) + return ChatMessage(role="assistant", content=text) + + def __call__( + self, + messages: List[Dict[str, str]], + stop_sequences: Optional[List[str]] = None, + grammar: Optional[str] = None, + tools_to_call_from: Optional[List[Tool]] = None, + **kwargs, + ) -> ChatMessage: + completion_kwargs = self._prepare_completion_kwargs( + flatten_messages_as_text=True, # mlx-lm doesn't support vision models + messages=messages, + stop_sequences=stop_sequences, + grammar=grammar, + tools_to_call_from=tools_to_call_from, + **kwargs, + ) + messages = completion_kwargs.pop("messages") + prepared_stop_sequences = completion_kwargs.pop("stop", []) + tools = completion_kwargs.pop("tools", None) + 
completion_kwargs.pop("tool_choice", None) + + prompt_ids = self.tokenizer.apply_chat_template( + messages, + tools=tools, + add_generation_prompt=True, + ) + + self.last_input_token_count = len(prompt_ids) + self.last_output_token_count = 0 + text = "" + + for _ in self.stream_generate(self.model, self.tokenizer, prompt=prompt_ids, **completion_kwargs): + self.last_output_token_count += 1 + text += _.text + for stop_sequence in prepared_stop_sequences: + stop_sequence_start = text.rfind(stop_sequence) + if stop_sequence_start != -1: + text = text[:stop_sequence_start] + return self._to_message(text, tools_to_call_from) + + return self._to_message(text, tools_to_call_from) + + class TransformersModel(Model): - """A class to interact with Hugging Face's Inference API for language model interaction. + """A class that uses Hugging Face's Transformers library for language model interaction. - This model allows you to communicate with Hugging Face's models using the Inference API. It can be used in both serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization. + This model allows you to load and use Hugging Face's models locally using the Transformers library. It supports features like stop sequences and grammar customization. > [!TIP] > You must have `transformers` and `torch` installed on your machine. Please run `pip install smolagents[transformers]` if it's not the case. @@ -423,9 +604,6 @@ class TransformersModel(Model): The torch_dtype to initialize your model with. trust_remote_code (bool, default `False`): Some models on the Hub require running remote code: for this model, you would have to set this flag to True. - flatten_messages_as_text (`bool`, default `True`): - Whether to flatten messages as text: this must be sent to False to use VLMs (as opposed to LLMs for which this flag can be ignored). - Caution: this parameter is experimental and will be removed in an upcoming PR as we auto-detect VLMs. 
kwargs (dict, *optional*): Any additional keyword arguments that you want to use in model.generate(), for instance `max_new_tokens` or `device`. **kwargs: @@ -454,7 +632,6 @@ def __init__( device_map: Optional[str] = None, torch_dtype: Optional[str] = None, trust_remote_code: bool = False, - flatten_messages_as_text: bool = True, **kwargs, ): super().__init__(**kwargs) @@ -463,17 +640,27 @@ def __init__( "Please install 'transformers' extra to use 'TransformersModel': `pip install 'smolagents[transformers]'`" ) import torch - from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers import AutoModelForCausalLM, AutoModelForImageTextToText, AutoProcessor, AutoTokenizer default_model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct" if model_id is None: model_id = default_model_id logger.warning(f"`model_id`not provided, using this default tokenizer for token counts: '{model_id}'") self.model_id = model_id + + default_max_tokens = 5000 + max_new_tokens = kwargs.get("max_new_tokens") or kwargs.get("max_tokens") + if not max_new_tokens: + kwargs["max_new_tokens"] = default_max_tokens + logger.warning( + f"`max_new_tokens` not provided, using this default value for `max_new_tokens`: {default_max_tokens}" + ) self.kwargs = kwargs + if device_map is None: device_map = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device_map}") + self._is_vlm = False try: self.model = AutoModelForCausalLM.from_pretrained( model_id, @@ -486,6 +673,7 @@ def __init__( if "Unrecognized configuration class" in str(e): self.model = AutoModelForImageTextToText.from_pretrained(model_id, device_map=device_map) self.processor = AutoProcessor.from_pretrained(model_id) + self._is_vlm = True else: raise e except Exception as e: @@ -495,7 +683,6 @@ def __init__( self.model_id = default_model_id self.tokenizer = AutoTokenizer.from_pretrained(default_model_id) self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device_map, 
torch_dtype=torch_dtype) - self.flatten_messages_as_text = flatten_messages_as_text def make_stopping_criteria(self, stop_sequences: List[str], tokenizer) -> "StoppingCriteriaList": from transformers import StoppingCriteria, StoppingCriteriaList @@ -531,8 +718,7 @@ def __call__( messages=messages, stop_sequences=stop_sequences, grammar=grammar, - tools_to_call_from=tools_to_call_from, - flatten_messages_as_text=self.flatten_messages_as_text, + flatten_messages_as_text=(not self._is_vlm), **kwargs, ) @@ -596,13 +782,27 @@ def __call__( output = remove_stop_sequences(output, stop_sequences) if tools_to_call_from is None: - return ChatMessage(role="assistant", content=output) + return ChatMessage( + role="assistant", + content=output, + raw={"out": out, "completion_kwargs": completion_kwargs}, + ) else: if "Action:" in output: output = output.split("Action:", 1)[1].strip() - parsed_output = json.loads(output) - tool_name = parsed_output.get("tool_name") - tool_arguments = parsed_output.get("tool_arguments") + try: + start_index = output.index("{") + end_index = output.rindex("}") + output = output[start_index : end_index + 1] + except Exception as e: + raise Exception("No json blob found in output!") from e + + try: + parsed_output = json.loads(output) + except json.JSONDecodeError as e: + raise ValueError(f"Tool call '{output}' has an invalid JSON structure: {e}") + tool_name = parsed_output.get("name") + tool_arguments = parsed_output.get("arguments") return ChatMessage( role="assistant", content="", @@ -613,6 +813,7 @@ def __call__( function=ChatMessageToolCallDefinition(name=tool_name, arguments=tool_arguments), ) ], + raw={"out": out, "completion_kwargs": completion_kwargs}, ) @@ -626,30 +827,31 @@ class LiteLLMModel(Model): The base URL of the OpenAI-compatible API server. api_key (`str`, *optional*): The API key to use for authentication. 
+ custom_role_conversions (`dict[str, str]`, *optional*): + Custom role conversion mapping to convert message roles in others. + Useful for specific models that do not support specific message roles like "system". **kwargs: Additional keyword arguments to pass to the OpenAI API. """ def __init__( self, - model_id="anthropic/claude-3-5-sonnet-20240620", + model_id: str = "anthropic/claude-3-5-sonnet-20240620", api_base=None, api_key=None, + custom_role_conversions: Optional[Dict[str, str]] = None, **kwargs, ): - try: - import litellm - except ModuleNotFoundError: - raise ModuleNotFoundError( - "Please install 'litellm' extra to use LiteLLMModel: `pip install 'smolagents[litellm]'`" - ) - super().__init__(**kwargs) self.model_id = model_id - # IMPORTANT - Set this to TRUE to add the function to the prompt for Non OpenAI LLMs - litellm.add_function_to_prompt = True self.api_base = api_base self.api_key = api_key + self.custom_role_conversions = custom_role_conversions + self.flatten_messages_as_text = ( + kwargs.get("flatten_messages_as_text") + if "flatten_messages_as_text" in kwargs + else self.model_id.startswith(("ollama", "groq", "cerebras")) + ) def __call__( self, @@ -659,7 +861,12 @@ def __call__( tools_to_call_from: Optional[List[Tool]] = None, **kwargs, ) -> ChatMessage: - import litellm + try: + import litellm + except ModuleNotFoundError: + raise ModuleNotFoundError( + "Please install 'litellm' extra to use LiteLLMModel: `pip install 'smolagents[litellm]'`" + ) completion_kwargs = self._prepare_completion_kwargs( messages=messages, @@ -670,6 +877,8 @@ def __call__( api_base=self.api_base, api_key=self.api_key, convert_images_to_image_urls=True, + flatten_messages_as_text=self.flatten_messages_as_text, + custom_role_conversions=self.custom_role_conversions, **kwargs, ) @@ -677,10 +886,10 @@ def __call__( self.last_input_token_count = response.usage.prompt_tokens self.last_output_token_count = response.usage.completion_tokens - message = 
ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}) ) + message.raw = response if tools_to_call_from is not None: return parse_tool_args_if_needed(message) @@ -697,6 +906,12 @@ class OpenAIServerModel(Model): The base URL of the OpenAI-compatible API server. api_key (`str`, *optional*): The API key to use for authentication. + organization (`str`, *optional*): + The organization to use for the API request. + project (`str`, *optional*): + The project to use for the API request. + client_kwargs (`dict[str, Any]`, *optional*): + Additional keyword arguments to pass to the OpenAI client (like organization, project, max_retries etc.). custom_role_conversions (`dict[str, str]`, *optional*): Custom role conversion mapping to convert message roles in others. Useful for specific models that do not support specific message roles like "system". @@ -709,6 +924,9 @@ def __init__( model_id: str, api_base: Optional[str] = None, api_key: Optional[str] = None, + organization: Optional[str] | None = None, + project: Optional[str] | None = None, + client_kwargs: Optional[Dict[str, Any]] = None, custom_role_conversions: Optional[Dict[str, str]] = None, **kwargs, ): @@ -724,6 +942,9 @@ def __init__( self.client = openai.OpenAI( base_url=api_base, api_key=api_key, + organization=organization, + project=project, + **(client_kwargs or {}), ) self.custom_role_conversions = custom_role_conversions @@ -745,7 +966,6 @@ def __call__( convert_images_to_image_urls=True, **kwargs, ) - response = self.client.chat.completions.create(**completion_kwargs) self.last_input_token_count = response.usage.prompt_tokens self.last_output_token_count = response.usage.completion_tokens @@ -753,6 +973,7 @@ def __call__( message = ChatMessage.from_dict( response.choices[0].message.model_dump(include={"role", "content", "tool_calls"}) ) + message.raw = response if tools_to_call_from is not None: return parse_tool_args_if_needed(message) return message @@ 
-802,6 +1023,7 @@ def __init__( "tool_role_conversions", "get_clean_message_list", "Model", + "MLXModel", "TransformersModel", "HfApiModel", "LiteLLMModel", diff --git a/src/smolagents/monitoring.py b/src/smolagents/monitoring.py index 59f43f443..d7deb4403 100644 --- a/src/smolagents/monitoring.py +++ b/src/smolagents/monitoring.py @@ -14,7 +14,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import json +from enum import IntEnum +from typing import List, Optional + +from rich import box +from rich.console import Console, Group +from rich.panel import Panel +from rich.rule import Rule +from rich.syntax import Syntax +from rich.table import Table from rich.text import Text +from rich.tree import Tree + + +__all__ = ["AgentLogger", "LogLevel", "Monitor"] class Monitor: @@ -41,11 +55,11 @@ def update_metrics(self, step_log): """Update the metrics of the monitor. Args: - step_log ([`AgentStepLog`]): Step log to update the monitor with. + step_log ([`MemoryStep`]): Step log to update the monitor with. 
""" step_duration = step_log.duration self.step_durations.append(step_duration) - console_outputs = f"[Step {len(self.step_durations) - 1}: Duration {step_duration:.2f} seconds" + console_outputs = f"[Step {len(self.step_durations)}: Duration {step_duration:.2f} seconds" if getattr(self.tracked_model, "last_input_token_count", None) is not None: self.total_input_token_count += self.tracked_model.last_input_token_count @@ -57,4 +71,142 @@ def update_metrics(self, step_log): self.logger.log(Text(console_outputs, style="dim"), level=1) -__all__ = ["Monitor"] +class LogLevel(IntEnum): + OFF = -1 # No output + ERROR = 0 # Only errors + INFO = 1 # Normal output (default) + DEBUG = 2 # Detailed output + + +YELLOW_HEX = "#d4b702" + + +class AgentLogger: + def __init__(self, level: LogLevel = LogLevel.INFO): + self.level = level + self.console = Console() + + def log(self, *args, level: str | LogLevel = LogLevel.INFO, **kwargs) -> None: + """Logs a message to the console. + + Args: + level (LogLevel, optional): Defaults to LogLevel.INFO. 
+ """ + if isinstance(level, str): + level = LogLevel[level.upper()] + if level <= self.level: + self.console.print(*args, **kwargs) + + def log_markdown(self, content: str, title: Optional[str] = None, level=LogLevel.INFO, style=YELLOW_HEX) -> None: + markdown_content = Syntax( + content, + lexer="markdown", + theme="github-dark", + word_wrap=True, + ) + if title: + self.log( + Group( + Rule( + "[bold italic]" + title, + align="left", + style=style, + ), + markdown_content, + ), + level=level, + ) + else: + self.log(markdown_content, level=level) + + def log_code(self, title: str, content: str, level: int = LogLevel.INFO) -> None: + self.log( + Panel( + Syntax( + content, + lexer="python", + theme="monokai", + word_wrap=True, + ), + title="[bold]" + title, + title_align="left", + box=box.HORIZONTALS, + ), + level=level, + ) + + def log_rule(self, title: str, level: int = LogLevel.INFO) -> None: + self.log( + Rule( + "[bold]" + title, + characters="━", + style=YELLOW_HEX, + ), + level=LogLevel.INFO, + ) + + def log_task(self, content: str, subtitle: str, title: Optional[str] = None, level: int = LogLevel.INFO) -> None: + self.log( + Panel( + f"\n[bold]{content}\n", + title="[bold]New run" + (f" - {title}" if title else ""), + subtitle=subtitle, + border_style=YELLOW_HEX, + subtitle_align="left", + ), + level=level, + ) + + def log_messages(self, messages: List) -> None: + messages_as_string = "\n".join([json.dumps(dict(message), indent=4) for message in messages]) + self.log( + Syntax( + messages_as_string, + lexer="markdown", + theme="github-dark", + word_wrap=True, + ) + ) + + def visualize_agent_tree(self, agent): + def create_tools_section(tools_dict): + table = Table(show_header=True, header_style="bold") + table.add_column("Name", style="#1E90FF") + table.add_column("Description") + table.add_column("Arguments") + + for name, tool in tools_dict.items(): + args = [ + f"{arg_name} (`{info.get('type', 'Any')}`{', optional' if info.get('optional') else ''}): 
{info.get('description', '')}" + for arg_name, info in getattr(tool, "inputs", {}).items() + ] + table.add_row(name, getattr(tool, "description", str(tool)), "\n".join(args)) + + return Group("🛠️ [italic #1E90FF]Tools:[/italic #1E90FF]", table) + + def get_agent_headline(agent, name: Optional[str] = None): + name_headline = f"{name} | " if name else "" + return f"[bold {YELLOW_HEX}]{name_headline}{agent.__class__.__name__} | {agent.model.model_id}" + + def build_agent_tree(parent_tree, agent_obj): + """Recursively builds the agent tree.""" + parent_tree.add(create_tools_section(agent_obj.tools)) + + if agent_obj.managed_agents: + agents_branch = parent_tree.add("🤖 [italic #1E90FF]Managed agents:") + for name, managed_agent in agent_obj.managed_agents.items(): + agent_tree = agents_branch.add(get_agent_headline(managed_agent, name)) + if managed_agent.__class__.__name__ == "CodeAgent": + agent_tree.add( + f"✅ [italic #1E90FF]Authorized imports:[/italic #1E90FF] {managed_agent.additional_authorized_imports}" + ) + agent_tree.add(f"📝 [italic #1E90FF]Description:[/italic #1E90FF] {managed_agent.description}") + build_agent_tree(agent_tree, managed_agent) + + main_tree = Tree(get_agent_headline(agent)) + if agent.__class__.__name__ == "CodeAgent": + main_tree.add( + f"✅ [italic #1E90FF]Authorized imports:[/italic #1E90FF] {agent.additional_authorized_imports}" + ) + build_agent_tree(main_tree, agent) + self.console.print(main_tree) diff --git a/src/smolagents/prompts.py b/src/smolagents/prompts.py deleted file mode 100644 index ce905c81b..000000000 --- a/src/smolagents/prompts.py +++ /dev/null @@ -1,522 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -SINGLE_STEP_CODE_SYSTEM_PROMPT = """You will be given a task to solve, your job is to come up with a series of simple commands in Python that will perform the task. -To help you, I will give you access to a set of tools that you can use. Each tool is a Python function and has a description explaining the task it performs, the inputs it expects and the outputs it returns. -You should first explain which tool you will use to perform the task and for what reason, then write the code in Python. -Each instruction in Python should be a simple assignment. You can print intermediate results if it makes sense to do so. -In the end, use tool 'final_answer' to return your answer, its argument will be what gets returned. -You can use imports in your code, but only from the following list of modules: <> -Be sure to provide a 'Code:' token, else the run will fail. - -Tools: -{{tool_descriptions}} - -Examples: ---- -Task: -"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French. -You have been provided with these additional arguments, that you can access using the keys as variables in your python code: -{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}" - -Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image. 
-Code: -```py -translated_question = translator(question=question, src_lang="French", tgt_lang="English") -print(f"The translated question is {translated_question}.") -answer = image_qa(image=image, question=translated_question) -final_answer(f"The answer is {answer}") -``` - ---- -Task: "Identify the oldest person in the `document` and create an image showcasing the result." - -Thought: I will use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. -Code: -```py -answer = document_qa(document, question="What is the oldest person?") -print(f"The answer is {answer}.") -image = image_generator(answer) -final_answer(image) -``` - ---- -Task: "Generate an image using the text given in the variable `caption`." - -Thought: I will use the following tool: `image_generator` to generate an image. -Code: -```py -image = image_generator(prompt=caption) -final_answer(image) -``` - ---- -Task: "Summarize the text given in the variable `text` and read it out loud." - -Thought: I will use the following tools: `summarizer` to create a summary of the input text, then `text_reader` to read it out loud. -Code: -```py -summarized_text = summarizer(text) -print(f"Summary: {summarized_text}") -audio_summary = text_reader(summarized_text) -final_answer(audio_summary) -``` - ---- -Task: "Answer the question in the variable `question` about the text in the variable `text`. Use the answer to generate an image." - -Thought: I will use the following tools: `text_qa` to create the answer, then `image_generator` to generate an image according to the answer. -Code: -```py -answer = text_qa(text=text, question=question) -print(f"The answer is {answer}.") -image = image_generator(answer) -final_answer(image) -``` - ---- -Task: "Caption the following `image`." - -Thought: I will use the following tool: `image_captioner` to generate a caption for the image. 
-Code: -```py -caption = image_captioner(image) -final_answer(caption) -``` - ---- -Above example were using tools that might not exist for you. You only have access to these tools: -{{tool_names}} - -{{managed_agents_descriptions}} - -Remember to make sure that variables you use are all defined. In particular don't import packages! -Be sure to provide a 'Code:\n```' sequence before the code and '```' after, else you will get an error. -DO NOT pass the arguments as a dict as in 'answer = ask_search_agent({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = ask_search_agent(query="What is the place where James Bond lives?")'. - -Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000. -""" - - -TOOL_CALLING_SYSTEM_PROMPT = """You are an expert assistant who can solve any task using tool calls. You will be given a task to solve as best you can. -To do so, you have been given access to the following tools: {{tool_names}} - -The tool call you write is an action: after the tool is executed, you will get the result of the tool call as an "observation". -This Action/Observation can repeat N times, you should take several steps when needed. - -You can use the result of the previous action as input for the next action. -The observation will always be a string: it can represent a file, like "image_1.jpg". -Then you can use it as input for the next action. You can do it for instance as follows: - -Observation: "image_1.jpg" - -Action: -{ - "name": "image_transformer", - "arguments": {"image": "image_1.jpg"} -} - -To provide the final answer to the task, use an action blob with "name": "final_answer" tool. It is the only way to complete the task, else you will be stuck on a loop. 
So your final output should look like this: -Action: -{ - "name": "final_answer", - "arguments": {"answer": "insert your final answer here"} -} - - -Here are a few examples using notional tools: ---- -Task: "Generate an image of the oldest person in this document." - -Action: -{ - "name": "document_qa", - "arguments": {"document": "document.pdf", "question": "Who is the oldest person mentioned?"} -} -Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland." - -Action: -{ - "name": "image_generator", - "arguments": {"prompt": "A portrait of John Doe, a 55-year-old man living in Canada."} -} -Observation: "image.png" - -Action: -{ - "name": "final_answer", - "arguments": "image.png" -} - ---- -Task: "What is the result of the following operation: 5 + 3 + 1294.678?" - -Action: -{ - "name": "python_interpreter", - "arguments": {"code": "5 + 3 + 1294.678"} -} -Observation: 1302.678 - -Action: -{ - "name": "final_answer", - "arguments": "1302.678" -} - ---- -Task: "Which city has the highest population , Guangzhou or Shanghai?" - -Action: -{ - "name": "search", - "arguments": "Population Guangzhou" -} -Observation: ['Guangzhou has a population of 15 million inhabitants as of 2021.'] - - -Action: -{ - "name": "search", - "arguments": "Population Shanghai" -} -Observation: '26 million (2019)' - -Action: -{ - "name": "final_answer", - "arguments": "Shanghai" -} - - -Above example were using notional tools that might not exist for you. You only have access to these tools: - -{{tool_descriptions}} - -{{managed_agents_descriptions}} - -Here are the rules you should always follow to solve your task: -1. ALWAYS provide a tool call, else you will fail. -2. Always use the right arguments for the tools. Never use variable names as the action arguments, use the value instead. -3. Call a tool only when needed: do not call the search agent if you do not need information, try to solve the task yourself. 
-If no tool call is needed, use final_answer tool to return your answer. -4. Never re-do a tool call that you previously did with the exact same parameters. - -Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000. -""" - -CODE_SYSTEM_PROMPT = """You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can. -To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code. -To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. - -At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use. -Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '' sequence. -During each intermediate step, you can use 'print()' to save whatever important information you will then need. -These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step. -In the end you have to return a final answer using the `final_answer` tool. - -Here are a few examples using notional tools: ---- -Task: "Generate an image of the oldest person in this document." - -Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. -Code: -```py -answer = document_qa(document=document, question="Who is the oldest person mentioned?") -print(answer) -``` -Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland." - -Thought: I will now generate an image showcasing the oldest person. 
-Code: -```py -image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.") -final_answer(image) -``` - ---- -Task: "What is the result of the following operation: 5 + 3 + 1294.678?" - -Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool -Code: -```py -result = 5 + 3 + 1294.678 -final_answer(result) -``` - ---- -Task: -"Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French. -You have been provided with these additional arguments, that you can access using the keys as variables in your python code: -{'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}" - -Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image. -Code: -```py -translated_question = translator(question=question, src_lang="French", tgt_lang="English") -print(f"The translated question is {translated_question}.") -answer = image_qa(image=image, question=translated_question) -final_answer(f"The answer is {answer}") -``` - ---- -Task: -In a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer. -What does he say was the consequence of Einstein learning too much math on his creativity, in one word? - -Thought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin. -Code: -```py -pages = search(query="1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein") -print(pages) -``` -Observation: -No result found for query "1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein". - -Thought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query. 
-Code: -```py -pages = search(query="1979 interview Stanislaus Ulam") -print(pages) -``` -Observation: -Found 6 pages: -[Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/) - -[Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/) - -(truncated) - -Thought: I will read the first 2 pages to know more. -Code: -```py -for url in ["https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/", "https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/"]: - whole_page = visit_webpage(url) - print(whole_page) - print("\n" + "="*80 + "\n") # Print separator between pages -``` -Observation: -Manhattan Project Locations: -Los Alamos, NM -Stanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at -(truncated) - -Thought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: "He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity." Let's answer in one word. -Code: -```py -final_answer("diminished") -``` - ---- -Task: "Which city has the highest population: Guangzhou or Shanghai?" - -Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities. -Code: -```py -for city in ["Guangzhou", "Shanghai"]: - print(f"Population {city}:", search(f"{city} population") -``` -Observation: -Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.'] -Population Shanghai: '26 million (2019)' - -Thought: Now I know that Shanghai has the highest population. -Code: -```py -final_answer("Shanghai") -``` - ---- -Task: "What is the current age of the pope, raised to the power 0.36?" 
- -Thought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search. -Code: -```py -pope_age_wiki = wiki(query="current pope age") -print("Pope age as per wikipedia:", pope_age_wiki) -pope_age_search = web_search(query="current pope age") -print("Pope age as per google search:", pope_age_search) -``` -Observation: -Pope age: "The pope Francis is currently 88 years old." - -Thought: I know that the pope is 88 years old. Let's compute the result using python code. -Code: -```py -pope_current_age = 88 ** 0.36 -final_answer(pope_current_age) -``` - -Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools: - -{{tool_descriptions}} - -{{managed_agents_descriptions}} - -Here are the rules you should always follow to solve your task: -1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```' sequence, else you will fail. -2. Use only variables that you have defined! -3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'. -4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block. -5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters. -6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'. -7. 
Never create any notional variables in our code, as having these in your logs will derail you from the true variables. -8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}} -9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist. -10. Don't give up! You're in charge of solving the task, not providing directions to solve it. - -Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000. -""" - -SYSTEM_PROMPT_FACTS = """Below I will present you a task. - -You will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need. -To do so, you will have to read the task and identify things that must be discovered in order to successfully complete it. -Don't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey: - ---- -### 1. Facts given in the task -List here the specific facts given in the task that could help you (there might be nothing here). - -### 2. Facts to look up -List here any facts that we may need to look up. -Also list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here. - -### 3. Facts to derive -List here anything that we want to derive from the above by logical reasoning, for instance computation or simulation. - -Keep in mind that "facts" will typically be specific names, dates, values, etc. Your answer should use the below headings: -### 1. Facts given in the task -### 2. Facts to look up -### 3. Facts to derive -Do not add anything else.""" - -SYSTEM_PROMPT_PLAN = """You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools. - -Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts. 
-This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. -Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS. -After writing the final step of the plan, write the '\n' tag and stop there.""" - -USER_PROMPT_PLAN = """ -Here is your task: - -Task: -``` -{task} -``` - -Your plan can leverage any of these tools: -{tool_descriptions} - -{managed_agents_descriptions} - -List of facts that you know: -``` -{answer_facts} -``` - -Now begin! Write your plan below.""" - -SYSTEM_PROMPT_FACTS_UPDATE = """ -You are a world expert at gathering known and unknown facts based on a conversation. -Below you will find a task, and ahistory of attempts made to solve the task. You will have to produce a list of these: -### 1. Facts given in the task -### 2. Facts that we have learned -### 3. Facts still to look up -### 4. Facts still to derive -Find the task and history below.""" - -USER_PROMPT_FACTS_UPDATE = """Earlier we've built a list of facts. -But since in your previous steps you may have learned useful new facts or invalidated some false ones. -Please update your list of facts based on the previous history, and provide these headings: -### 1. Facts given in the task -### 2. Facts that we have learned -### 3. Facts still to look up -### 4. Facts still to derive - -Now write your new list of facts below.""" - -SYSTEM_PROMPT_PLAN_UPDATE = """You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools. - -You have been given a task: -``` -{task} -``` - -Find below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task. -If the previous tries so far have met some success, you can make an updated plan based on these actions. -If you are stalled, you can make a completely new plan starting from scratch. 
-""" - -USER_PROMPT_PLAN_UPDATE = """You're still working towards solving this task: -``` -{task} -``` - -You have access to these tools and only these: -{tool_descriptions} - -{managed_agents_descriptions} - -Here is the up to date list of facts that you know: -``` -{facts_update} -``` - -Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts. -This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. -Beware that you have {remaining_steps} steps remaining. -Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS. -After writing the final step of the plan, write the '\n' tag and stop there. - -Now write your new plan below.""" - -PLAN_UPDATE_FINAL_PLAN_REDACTION = """I still need to solve the task I was given: -``` -{task} -``` - -Here is my new/updated plan of action to solve the task: -``` -{plan_update} -```""" - -MANAGED_AGENT_PROMPT = """You're a helpful agent named '{name}'. -You have been submitted this task by your manager. ---- -Task: -{task} ---- -You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer. - -Your final_answer WILL HAVE to contain these parts: -### 1. Task outcome (short version): -### 2. Task outcome (extremely detailed version): -### 3. Additional context (if relevant): - -Put all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost. -And even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback. 
-{{additional_prompting}}""" - -__all__ = [ - "USER_PROMPT_PLAN_UPDATE", - "PLAN_UPDATE_FINAL_PLAN_REDACTION", - "SINGLE_STEP_CODE_SYSTEM_PROMPT", - "CODE_SYSTEM_PROMPT", - "TOOL_CALLING_SYSTEM_PROMPT", - "MANAGED_AGENT_PROMPT", -] diff --git a/src/smolagents/prompts/code_agent.yaml b/src/smolagents/prompts/code_agent.yaml new file mode 100644 index 000000000..b7388e207 --- /dev/null +++ b/src/smolagents/prompts/code_agent.yaml @@ -0,0 +1,333 @@ +system_prompt: |- + You are an expert assistant who can solve any task using code blobs. You will be given a task to solve as best you can. + To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code. + To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. + + At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use. + Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '' sequence. + During each intermediate step, you can use 'print()' to save whatever important information you will then need. + These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step. + In the end you have to return a final answer using the `final_answer` tool. + + Here are a few examples using notional tools: + --- + Task: "Generate an image of the oldest person in this document." + + Thought: I will proceed step by step and use the following tools: `document_qa` to find the oldest person in the document, then `image_generator` to generate an image according to the answer. 
+ Code: + ```py + answer = document_qa(document=document, question="Who is the oldest person mentioned?") + print(answer) + ``` + Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland." + + Thought: I will now generate an image showcasing the oldest person. + Code: + ```py + image = image_generator("A portrait of John Doe, a 55-year-old man living in Canada.") + final_answer(image) + ``` + + --- + Task: "What is the result of the following operation: 5 + 3 + 1294.678?" + + Thought: I will use python code to compute the result of the operation and then return the final answer using the `final_answer` tool + Code: + ```py + result = 5 + 3 + 1294.678 + final_answer(result) + ``` + + --- + Task: + "Answer the question in the variable `question` about the image stored in the variable `image`. The question is in French. + You have been provided with these additional arguments, that you can access using the keys as variables in your python code: + {'question': 'Quel est l'animal sur l'image?', 'image': 'path/to/image.jpg'}" + + Thought: I will use the following tools: `translator` to translate the question into English and then `image_qa` to answer the question on the input image. + Code: + ```py + translated_question = translator(question=question, src_lang="French", tgt_lang="English") + print(f"The translated question is {translated_question}.") + answer = image_qa(image=image, question=translated_question) + final_answer(f"The answer is {answer}") + ``` + + --- + Task: + In a 1979 interview, Stanislaus Ulam discusses with Martin Sherwin about other great physicists of his time, including Oppenheimer. + What does he say was the consequence of Einstein learning too much math on his creativity, in one word? + + Thought: I need to find and read the 1979 interview of Stanislaus Ulam with Martin Sherwin. 
+ Code: + ```py + pages = search(query="1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein") + print(pages) + ``` + Observation: + No result found for query "1979 interview Stanislaus Ulam Martin Sherwin physicists Einstein". + + Thought: The query was maybe too restrictive and did not find any results. Let's try again with a broader query. + Code: + ```py + pages = search(query="1979 interview Stanislaus Ulam") + print(pages) + ``` + Observation: + Found 6 pages: + [Stanislaus Ulam 1979 interview](https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/) + + [Ulam discusses Manhattan Project](https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/) + + (truncated) + + Thought: I will read the first 2 pages to know more. + Code: + ```py + for url in ["https://ahf.nuclearmuseum.org/voices/oral-histories/stanislaus-ulams-interview-1979/", "https://ahf.nuclearmuseum.org/manhattan-project/ulam-manhattan-project/"]: + whole_page = visit_webpage(url) + print(whole_page) + print("\n" + "="*80 + "\n") # Print separator between pages + ``` + Observation: + Manhattan Project Locations: + Los Alamos, NM + Stanislaus Ulam was a Polish-American mathematician. He worked on the Manhattan Project at Los Alamos and later helped design the hydrogen bomb. In this interview, he discusses his work at + (truncated) + + Thought: I now have the final answer: from the webpages visited, Stanislaus Ulam says of Einstein: "He learned too much mathematics and sort of diminished, it seems to me personally, it seems to me his purely physics creativity." Let's answer in one word. + Code: + ```py + final_answer("diminished") + ``` + + --- + Task: "Which city has the highest population: Guangzhou or Shanghai?" + + Thought: I need to get the populations for both cities and compare them: I will use the tool `search` to get the population of both cities. 
+ Code:
+ ```py
+ for city in ["Guangzhou", "Shanghai"]:
+ print(f"Population {city}:", search(f"{city} population"))
+ ```
+ Observation:
+ Population Guangzhou: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
+ Population Shanghai: '26 million (2019)'
+
+ Thought: Now I know that Shanghai has the highest population.
+ Code:
+ ```py
+ final_answer("Shanghai")
+ ```
+
+ ---
+ Task: "What is the current age of the pope, raised to the power 0.36?"
+
+ Thought: I will use the tool `wiki` to get the age of the pope, and confirm that with a web search.
+ Code:
+ ```py
+ pope_age_wiki = wiki(query="current pope age")
+ print("Pope age as per wikipedia:", pope_age_wiki)
+ pope_age_search = web_search(query="current pope age")
+ print("Pope age as per google search:", pope_age_search)
+ ```
+ Observation:
+ Pope age: "The pope Francis is currently 88 years old."
+
+ Thought: I know that the pope is 88 years old. Let's compute the result using python code.
+ Code:
+ ```py
+ pope_current_age = 88 ** 0.36
+ final_answer(pope_current_age)
+ ```
+
+ Above examples were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
+ {%- for tool in tools.values() %}
+ - {{ tool.name }}: {{ tool.description }}
+ Takes inputs: {{tool.inputs}}
+ Returns an output of type: {{tool.output_type}}
+ {%- endfor %}
+
+ {%- if managed_agents and managed_agents.values() | list %}
+ You can also give tasks to team members.
+ Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task.
+ Given that this team member is a real human, you should be very verbose in your task.
+ Here is a list of the team members that you can call: + {%- for agent in managed_agents.values() %} + - {{ agent.name }}: {{ agent.description }} + {%- endfor %} + {%- else %} + {%- endif %} + + Here are the rules you should always follow to solve your task: + 1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```' sequence, else you will fail. + 2. Use only variables that you have defined! + 3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'. + 4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block. + 5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters. + 6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'. + 7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables. + 8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}} + 9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist. + 10. Don't give up! You're in charge of solving the task, not providing directions to solve it. + + Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000. +planning: + initial_facts: |- + Below I will present you a task. 
+ + You will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need. + To do so, you will have to read the task and identify things that must be discovered in order to successfully complete it. + Don't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey: + + --- + ### 1. Facts given in the task + List here the specific facts given in the task that could help you (there might be nothing here). + + ### 2. Facts to look up + List here any facts that we may need to look up. + Also list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here. + + ### 3. Facts to derive + List here anything that we want to derive from the above by logical reasoning, for instance computation or simulation. + + Keep in mind that "facts" will typically be specific names, dates, values, etc. Your answer should use the below headings: + ### 1. Facts given in the task + ### 2. Facts to look up + ### 3. Facts to derive + Do not add anything else. + + Here is the task: + ``` + {{task}} + ``` + Now begin! + initial_plan : |- + You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools. + + Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts. + This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. + Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS. + After writing the final step of the plan, write the '\n' tag and stop there. 
+ + Here is your task: + + Task: + ``` + {{task}} + ``` + You can leverage these tools: + {%- for tool in tools.values() %} + - {{ tool.name }}: {{ tool.description }} + Takes inputs: {{tool.inputs}} + Returns an output of type: {{tool.output_type}} + {%- endfor %} + + {%- if managed_agents and managed_agents.values() | list %} + You can also give tasks to team members. + Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task. + Given that this team member is a real human, you should be very verbose in your task. + Here is a list of the team members that you can call: + {%- for agent in managed_agents.values() %} + - {{ agent.name }}: {{ agent.description }} + {%- endfor %} + {%- else %} + {%- endif %} + + List of facts that you know: + ``` + {{answer_facts}} + ``` + + Now begin! Write your plan below. + update_facts_pre_messages: |- + You are a world expert at gathering known and unknown facts based on a conversation. + Below you will find a task, and a history of attempts made to solve the task. You will have to produce a list of these: + ### 1. Facts given in the task + ### 2. Facts that we have learned + ### 3. Facts still to look up + ### 4. Facts still to derive + Find the task and history below: + update_facts_post_messages: |- + Earlier we've built a list of facts. + But since in your previous steps you may have learned useful new facts or invalidated some false ones. + Please update your list of facts based on the previous history, and provide these headings: + ### 1. Facts given in the task + ### 2. Facts that we have learned + ### 3. Facts still to look up + ### 4. Facts still to derive + + Now write your new list of facts below. + update_plan_pre_messages: |- + You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools. 
+
+ You have been given a task:
+ ```
+ {{task}}
+ ```
+
+ Find below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task.
+ If the previous tries so far have met some success, you can make an updated plan based on these actions.
+ If you are stalled, you can make a completely new plan starting from scratch.
+ update_plan_post_messages: |-
+ You're still working towards solving this task:
+ ```
+ {{task}}
+ ```
+
+ You can leverage these tools:
+ {%- for tool in tools.values() %}
+ - {{ tool.name }}: {{ tool.description }}
+ Takes inputs: {{tool.inputs}}
+ Returns an output of type: {{tool.output_type}}
+ {%- endfor %}
+
+ {%- if managed_agents and managed_agents.values() | list %}
+ You can also give tasks to team members.
+ Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task'.
+ Given that this team member is a real human, you should be very verbose in your task, it should be a long string providing information as detailed as necessary.
+ Here is a list of the team members that you can call:
+ {%- for agent in managed_agents.values() %}
+ - {{ agent.name }}: {{ agent.description }}
+ {%- endfor %}
+ {%- else %}
+ {%- endif %}
+
+ Here is the up-to-date list of facts that you know:
+ ```
+ {{facts_update}}
+ ```
+
+ Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
+ This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
+ Beware that you have {remaining_steps} steps remaining.
+ Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
+ After writing the final step of the plan, write the '\n' tag and stop there.
+
+ Now write your new plan below.
+managed_agent:
+ task: |-
+ You're a helpful agent named '{{name}}'.
+ You have been submitted this task by your manager. + --- + Task: + {{task}} + --- + You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer. + + Your final_answer WILL HAVE to contain these parts: + ### 1. Task outcome (short version): + ### 2. Task outcome (extremely detailed version): + ### 3. Additional context (if relevant): + + Put all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost. + And even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback. + report: |- + Here is the final answer from your managed agent '{{name}}': + {{final_answer}} +final_answer: + pre_messages: |- + An agent tried to answer a user query but it got stuck and failed to do so. You are tasked with providing an answer instead. Here is the agent's memory: + post_messages: |- + Based on the above, please provide an answer to the following user task: + {{task}} diff --git a/src/smolagents/prompts/toolcalling_agent.yaml b/src/smolagents/prompts/toolcalling_agent.yaml new file mode 100644 index 000000000..744bd7451 --- /dev/null +++ b/src/smolagents/prompts/toolcalling_agent.yaml @@ -0,0 +1,276 @@ +system_prompt: |- + You are an expert assistant who can solve any task using tool calls. You will be given a task to solve as best you can. + To do so, you have been given access to some tools. + + The tool call you write is an action: after the tool is executed, you will get the result of the tool call as an "observation". + This Action/Observation can repeat N times, you should take several steps when needed. + + You can use the result of the previous action as input for the next action. + The observation will always be a string: it can represent a file, like "image_1.jpg". + Then you can use it as input for the next action. 
You can do it for instance as follows:
+
+ Observation: "image_1.jpg"
+
+ Action:
+ {
+ "name": "image_transformer",
+ "arguments": {"image": "image_1.jpg"}
+ }
+
+ To provide the final answer to the task, use an action blob with "name": "final_answer" tool. It is the only way to complete the task, else you will be stuck on a loop. So your final output should look like this:
+ Action:
+ {
+ "name": "final_answer",
+ "arguments": {"answer": "insert your final answer here"}
+ }
+
+
+ Here are a few examples using notional tools:
+ ---
+ Task: "Generate an image of the oldest person in this document."
+
+ Action:
+ {
+ "name": "document_qa",
+ "arguments": {"document": "document.pdf", "question": "Who is the oldest person mentioned?"}
+ }
+ Observation: "The oldest person in the document is John Doe, a 55 year old lumberjack living in Newfoundland."
+
+ Action:
+ {
+ "name": "image_generator",
+ "arguments": {"prompt": "A portrait of John Doe, a 55-year-old man living in Canada."}
+ }
+ Observation: "image.png"
+
+ Action:
+ {
+ "name": "final_answer",
+ "arguments": "image.png"
+ }
+
+ ---
+ Task: "What is the result of the following operation: 5 + 3 + 1294.678?"
+
+ Action:
+ {
+ "name": "python_interpreter",
+ "arguments": {"code": "5 + 3 + 1294.678"}
+ }
+ Observation: 1302.678
+
+ Action:
+ {
+ "name": "final_answer",
+ "arguments": "1302.678"
+ }
+
+ ---
+ Task: "Which city has the highest population, Guangzhou or Shanghai?"
+
+ Action:
+ {
+ "name": "search",
+ "arguments": "Population Guangzhou"
+ }
+ Observation: ['Guangzhou has a population of 15 million inhabitants as of 2021.']
+
+
+ Action:
+ {
+ "name": "search",
+ "arguments": "Population Shanghai"
+ }
+ Observation: '26 million (2019)'
+
+ Action:
+ {
+ "name": "final_answer",
+ "arguments": "Shanghai"
+ }
+
+ Above examples were using notional tools that might not exist for you.
You only have access to these tools: + {%- for tool in tools.values() %} + - {{ tool.name }}: {{ tool.description }} + Takes inputs: {{tool.inputs}} + Returns an output of type: {{tool.output_type}} + {%- endfor %} + + {%- if managed_agents and managed_agents.values() | list %} + You can also give tasks to team members. + Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task. + Given that this team member is a real human, you should be very verbose in your task. + Here is a list of the team members that you can call: + {%- for agent in managed_agents.values() %} + - {{ agent.name }}: {{ agent.description }} + {%- endfor %} + {%- else %} + {%- endif %} + + Here are the rules you should always follow to solve your task: + 1. ALWAYS provide a tool call, else you will fail. + 2. Always use the right arguments for the tools. Never use variable names as the action arguments, use the value instead. + 3. Call a tool only when needed: do not call the search agent if you do not need information, try to solve the task yourself. + If no tool call is needed, use final_answer tool to return your answer. + 4. Never re-do a tool call that you previously did with the exact same parameters. + + Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000. +planning: + initial_facts: |- + Below I will present you a task. + + You will now build a comprehensive preparatory survey of which facts we have at our disposal and which ones we still need. + To do so, you will have to read the task and identify things that must be discovered in order to successfully complete it. + Don't make any assumptions. For each item, provide a thorough reasoning. Here is how you will structure this survey: + + --- + ### 1. Facts given in the task + List here the specific facts given in the task that could help you (there might be nothing here). + + ### 2. 
Facts to look up + List here any facts that we may need to look up. + Also list where to find each of these, for instance a website, a file... - maybe the task contains some sources that you should re-use here. + + ### 3. Facts to derive + List here anything that we want to derive from the above by logical reasoning, for instance computation or simulation. + + Keep in mind that "facts" will typically be specific names, dates, values, etc. Your answer should use the below headings: + ### 1. Facts given in the task + ### 2. Facts to look up + ### 3. Facts to derive + Do not add anything else. + + Here is the task: + ``` + {{task}} + ``` + Now begin! + initial_plan : |- + You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools. + + Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts. + This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. + Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS. + After writing the final step of the plan, write the '\n' tag and stop there. + + Here is your task: + + Task: + ``` + {{task}} + ``` + You can leverage these tools: + {%- for tool in tools.values() %} + - {{ tool.name }}: {{ tool.description }} + Takes inputs: {{tool.inputs}} + Returns an output of type: {{tool.output_type}} + {%- endfor %} + + {%- if managed_agents and managed_agents.values() | list %} + You can also give tasks to team members. + Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task', a long string explaining your task. + Given that this team member is a real human, you should be very verbose in your task. 
+ Here is a list of the team members that you can call: + {%- for agent in managed_agents.values() %} + - {{ agent.name }}: {{ agent.description }} + {%- endfor %} + {%- else %} + {%- endif %} + + List of facts that you know: + ``` + {{answer_facts}} + ``` + + Now begin! Write your plan below. + update_facts_pre_messages: |- + You are a world expert at gathering known and unknown facts based on a conversation. + Below you will find a task, and a history of attempts made to solve the task. You will have to produce a list of these: + ### 1. Facts given in the task + ### 2. Facts that we have learned + ### 3. Facts still to look up + ### 4. Facts still to derive + Find the task and history below: + update_facts_post_messages: |- + Earlier we've built a list of facts. + But since in your previous steps you may have learned useful new facts or invalidated some false ones. + Please update your list of facts based on the previous history, and provide these headings: + ### 1. Facts given in the task + ### 2. Facts that we have learned + ### 3. Facts still to look up + ### 4. Facts still to derive + + Now write your new list of facts below. + update_plan_pre_messages: |- + You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools. + + You have been given a task: + ``` + {{task}} + ``` + + Find below the record of what has been tried so far to solve it. Then you will be asked to make an updated plan to solve the task. + If the previous tries so far have met some success, you can make an updated plan based on these actions. + If you are stalled, you can make a completely new plan starting from scratch. 
+
+ update_plan_post_messages: |-
+ You're still working towards solving this task:
+ ```
+ {{task}}
+ ```
+
+ You can leverage these tools:
+ {%- for tool in tools.values() %}
+ - {{ tool.name }}: {{ tool.description }}
+ Takes inputs: {{tool.inputs}}
+ Returns an output of type: {{tool.output_type}}
+ {%- endfor %}
+
+ {%- if managed_agents and managed_agents.values() | list %}
+ You can also give tasks to team members.
+ Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is 'task'.
+ Given that this team member is a real human, you should be very verbose in your task, it should be a long string providing information as detailed as necessary.
+ Here is a list of the team members that you can call:
+ {%- for agent in managed_agents.values() %}
+ - {{ agent.name }}: {{ agent.description }}
+ {%- endfor %}
+ {%- else %}
+ {%- endif %}
+
+ Here is the up-to-date list of facts that you know:
+ ```
+ {{facts_update}}
+ ```
+
+ Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts.
+ This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer.
+ Beware that you have {remaining_steps} steps remaining.
+ Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS.
+ After writing the final step of the plan, write the '\n' tag and stop there.
+
+ Now write your new plan below.
+managed_agent:
+ task: |-
+ You're a helpful agent named '{{name}}'.
+ You have been submitted this task by your manager.
+ ---
+ Task:
+ {{task}}
+ ---
+ You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible to give them a clear understanding of the answer.
+
+ Your final_answer WILL HAVE to contain these parts:
+ ### 1. Task outcome (short version):
+ ### 2.
Task outcome (extremely detailed version): + ### 3. Additional context (if relevant): + + Put all these in your final_answer tool, everything that you do not pass as an argument to final_answer will be lost. + And even if your task resolution is not successful, please return as much context as possible, so that your manager can act upon this feedback. + report: |- + Here is the final answer from your managed agent '{{name}}': + {{final_answer}} +final_answer: + pre_messages: |- + An agent tried to answer a user query but it got stuck and failed to do so. You are tasked with providing an answer instead. Here is the agent's memory: + post_messages: |- + Based on the above, please provide an answer to the following user task: + {{task}} diff --git a/src/smolagents/tool_validation.py b/src/smolagents/tool_validation.py index 9ac157c4e..125e68993 100644 --- a/src/smolagents/tool_validation.py +++ b/src/smolagents/tool_validation.py @@ -1,6 +1,6 @@ import ast import builtins -import inspect +from itertools import zip_longest from typing import Set from .utils import BASE_BUILTIN_MODULES, get_source @@ -25,6 +25,7 @@ def __init__(self, class_attributes: Set[str], check_imports: bool = True): self.class_attributes = class_attributes self.errors = [] self.check_imports = check_imports + self.typing_names = {"Any"} def visit_arguments(self, node): """Collect function arguments""" @@ -82,6 +83,31 @@ def visit_For(self, node): self.assigned_names.add(elt.id) self.generic_visit(node) + def _handle_comprehension_generators(self, generators): + """Helper method to handle generators in all types of comprehensions""" + for generator in generators: + if isinstance(generator.target, ast.Name): + self.assigned_names.add(generator.target.id) + elif isinstance(generator.target, ast.Tuple): + for elt in generator.target.elts: + if isinstance(elt, ast.Name): + self.assigned_names.add(elt.id) + + def visit_ListComp(self, node): + """Track variables in list comprehensions""" + 
self._handle_comprehension_generators(node.generators) + self.generic_visit(node) + + def visit_DictComp(self, node): + """Track variables in dictionary comprehensions""" + self._handle_comprehension_generators(node.generators) + self.generic_visit(node) + + def visit_SetComp(self, node): + """Track variables in set comprehensions""" + self._handle_comprehension_generators(node.generators) + self.generic_visit(node) + def visit_Attribute(self, node): if not (isinstance(node.value, ast.Name) and node.value.id == "self"): self.generic_visit(node) @@ -97,6 +123,7 @@ def visit_Name(self, node): or node.id in self.imports or node.id in self.from_imports or node.id in self.assigned_names + or node.id in self.typing_names ): self.errors.append(f"Name '{node.id}' is undefined.") @@ -119,7 +146,8 @@ def visit_Call(self, node): def validate_tool_attributes(cls, check_imports: bool = True) -> None: """ Validates that a Tool class follows the proper patterns: - 0. __init__ takes no argument (args chosen at init are not traceable so we cannot rebuild the source code for them, make them class attributes!). + 0. Any argument of __init__ should have a default. + Args chosen at init are not traceable, so we cannot rebuild the source code for them, thus any important arg should be defined as a class attribute. 1. About the class: - Class attributes should only be strings or dicts - Class attributes cannot be complex attributes @@ -129,34 +157,19 @@ def validate_tool_attributes(cls, check_imports: bool = True) -> None: Raises all errors encountered, if no error returns None. 
""" - errors = [] - - source = get_source(cls) - - tree = ast.parse(source) - - if not isinstance(tree.body[0], ast.ClassDef): - raise ValueError("Source code must define a class") - - # Check that __init__ method takes no arguments - if not cls.__init__.__qualname__ == "Tool.__init__": - sig = inspect.signature(cls.__init__) - non_self_params = list([arg_name for arg_name in sig.parameters.keys() if arg_name != "self"]) - if len(non_self_params) > 0: - errors.append( - f"This tool has additional args specified in __init__(self): {non_self_params}. Make sure it does not, all values should be hardcoded!" - ) - - class_node = tree.body[0] class ClassLevelChecker(ast.NodeVisitor): def __init__(self): self.imported_names = set() self.complex_attributes = set() self.class_attributes = set() + self.non_defaults = set() + self.non_literal_defaults = set() self.in_method = False def visit_FunctionDef(self, node): + if node.name == "__init__": + self._check_init_function_parameters(node) old_context = self.in_method self.in_method = True self.generic_visit(node) @@ -179,14 +192,39 @@ def visit_Assign(self, node): if isinstance(target, ast.Name): self.complex_attributes.add(target.id) + def _check_init_function_parameters(self, node): + # Check defaults in parameters + for arg, default in reversed(list(zip_longest(reversed(node.args.args), reversed(node.args.defaults)))): + if default is None: + if arg.arg != "self": + self.non_defaults.add(arg.arg) + elif not isinstance(default, (ast.Str, ast.Num, ast.Constant, ast.Dict, ast.List, ast.Set)): + self.non_literal_defaults.add(arg.arg) + class_level_checker = ClassLevelChecker() + source = get_source(cls) + tree = ast.parse(source) + class_node = tree.body[0] + if not isinstance(class_node, ast.ClassDef): + raise ValueError("Source code must define a class") class_level_checker.visit(class_node) + errors = [] if class_level_checker.complex_attributes: errors.append( f"Complex attributes should be defined in __init__, not as 
class attributes: " f"{', '.join(class_level_checker.complex_attributes)}" ) + if class_level_checker.non_defaults: + errors.append( + f"Parameters in __init__ must have default values, found required parameters: " + f"{', '.join(class_level_checker.non_defaults)}" + ) + if class_level_checker.non_literal_defaults: + errors.append( + f"Parameters in __init__ must have literal default values, found non-literal defaults: " + f"{', '.join(class_level_checker.non_literal_defaults)}" + ) # Run checks on all methods for node in class_node.body: @@ -196,5 +234,5 @@ def visit_Assign(self, node): errors += [f"- {node.name}: {error}" for error in method_checker.errors] if errors: - raise ValueError("Tool validation failed:\n" + "\n".join(errors)) + raise ValueError(f"Tool validation failed for {cls.__name__}:\n" + "\n".join(errors)) return diff --git a/src/smolagents/tools.py b/src/smolagents/tools.py index 10b22ea03..3f8b25a26 100644 --- a/src/smolagents/tools.py +++ b/src/smolagents/tools.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import ast -import importlib import inspect import json import logging @@ -23,8 +22,9 @@ import sys import tempfile import textwrap +import types from contextlib import contextmanager -from functools import lru_cache, wraps +from functools import wraps from pathlib import Path from typing import Callable, Dict, List, Optional, Union @@ -36,7 +36,6 @@ upload_folder, ) from huggingface_hub.utils import is_torch_available -from packaging import version from ._function_type_hints_utils import ( TypeHintParsingException, @@ -44,8 +43,8 @@ get_imports, get_json_schema, ) +from .agent_types import handle_agent_input_types, handle_agent_output_types from .tool_validation import MethodChecker, validate_tool_attributes -from .types import handle_agent_input_types, handle_agent_output_types from .utils import _is_package_available, _is_pillow_available, get_source, instance_to_source @@ -200,24 +199,9 @@ def setup(self): """ self.is_initialized = True - def save(self, output_dir): - """ - Saves the relevant code files for your tool so it can be pushed to the Hub. This will copy the code of your - tool in `output_dir` as well as autogenerate: - - - a `tool.py` file containing the logic for your tool. - - an `app.py` file providing an UI for your tool when it is exported to a Space with `tool.push_to_hub()` - - a `requirements.txt` containing the names of the module used by your tool (as detected when inspecting its - code) - - Args: - output_dir (`str`): The folder in which you want to save your tool. 
- """ - os.makedirs(output_dir, exist_ok=True) + def to_dict(self) -> dict: + """Returns a dictionary representing the tool""" class_name = self.__class__.__name__ - tool_file = os.path.join(output_dir, "tool.py") - - # Save tool file if type(self).__name__ == "SimpleTool": # Check that imports are self-contained source_code = get_source(self.forward).replace("@tool", "") @@ -233,11 +217,11 @@ def save(self, output_dir): tool_code = textwrap.dedent( f""" from smolagents import Tool - from typing import Optional + from typing import Any, Optional class {class_name}(Tool): name = "{self.name}" - description = "{self.description}" + description = {json.dumps(textwrap.dedent(self.description).strip())} inputs = {json.dumps(self.inputs, separators=(",", ":"))} output_type = "{self.output_type}" """ @@ -273,33 +257,59 @@ def replacement(match): validate_tool_attributes(self.__class__) - tool_code = instance_to_source(self, base_cls=Tool) + tool_code = "from typing import Any, Optional\n" + instance_to_source(self, base_cls=Tool) - with open(tool_file, "w", encoding="utf-8") as f: - f.write(tool_code.replace(":true,", ":True,").replace(":true}", ":True}")) + requirements = {el for el in get_imports(tool_code) if el not in sys.stdlib_module_names} | {"smolagents"} - # Save app file - app_file = os.path.join(output_dir, "app.py") - with open(app_file, "w", encoding="utf-8") as f: - f.write( - textwrap.dedent( - f""" - from smolagents import launch_gradio_demo - from typing import Optional - from tool import {class_name} + return {"name": self.name, "code": tool_code, "requirements": requirements} - tool = {class_name}() + def save(self, output_dir: str, tool_file_name: str = "tool", make_gradio_app: bool = True): + """ + Saves the relevant code files for your tool so it can be pushed to the Hub. 
This will copy the code of your + tool in `output_dir` as well as autogenerate: - launch_gradio_demo(tool) - """ - ).lstrip() - ) + - a `{tool_file_name}.py` file containing the logic for your tool. + If you pass `make_gradio_app=True`, this will also write: + - an `app.py` file providing a UI for your tool when it is exported to a Space with `tool.push_to_hub()` + - a `requirements.txt` containing the names of the modules used by your tool (as detected when inspecting its + code) - # Save requirements file - imports = {el for el in get_imports(tool_file) if el not in sys.stdlib_module_names} | {"smolagents"} - requirements_file = os.path.join(output_dir, "requirements.txt") - with open(requirements_file, "w", encoding="utf-8") as f: - f.write("\n".join(imports) + "\n") + Args: + output_dir (`str`): The folder in which you want to save your tool. + tool_file_name (`str`, *optional*): The file name in which you want to save your tool. + make_gradio_app (`bool`, *optional*, defaults to True): Whether to also export a `requirements.txt` file and Gradio UI. 
+ """ + os.makedirs(output_dir, exist_ok=True) + class_name = self.__class__.__name__ + tool_file = os.path.join(output_dir, f"{tool_file_name}.py") + + tool_dict = self.to_dict() + tool_code = tool_dict["code"] + + with open(tool_file, "w", encoding="utf-8") as f: + f.write(tool_code.replace(":true,", ":True,").replace(":true}", ":True}")) + + if make_gradio_app: + # Save app file + app_file = os.path.join(output_dir, "app.py") + with open(app_file, "w", encoding="utf-8") as f: + f.write( + textwrap.dedent( + f""" + from smolagents import launch_gradio_demo + from {tool_file_name} import {class_name} + + tool = {class_name}() + + launch_gradio_demo(tool) + """ + ).lstrip() + ) + + # Save requirements file + requirements_file = os.path.join(output_dir, "requirements.txt") + with open(requirements_file, "w", encoding="utf-8") as f: + f.write("\n".join(tool_dict["requirements"]) + "\n") def push_to_hub( self, @@ -312,14 +322,6 @@ def push_to_hub( """ Upload the tool to the Hub. - For this method to work properly, your tool must have been defined in a separate module (not `__main__`). - For instance: - ``` - from my_tool_module import MyTool - my_tool = MyTool() - my_tool.push_to_hub("my-username/my-space") - ``` - Parameters: repo_id (`str`): The name of the repository you want to push your tool to. It should contain your organization name when @@ -343,13 +345,11 @@ def push_to_hub( space_sdk="gradio", ) repo_id = repo_url.repo_id - metadata_update(repo_id, {"tags": ["tool"]}, repo_type="space") + metadata_update(repo_id, {"tags": ["smolagents", "tool"]}, repo_type="space", token=token) with tempfile.TemporaryDirectory() as work_dir: # Save all files. 
self.save(work_dir) - with open(work_dir + "/tool.py", "r") as f: - print("\n".join(f.readlines())) logger.info(f"Uploading the following files to {repo_id}: {','.join(os.listdir(work_dir))}") return upload_folder( repo_id=repo_id, @@ -395,7 +395,7 @@ def from_hub( """ if not trust_remote_code: raise ValueError( - "Loading a tool from Hub requires to trust remote code. Make sure you've inspected the repo and pass `trust_remote_code=True` to load the tool." + "Loading a tool from Hub requires to acknowledge you trust its code: to do so, pass `trust_remote_code=True`." ) # Get the tool's tool.py file. @@ -406,7 +406,6 @@ def from_hub( repo_type="space", cache_dir=kwargs.get("cache_dir"), force_download=kwargs.get("force_download"), - resume_download=kwargs.get("resume_download"), proxies=kwargs.get("proxies"), revision=kwargs.get("revision"), subfolder=kwargs.get("subfolder"), @@ -414,30 +413,26 @@ def from_hub( ) tool_code = Path(tool_file).read_text() + return Tool.from_code(tool_code, **kwargs) - # Find the Tool subclass in the namespace - with tempfile.TemporaryDirectory() as temp_dir: - # Save the code to a file - module_path = os.path.join(temp_dir, "tool.py") - with open(module_path, "w") as f: - f.write(tool_code) - - print("TOOL CODE:\n", tool_code) - - # Load module from file path - spec = importlib.util.spec_from_file_location("tool", module_path) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - - # Find and instantiate the Tool class - for item_name in dir(module): - item = getattr(module, item_name) - if isinstance(item, type) and issubclass(item, Tool) and item != Tool: - tool_class = item - break + @classmethod + def from_code(cls, tool_code: str, **kwargs): + module = types.ModuleType("dynamic_tool") + + exec(tool_code, module.__dict__) + + # Find the Tool subclass + tool_class = next( + ( + obj + for _, obj in inspect.getmembers(module, inspect.isclass) + if issubclass(obj, Tool) and obj is not Tool + ), + None, + 
) - if tool_class is None: - raise ValueError("No Tool subclass found in the code.") + if tool_class is None: + raise ValueError("No Tool subclass found in the code.") if not isinstance(tool_class.inputs, dict): tool_class.inputs = ast.literal_eval(tool_class.inputs) @@ -632,50 +627,13 @@ def forward(self, *args, **kwargs): return LangChainToolWrapper(langchain_tool) -DEFAULT_TOOL_DESCRIPTION_TEMPLATE = """ -- {{ tool.name }}: {{ tool.description }} - Takes inputs: {{tool.inputs}} - Returns an output of type: {{tool.output_type}} -""" - - -def get_tool_description_with_args(tool: Tool, description_template: Optional[str] = None) -> str: - if description_template is None: - description_template = DEFAULT_TOOL_DESCRIPTION_TEMPLATE - compiled_template = compile_jinja_template(description_template) - tool_description = compiled_template.render( - tool=tool, - ) - return tool_description - - -@lru_cache -def compile_jinja_template(template): - try: - import jinja2 - from jinja2.exceptions import TemplateError - from jinja2.sandbox import ImmutableSandboxedEnvironment - except ImportError: - raise ImportError("template requires jinja2 to be installed.") - - if version.parse(jinja2.__version__) < version.parse("3.1.0"): - raise ImportError(f"template requires jinja2>=3.1.0 to be installed. Your version is {jinja2.__version__}.") - - def raise_exception(message): - raise TemplateError(message) - - jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True) - jinja_env.globals["raise_exception"] = raise_exception - return jinja_env.from_string(template) - - def launch_gradio_demo(tool: Tool): """ Launches a gradio demo for a tool. The corresponding tool class needs to properly implement the class attributes `inputs` and `output_type`. Args: - tool (`type`): The tool for which to launch the demo. + tool (`Tool`): The tool for which to launch the demo. 
""" try: import gradio as gr @@ -709,14 +667,13 @@ def tool_forward(*args, **kwargs): inputs=gradio_inputs, outputs=gradio_output, title=tool.name, - article=tool.description, description=tool.description, api_name=tool.name, ).launch() def load_tool( - task_or_repo_id, + repo_id, model_repo_id: Optional[str] = None, token: Optional[str] = None, trust_remote_code: bool = False, @@ -734,16 +691,8 @@ def load_tool(
Args: - task_or_repo_id (`str`): - The task for which to load the tool or a repo ID of a tool on the Hub. Tasks implemented in Transformers - are: - - - `"document_question_answering"` - - `"image_question_answering"` - - `"speech_to_text"` - - `"text_to_speech"` - - `"translation"` - + repo_id (`str`): + Repo ID of a tool on the Hub. model_repo_id (`str`, *optional*): Use this argument to use a different model than the default one for the tool you selected. token (`str`, *optional*): @@ -757,7 +706,7 @@ def load_tool( will be passed along to its init. """ return Tool.from_hub( - task_or_repo_id, + repo_id, model_repo_id=model_repo_id, token=token, trust_remote_code=trust_remote_code, diff --git a/src/smolagents/utils.py b/src/smolagents/utils.py index 8aa631f1a..3f7219b61 100644 --- a/src/smolagents/utils.py +++ b/src/smolagents/utils.py @@ -20,15 +20,17 @@ import importlib.util import inspect import json +import os import re import textwrap import types -from enum import IntEnum from functools import lru_cache from io import BytesIO -from typing import Dict, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Tuple, Union -from rich.console import Console + +if TYPE_CHECKING: + from smolagents.memory import AgentLogger __all__ = ["AgentError"] @@ -48,8 +50,6 @@ def _is_pillow_available(): return importlib.util.find_spec("PIL") is not None -console = Console() - BASE_BUILTIN_MODULES = [ "collections", "datetime", @@ -65,29 +65,16 @@ def _is_pillow_available(): ] -class LogLevel(IntEnum): - ERROR = 0 # Only errors - INFO = 1 # Normal output (default) - DEBUG = 2 # Detailed output - - -class AgentLogger: - def __init__(self, level: LogLevel = LogLevel.INFO): - self.level = level - self.console = Console() - - def log(self, *args, level: LogLevel = LogLevel.INFO, **kwargs): - if level <= self.level: - self.console.print(*args, **kwargs) - - class AgentError(Exception): """Base class for other agent-related exceptions""" - def __init__(self, message, logger: 
AgentLogger): + def __init__(self, message, logger: "AgentLogger"): super().__init__(message) self.message = message - logger.log(f"[bold red]{message}[/bold red]", level=LogLevel.ERROR) + logger.log(f"[bold red]{message}[/bold red]", level="ERROR") + + def dict(self) -> Dict[str, str]: + return {"type": self.__class__.__name__, "message": str(self.message)} class AgentParsingError(AgentError): @@ -114,6 +101,32 @@ class AgentGenerationError(AgentError): pass +def make_json_serializable(obj: Any) -> Any: + """Recursive function to make objects JSON serializable""" + if obj is None: + return None + elif isinstance(obj, (str, int, float, bool)): + # Try to parse string as JSON if it looks like a JSON object/array + if isinstance(obj, str): + try: + if (obj.startswith("{") and obj.endswith("}")) or (obj.startswith("[") and obj.endswith("]")): + parsed = json.loads(obj) + return make_json_serializable(parsed) + except json.JSONDecodeError: + pass + return obj + elif isinstance(obj, (list, tuple)): + return [make_json_serializable(item) for item in obj] + elif isinstance(obj, dict): + return {str(k): make_json_serializable(v) for k, v in obj.items()} + elif hasattr(obj, "__dict__"): + # For custom objects, convert their __dict__ to a serializable format + return {"_type": obj.__class__.__name__, **{k: make_json_serializable(v) for k, v in obj.__dict__.items()}} + else: + # For any other type, convert to string + return str(obj) + + def parse_json_blob(json_blob: str) -> Dict[str, str]: try: first_accolade_index = json_blob.find("{") @@ -150,7 +163,10 @@ def parse_code_blobs(code_blob: str) -> str: if "final" in code_blob and "answer" in code_blob: raise ValueError( f""" -The code blob is invalid, because the regex pattern {pattern} was not found in {code_blob=}. It seems like you're trying to return the final answer, you can do it as follows: +Your code snippet is invalid, because the regex pattern {pattern} was not found in it. 
+Here is your code snippet: +{code_blob} +It seems like you're trying to return the final answer, you can do it as follows: Code: ```py final_answer("YOUR FINAL ANSWER HERE") @@ -158,7 +174,10 @@ def parse_code_blobs(code_blob: str) -> str: ) raise ValueError( f""" -The code blob is invalid, because the regex pattern {pattern} was not found in {code_blob=}. Make sure to include code with the correct pattern, for instance: +Your code snippet is invalid, because the regex pattern {pattern} was not found in it. +Here is your code snippet: +{code_blob} +Make sure to include code with the correct pattern, for instance: Thoughts: Your thoughts Code: ```py @@ -280,10 +299,12 @@ def instance_to_source(instance, base_cls=None): for name, value in class_attrs.items(): if isinstance(value, str): + # multiline value if "\n" in value: - class_lines.append(f' {name} = """{value}"""') + escaped_value = value.replace('"""', r"\"\"\"") # Escape triple quotes + class_lines.append(f' {name} = """{escaped_value}"""') else: - class_lines.append(f' {name} = "{value}"') + class_lines.append(f" {name} = {json.dumps(value)}") else: class_lines.append(f" {name} = {repr(value)}") @@ -396,3 +417,10 @@ def encode_image_base64(image): def make_image_url(base64_image): return f"data:image/png;base64,{base64_image}" + + +def make_init_file(folder: str): + os.makedirs(folder, exist_ok=True) + # Create __init__ + with open(os.path.join(folder, "__init__.py"), "w"): + pass diff --git a/examples/vlm_web_browser.py b/src/smolagents/vision_web_browser.py similarity index 54% rename from examples/vlm_web_browser.py rename to src/smolagents/vision_web_browser.py index 01d50a517..46a07f99d 100644 --- a/examples/vlm_web_browser.py +++ b/src/smolagents/vision_web_browser.py @@ -1,3 +1,4 @@ +import argparse from io import BytesIO from time import sleep @@ -5,73 +6,69 @@ from dotenv import load_dotenv from PIL import Image from selenium import webdriver -from selenium.common.exceptions import 
ElementNotInteractableException, TimeoutException from selenium.webdriver.common.by import By -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.common.keys import Keys -from smolagents import CodeAgent, LiteLLMModel, OpenAIServerModel, TransformersModel, tool # noqa: F401 +from smolagents import CodeAgent, DuckDuckGoSearchTool, tool from smolagents.agents import ActionStep +from smolagents.cli import load_model -load_dotenv() -import os - - -# Let's use Qwen-2VL-72B via an inference provider like Fireworks AI - -model = OpenAIServerModel( - api_key=os.getenv("FIREWORKS_API_KEY"), - api_base="https://api.fireworks.ai/inference/v1", - model_id="accounts/fireworks/models/qwen2-vl-72b-instruct", -) - -# You can also use a close model - -# model = LiteLLMModel( -# model_id="gpt-4o", -# api_key=os.getenv("OPENAI_API_KEY"), -# ) +github_request = """ +I'm trying to find how hard I have to work to get a repo in github.com/trending. +Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year? +""" # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet. -# locally a good candidate is Qwen2-VL-7B-Instruct -# model = TransformersModel( -# model_id="Qwen/Qwen2-VL-7B-Instruct", -# device_map = "auto", -# flatten_messages_as_text=False -# ) +search_request = """ +Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident. 
+""" -# Prepare callback -def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None: +def parse_arguments(): + parser = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.") + parser.add_argument( + "prompt", + type=str, + nargs="?", # Makes it optional + default=search_request, + help="The prompt to run with the agent", + ) + parser.add_argument( + "--model-type", + type=str, + default="LiteLLMModel", + help="The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, HfApiModel)", + ) + parser.add_argument( + "--model-id", + type=str, + default="gpt-4o", + help="The model ID to use for the specified model type", + ) + return parser.parse_args() + + +def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None: sleep(1.0) # Let JavaScript animations happen before taking the screenshot driver = helium.get_driver() - current_step = step_log.step_number + current_step = memory_step.step_number if driver is not None: - for step_logs in agent.logs: # Remove previous screenshots from logs for lean processing - if isinstance(step_log, ActionStep) and step_log.step_number <= current_step - 2: - step_logs.observations_images = None + for previous_memory_step in agent.memory.steps: # Remove previous screenshots from logs for lean processing + if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2: + previous_memory_step.observations_images = None png_bytes = driver.get_screenshot_as_png() image = Image.open(BytesIO(png_bytes)) print(f"Captured a browser screenshot: {image.size} pixels") - step_log.observations_images = [image.copy()] # Create a copy to ensure it persists, important! + memory_step.observations_images = [image.copy()] # Create a copy to ensure it persists, important! 
# Update observations with current URL url_info = f"Current url: {driver.current_url}" - step_log.observations = url_info if step_logs.observations is None else step_log.observations + "\n" + url_info + memory_step.observations = ( + url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info + ) return -# Initialize driver and agent -chrome_options = webdriver.ChromeOptions() -chrome_options.add_argument("--force-device-scale-factor=1") -chrome_options.add_argument("--window-size=1000,1300") -chrome_options.add_argument("--disable-pdf-viewer") - -driver = helium.start_chrome(headless=False, options=chrome_options) - -# Initialize tools - - @tool def search_item_ctrl_f(text: str, nth_result: int = 1) -> str: """ @@ -101,59 +98,39 @@ def close_popups() -> str: """ Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners. """ - # Common selectors for modal close buttons and overlay elements - modal_selectors = [ - "button[class*='close']", - "[class*='modal']", - "[class*='modal'] button", - "[class*='CloseButton']", - "[aria-label*='close']", - ".modal-close", - ".close-modal", - ".modal .close", - ".modal-backdrop", - ".modal-overlay", - "[class*='overlay']", - ] - - wait = WebDriverWait(driver, timeout=0.5) - - for selector in modal_selectors: - try: - elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))) - - for element in elements: - if element.is_displayed(): - try: - # Try clicking with JavaScript as it's more reliable - driver.execute_script("arguments[0].click();", element) - except ElementNotInteractableException: - # If JavaScript click fails, try regular click - element.click() - - except TimeoutException: - continue - except Exception as e: - print(f"Error handling selector {selector}: {str(e)}") - continue - return "Modals closed" - - -agent = CodeAgent( - tools=[go_back, close_popups, search_item_ctrl_f], - 
model=model, - additional_authorized_imports=["helium"], - step_callbacks=[save_screenshot], - max_steps=20, - verbosity_level=2, -) + webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform() + + +def initialize_driver(): + """Initialize the Selenium WebDriver.""" + chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument("--force-device-scale-factor=1") + chrome_options.add_argument("--window-size=1000,1350") + chrome_options.add_argument("--disable-pdf-viewer") + chrome_options.add_argument("--window-position=0,0") + return helium.start_chrome(headless=False, options=chrome_options) + + +def initialize_agent(model): + """Initialize the CodeAgent with the specified model.""" + return CodeAgent( + tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f], + model=model, + additional_authorized_imports=["helium"], + step_callbacks=[save_screenshot], + max_steps=20, + verbosity_level=2, + ) + helium_instructions = """ -You can use helium to access websites. Don't bother about the helium driver, it's already managed. -First you need to import everything from helium, then you can do other actions! +Use your web_search tool when you want to get Google search results. +Then you can use helium to access websites. Don't use helium for Google search, only for navigating websites! +Don't bother about the helium driver, it's already managed. +We've already ran "from helium import *" +Then you can go to pages! Code: ```py -from helium import * go_to('github.com/trending') ``` @@ -206,17 +183,28 @@ def close_popups() -> str: After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url. But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states. Don't kill the browser. +When you have modals or cookie banners on screen, you should get rid of them before you can click anything else. """ -# Run the agent! 
-github_request = """ -I'm trying to find how hard I have to work to get a repo in github.com/trending. -Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year? -""" # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet. +def main(): + # Load environment variables + load_dotenv() + + # Parse command line arguments + args = parse_arguments() + + # Initialize the model based on the provided arguments + model = load_model(args.model_type, args.model_id) + + global driver + driver = initialize_driver() + agent = initialize_agent(model) + + # Run the agent with the provided prompt + agent.python_executor("from helium import *", agent.state) + agent.run(args.prompt + helium_instructions) -search_request = """ -Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident. -""" -agent.run(search_request + helium_instructions) +if __name__ == "__main__": + main() diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..a3896e2db --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,20 @@ +from unittest.mock import patch + +import pytest + +from smolagents.agents import MultiStepAgent +from smolagents.monitoring import LogLevel + + +original_multi_step_agent_init = MultiStepAgent.__init__ + + +@pytest.fixture(autouse=True) +def patch_multi_step_agent_with_suppressed_logging(): + with patch.object(MultiStepAgent, "__init__", autospec=True) as mock_init: + + def init_with_suppressed_logging(self, *args, verbosity_level=LogLevel.OFF, **kwargs): + original_multi_step_agent_init(self, *args, verbosity_level=verbosity_level, **kwargs) + + mock_init.side_effect = init_with_suppressed_logging + yield diff --git a/tests/test_agents.py b/tests/test_agents.py index 1dcb5e933..376cc0869 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -17,24 
+17,31 @@ import unittest import uuid from pathlib import Path +from unittest.mock import MagicMock +import pytest from transformers.testing_utils import get_tests_dir +from smolagents.agent_types import AgentImage, AgentText from smolagents.agents import ( AgentMaxStepsError, CodeAgent, - ManagedAgent, + MultiStepAgent, ToolCall, ToolCallingAgent, + populate_template, ) -from smolagents.default_tools import PythonInterpreterTool +from smolagents.default_tools import DuckDuckGoSearchTool, PythonInterpreterTool, VisitWebpageTool +from smolagents.memory import PlanningStep from smolagents.models import ( ChatMessage, ChatMessageToolCall, ChatMessageToolCallDefinition, + HfApiModel, + MessageRole, + TransformersModel, ) from smolagents.tools import tool -from smolagents.types import AgentImage, AgentText from smolagents.utils import BASE_BUILTIN_MODULES @@ -173,6 +180,7 @@ def fake_code_model_error(messages, stop_sequences=None) -> str: Thought: I should multiply 2 by 3.6452. special_marker Code: ```py +print("Flag!") def error_function(): raise ValueError("error") @@ -296,20 +304,14 @@ def fake_code_model_no_return(messages, stop_sequences=None, grammar=None) -> st class AgentTests(unittest.TestCase): - def test_fake_single_step_code_agent(self): - agent = CodeAgent(tools=[PythonInterpreterTool()], model=fake_code_model_single_step) - output = agent.run("What is 2 multiplied by 3.6452?", single_step=True) - assert isinstance(output, str) - assert "7.2904" in output - def test_fake_toolcalling_agent(self): agent = ToolCallingAgent(tools=[PythonInterpreterTool()], model=FakeToolCallModel()) output = agent.run("What is 2 multiplied by 3.6452?") assert isinstance(output, str) assert "7.2904" in output - assert agent.logs[1].task == "What is 2 multiplied by 3.6452?" - assert "7.2904" in agent.logs[2].observations - assert agent.logs[3].llm_output is None + assert agent.memory.steps[0].task == "What is 2 multiplied by 3.6452?" 
+ assert "7.2904" in agent.memory.steps[1].observations + assert agent.memory.steps[2].model_output is None def test_toolcalling_agent_handles_image_tool_outputs(self): from PIL import Image @@ -352,9 +354,9 @@ def test_fake_code_agent(self): output = agent.run("What is 2 multiplied by 3.6452?") assert isinstance(output, float) assert output == 7.2904 - assert agent.logs[1].task == "What is 2 multiplied by 3.6452?" - assert agent.logs[3].tool_calls == [ - ToolCall(name="python_interpreter", arguments="final_answer(7.2904)", id="call_3") + assert agent.memory.steps[0].task == "What is 2 multiplied by 3.6452?" + assert agent.memory.steps[2].tool_calls == [ + ToolCall(name="python_interpreter", arguments="final_answer(7.2904)", id="call_2") ] def test_additional_args_added_to_task(self): @@ -370,30 +372,35 @@ def test_reset_conversations(self): agent = CodeAgent(tools=[PythonInterpreterTool()], model=fake_code_model) output = agent.run("What is 2 multiplied by 3.6452?", reset=True) assert output == 7.2904 - assert len(agent.logs) == 4 + assert len(agent.memory.steps) == 3 output = agent.run("What is 2 multiplied by 3.6452?", reset=False) assert output == 7.2904 - assert len(agent.logs) == 6 + assert len(agent.memory.steps) == 5 output = agent.run("What is 2 multiplied by 3.6452?", reset=True) assert output == 7.2904 - assert len(agent.logs) == 4 + assert len(agent.memory.steps) == 3 def test_code_agent_code_errors_show_offending_line_and_error(self): agent = CodeAgent(tools=[PythonInterpreterTool()], model=fake_code_model_error) output = agent.run("What is 2 multiplied by 3.6452?") assert isinstance(output, AgentText) assert output == "got an error" - assert "Code execution failed at line 'error_function()'" in str(agent.logs[2].error) - assert "ValueError" in str(agent.logs) + assert "Code execution failed at line 'error_function()'" in str(agent.memory.steps[1].error) + assert "ValueError" in str(agent.memory.steps) + + def 
test_code_agent_code_error_saves_previous_print_outputs(self): + agent = CodeAgent(tools=[PythonInterpreterTool()], model=fake_code_model_error) + agent.run("What is 2 multiplied by 3.6452?") + assert "Flag!" in str(agent.memory.steps[1].observations) def test_code_agent_syntax_error_show_offending_lines(self): agent = CodeAgent(tools=[PythonInterpreterTool()], model=fake_code_model_syntax_error) output = agent.run("What is 2 multiplied by 3.6452?") assert isinstance(output, AgentText) assert output == "got an error" - assert ' print("Failing due to unexpected indent")' in str(agent.logs) + assert ' print("Failing due to unexpected indent")' in str(agent.memory.steps) def test_setup_agent_with_empty_toolbox(self): ToolCallingAgent(model=FakeToolCallModel(), tools=[]) @@ -405,8 +412,8 @@ def test_fails_max_steps(self): max_steps=5, ) answer = agent.run("What is 2 multiplied by 3.6452?") - assert len(agent.logs) == 8 - assert type(agent.logs[-1].error) is AgentMaxStepsError + assert len(agent.memory.steps) == 7 # Task step + 5 action steps + Final answer + assert type(agent.memory.steps[-1].error) is AgentMaxStepsError assert isinstance(answer, str) def test_tool_descriptions_get_baked_in_system_prompt(self): @@ -430,10 +437,15 @@ def test_init_agent_with_different_toolsets(self): assert len(agent.tools) == 1 # when no tools are provided, only the final_answer tool is added by default toolset_2 = [PythonInterpreterTool(), PythonInterpreterTool()] - agent = CodeAgent(tools=toolset_2, model=fake_code_model) - assert ( - len(agent.tools) == 2 - ) # deduplication of tools, so only one python_interpreter tool is added in addition to final_answer + with pytest.raises(ValueError) as e: + agent = CodeAgent(tools=toolset_2, model=fake_code_model) + assert "Each tool or managed_agent should have a unique name!" 
in str(e) + + with pytest.raises(ValueError) as e: + agent.name = "python_interpreter" + agent.description = "empty" + CodeAgent(tools=[PythonInterpreterTool()], model=fake_code_model, managed_agents=[agent]) + assert "Each tool or managed_agent should have a unique name!" in str(e) # check that python_interpreter base tool does not get added to CodeAgent agent = CodeAgent(tools=[], model=fake_code_model, add_base_tools=True) @@ -454,34 +466,386 @@ def test_function_persistence_across_steps(self): assert res[0] == 0.5 def test_init_managed_agent(self): - agent = CodeAgent(tools=[], model=fake_code_functiondef) - managed_agent = ManagedAgent(agent, name="managed_agent", description="Empty") - assert managed_agent.name == "managed_agent" - assert managed_agent.description == "Empty" + agent = CodeAgent(tools=[], model=fake_code_functiondef, name="managed_agent", description="Empty") + assert agent.name == "managed_agent" + assert agent.description == "Empty" def test_agent_description_gets_correctly_inserted_in_system_prompt(self): - agent = CodeAgent(tools=[], model=fake_code_functiondef) - managed_agent = ManagedAgent(agent, name="managed_agent", description="Empty") + managed_agent = CodeAgent(tools=[], model=fake_code_functiondef, name="managed_agent", description="Empty") manager_agent = CodeAgent( tools=[], model=fake_code_functiondef, managed_agents=[managed_agent], ) - assert "You can also give requests to team members." not in agent.system_prompt - print("ok1") - assert "{{managed_agents_descriptions}}" not in agent.system_prompt - assert "You can also give requests to team members." in manager_agent.system_prompt + assert "You can also give tasks to team members." not in managed_agent.system_prompt + assert "{{managed_agents_descriptions}}" not in managed_agent.system_prompt + assert "You can also give tasks to team members." 
in manager_agent.system_prompt def test_code_agent_missing_import_triggers_advice_in_error_log(self): - agent = CodeAgent(tools=[], model=fake_code_model_import) + # Set explicit verbosity level to 1 to override the default verbosity level of -1 set in CI fixture + agent = CodeAgent(tools=[], model=fake_code_model_import, verbosity_level=1) with agent.logger.console.capture() as capture: agent.run("Count to 3") str_output = capture.get() - assert "Consider passing said import under" in str_output.replace("\n", "") + assert "`additional_authorized_imports`" in str_output.replace("\n", "") + + def test_replay_shows_logs(self): + agent = CodeAgent( + tools=[], model=fake_code_model_import, verbosity_level=0, additional_authorized_imports=["numpy"] + ) + agent.run("Count to 3") + + with agent.logger.console.capture() as capture: + agent.replay() + str_output = capture.get().replace("\n", "") + assert "New run" in str_output + assert "Agent output:" in str_output + assert 'final_answer("got' in str_output + assert "```" in str_output + + def test_code_nontrivial_final_answer_works(self): + def fake_code_model_final_answer(messages, stop_sequences=None, grammar=None): + return ChatMessage( + role="assistant", + content="""Code: +```py +def nested_answer(): + final_answer("Correct!") + +nested_answer() +```""", + ) + + agent = CodeAgent(tools=[], model=fake_code_model_final_answer) + + output = agent.run("Count to 3") + assert output == "Correct!" + + def test_transformers_toolcalling_agent(self): + @tool + def weather_api(location: str, celsius: bool = False) -> str: + """ + Gets the weather in the next days at given location. + Secretly this tool does not care about the location, it hates the weather everywhere. 
+ + Args: + location: the location + celsius: the temperature type + """ + return "The weather is UNGODLY with torrential rains and temperatures below -10°C" + + model = TransformersModel( + model_id="HuggingFaceTB/SmolLM2-360M-Instruct", + max_new_tokens=100, + device_map="auto", + do_sample=False, + ) + agent = ToolCallingAgent(model=model, tools=[weather_api], max_steps=1) + agent.run("What's the weather in Paris?") + assert agent.memory.steps[0].task == "What's the weather in Paris?" + assert agent.memory.steps[1].tool_calls[0].name == "weather_api" + step_memory_dict = agent.memory.get_succinct_steps()[1] + assert step_memory_dict["model_output_message"].tool_calls[0].function.name == "weather_api" + assert step_memory_dict["model_output_message"].raw["completion_kwargs"]["max_new_tokens"] == 100 + assert "model_input_messages" in agent.memory.get_full_steps()[1] + + def test_final_answer_checks(self): + def check_always_fails(final_answer, agent_memory): + assert False, "Error raised in check" + + agent = CodeAgent(model=fake_code_model, tools=[], final_answer_checks=[check_always_fails]) + agent.run("Dummy task.") + assert "Error raised in check" in str(agent.write_memory_to_messages()) + + +class TestMultiStepAgent: + def test_instantiation_disables_logging_to_terminal(self): + fake_model = MagicMock() + agent = MultiStepAgent(tools=[], model=fake_model) + assert agent.logger.level == -1, "logging to terminal should be disabled for testing using a fixture" + + def test_instantiation_with_prompt_templates(self, prompt_templates): + agent = MultiStepAgent(tools=[], model=MagicMock(), prompt_templates=prompt_templates) + assert agent.prompt_templates == prompt_templates + assert agent.prompt_templates["system_prompt"] == "This is a test system prompt." 
+ assert "managed_agent" in agent.prompt_templates + assert agent.prompt_templates["managed_agent"]["task"] == "Task for {{name}}: {{task}}" + assert agent.prompt_templates["managed_agent"]["report"] == "Report for {{name}}: {{final_answer}}" + + def test_step_number(self): + fake_model = MagicMock() + fake_model.last_input_token_count = 10 + fake_model.last_output_token_count = 20 + max_steps = 2 + agent = MultiStepAgent(tools=[], model=fake_model, max_steps=max_steps) + assert hasattr(agent, "step_number"), "step_number attribute should be defined" + assert agent.step_number == 0, "step_number should be initialized to 0" + agent.run("Test task") + assert hasattr(agent, "step_number"), "step_number attribute should be defined" + assert agent.step_number == max_steps + 1, "step_number should be max_steps + 1 after run method is called" + + @pytest.mark.parametrize( + "step, expected_messages_list", + [ + ( + 1, + [ + [{"role": MessageRole.USER, "content": [{"type": "text", "text": "INITIAL_FACTS_USER_PROMPT"}]}], + [{"role": MessageRole.USER, "content": [{"type": "text", "text": "INITIAL_PLAN_USER_PROMPT"}]}], + ], + ), + ( + 2, + [ + [ + { + "role": MessageRole.SYSTEM, + "content": [{"type": "text", "text": "UPDATE_FACTS_SYSTEM_PROMPT"}], + }, + {"role": MessageRole.USER, "content": [{"type": "text", "text": "UPDATE_FACTS_USER_PROMPT"}]}, + ], + [ + { + "role": MessageRole.SYSTEM, + "content": [{"type": "text", "text": "UPDATE_PLAN_SYSTEM_PROMPT"}], + }, + {"role": MessageRole.USER, "content": [{"type": "text", "text": "UPDATE_PLAN_USER_PROMPT"}]}, + ], + ], + ), + ], + ) + def test_planning_step(self, step, expected_messages_list): + fake_model = MagicMock() + agent = CodeAgent( + tools=[], + model=fake_model, + ) + task = "Test task" + agent.planning_step(task, is_first_step=(step == 1), step=step) + expected_message_texts = { + "INITIAL_FACTS_USER_PROMPT": populate_template( + agent.prompt_templates["planning"]["initial_facts"], variables=dict(task=task) + ), + 
"INITIAL_PLAN_USER_PROMPT": populate_template( + agent.prompt_templates["planning"]["initial_plan"], + variables=dict( + task=task, + tools=agent.tools, + managed_agents=agent.managed_agents, + answer_facts=agent.memory.steps[0].model_output_message_facts.content, + ), + ), + "UPDATE_FACTS_SYSTEM_PROMPT": agent.prompt_templates["planning"]["update_facts_pre_messages"], + "UPDATE_FACTS_USER_PROMPT": agent.prompt_templates["planning"]["update_facts_post_messages"], + "UPDATE_PLAN_SYSTEM_PROMPT": populate_template( + agent.prompt_templates["planning"]["update_plan_pre_messages"], variables=dict(task=task) + ), + "UPDATE_PLAN_USER_PROMPT": populate_template( + agent.prompt_templates["planning"]["update_plan_post_messages"], + variables=dict( + task=task, + tools=agent.tools, + managed_agents=agent.managed_agents, + facts_update=agent.memory.steps[0].model_output_message_facts.content, + remaining_steps=agent.max_steps - step, + ), + ), + } + for expected_messages in expected_messages_list: + for expected_message in expected_messages: + for expected_content in expected_message["content"]: + expected_content["text"] = expected_message_texts[expected_content["text"]] + assert len(agent.memory.steps) == 1 + planning_step = agent.memory.steps[0] + assert isinstance(planning_step, PlanningStep) + expected_model_input_messages = expected_messages_list[0] + model_input_messages = planning_step.model_input_messages + assert isinstance(model_input_messages, list) + assert len(model_input_messages) == len(expected_model_input_messages) # 2 + for message, expected_message in zip(model_input_messages, expected_model_input_messages): + assert isinstance(message, dict) + assert "role" in message + assert "content" in message + assert message["role"] in MessageRole.__members__.values() + assert message["role"] == expected_message["role"] + assert isinstance(message["content"], list) + assert len(message["content"]) == 1 + for content, expected_content in zip(message["content"], 
expected_message["content"]): + assert content == expected_content + # Test calls to model + assert len(fake_model.call_args_list) == 2 + for call_args, expected_messages in zip(fake_model.call_args_list, expected_messages_list): + assert len(call_args.args) == 1 + messages = call_args.args[0] + assert isinstance(messages, list) + assert len(messages) == len(expected_messages) + for message, expected_message in zip(messages, expected_messages): + assert isinstance(message, dict) + assert "role" in message + assert "content" in message + assert message["role"] in MessageRole.__members__.values() + assert message["role"] == expected_message["role"] + assert isinstance(message["content"], list) + assert len(message["content"]) == 1 + for content, expected_content in zip(message["content"], expected_message["content"]): + assert content == expected_content + + @pytest.mark.parametrize( + "images, expected_messages_list", + [ + ( + None, + [ + [ + { + "role": MessageRole.SYSTEM, + "content": [{"type": "text", "text": "FINAL_ANSWER_SYSTEM_PROMPT"}], + }, + {"role": MessageRole.USER, "content": [{"type": "text", "text": "FINAL_ANSWER_USER_PROMPT"}]}, + ] + ], + ), + ( + ["image1.png"], + [ + [ + { + "role": MessageRole.SYSTEM, + "content": [{"type": "text", "text": "FINAL_ANSWER_SYSTEM_PROMPT"}, {"type": "image"}], + }, + {"role": MessageRole.USER, "content": [{"type": "text", "text": "FINAL_ANSWER_USER_PROMPT"}]}, + ] + ], + ), + ], + ) + def test_provide_final_answer(self, images, expected_messages_list): + fake_model = MagicMock() + fake_model.return_value.content = "Final answer." 
+ agent = CodeAgent( + tools=[], + model=fake_model, + ) + task = "Test task" + final_answer = agent.provide_final_answer(task, images=images) + expected_message_texts = { + "FINAL_ANSWER_SYSTEM_PROMPT": agent.prompt_templates["final_answer"]["pre_messages"], + "FINAL_ANSWER_USER_PROMPT": populate_template( + agent.prompt_templates["final_answer"]["post_messages"], variables=dict(task=task) + ), + } + for expected_messages in expected_messages_list: + for expected_message in expected_messages: + for expected_content in expected_message["content"]: + if "text" in expected_content: + expected_content["text"] = expected_message_texts[expected_content["text"]] + assert final_answer == "Final answer." + # Test calls to model + assert len(fake_model.call_args_list) == 1 + for call_args, expected_messages in zip(fake_model.call_args_list, expected_messages_list): + assert len(call_args.args) == 1 + messages = call_args.args[0] + assert isinstance(messages, list) + assert len(messages) == len(expected_messages) + for message, expected_message in zip(messages, expected_messages): + assert isinstance(message, dict) + assert "role" in message + assert "content" in message + assert message["role"] in MessageRole.__members__.values() + assert message["role"] == expected_message["role"] + assert isinstance(message["content"], list) + assert len(message["content"]) == len(expected_message["content"]) + for content, expected_content in zip(message["content"], expected_message["content"]): + assert content == expected_content + + +class TestCodeAgent: + @pytest.mark.parametrize("provide_run_summary", [False, True]) + def test_call_with_provide_run_summary(self, provide_run_summary): + agent = CodeAgent(tools=[], model=MagicMock(), provide_run_summary=provide_run_summary) + assert agent.provide_run_summary is provide_run_summary + agent.managed_agent_prompt = "Task: {task}" + agent.name = "test_agent" + agent.run = MagicMock(return_value="Test output") + 
agent.write_memory_to_messages = MagicMock(return_value=[{"content": "Test summary"}]) + + result = agent("Test request") + expected_summary = "Here is the final answer from your managed agent 'test_agent':\nTest output" + if provide_run_summary: + expected_summary += ( + "\n\nFor more detail, find below a summary of this agent's work:\n" + "\n\nTest summary\n---\n" + ) + assert result == expected_summary + + +class MultiAgentsTests(unittest.TestCase): + def test_multiagents_save(self): + model = HfApiModel("Qwen/Qwen2.5-Coder-32B-Instruct", max_tokens=2096, temperature=0.5) + + web_agent = ToolCallingAgent( + model=model, + tools=[DuckDuckGoSearchTool(max_results=2), VisitWebpageTool()], + name="web_agent", + description="does web searches", + ) + code_agent = CodeAgent(model=model, tools=[], name="useless", description="does nothing in particular") + + agent = CodeAgent( + model=model, + tools=[], + additional_authorized_imports=["pandas", "datetime"], + managed_agents=[web_agent, code_agent], + max_print_outputs_length=1000, + ) + agent.save("agent_export") + + expected_structure = { + "managed_agents": { + "useless": {"tools": {"files": ["final_answer.py"]}, "files": ["agent.json", "prompts.yaml"]}, + "web_agent": { + "tools": {"files": ["final_answer.py", "visit_webpage.py", "web_search.py"]}, + "files": ["agent.json", "prompts.yaml"], + }, + }, + "tools": {"files": ["final_answer.py"]}, + "files": ["app.py", "requirements.txt", "agent.json", "prompts.yaml"], + } + + def verify_structure(current_path: Path, structure: dict): + for dir_name, contents in structure.items(): + if dir_name != "files": + # For directories, verify they exist and recurse into them + dir_path = current_path / dir_name + assert dir_path.exists(), f"Directory {dir_path} does not exist" + assert dir_path.is_dir(), f"{dir_path} is not a directory" + verify_structure(dir_path, contents) + else: + # For files, verify each exists in the current path + for file_name in contents: + file_path = 
current_path / file_name + assert file_path.exists(), f"File {file_path} does not exist" + assert file_path.is_file(), f"{file_path} is not a file" + + verify_structure(Path("agent_export"), expected_structure) + + # Test that re-loaded agents work as expected. + agent2 = CodeAgent.from_folder("agent_export", planning_interval=5) + assert agent2.planning_interval == 5 # Check that kwargs are used + assert set(agent2.authorized_imports) == set(["pandas", "datetime"] + BASE_BUILTIN_MODULES) + assert agent2.max_print_outputs_length == 1000 + assert agent2.use_e2b_executor is False + assert ( + agent2.managed_agents["web_agent"].tools["web_search"].max_results == 10 + ) # For now tool init parameters are forgotten + assert agent2.model.kwargs["temperature"] == pytest.approx(0.5) def test_multiagents(self): class FakeModelMultiagentsManagerAgent: + model_id = "fake_model" + def __call__( self, messages, @@ -548,6 +912,8 @@ def __call__( manager_model = FakeModelMultiagentsManagerAgent() class FakeModelMultiagentsManagedAgent: + model_id = "fake_model" + def __call__( self, messages, @@ -576,10 +942,6 @@ def __call__( tools=[], model=managed_model, max_steps=10, - ) - - managed_web_agent = ManagedAgent( - agent=web_agent, name="search_agent", description="Runs web searches for you. Give it your request as an argument. Make the request as detailed as needed, you can ask for thorough reports", ) @@ -587,7 +949,7 @@ def __call__( manager_code_agent = CodeAgent( tools=[], model=manager_model, - managed_agents=[managed_web_agent], + managed_agents=[web_agent], additional_authorized_imports=["time", "numpy", "pandas"], ) @@ -597,26 +959,19 @@ def __call__( manager_toolcalling_agent = ToolCallingAgent( tools=[], model=manager_model, - managed_agents=[managed_web_agent], + managed_agents=[web_agent], ) report = manager_toolcalling_agent.run("Fake question.") assert report == "Final report." 
- def test_code_nontrivial_final_answer_works(self): - def fake_code_model_final_answer(messages, stop_sequences=None, grammar=None): - return ChatMessage( - role="assistant", - content="""Code: -```py -def nested_answer(): - final_answer("Correct!") - -nested_answer() -```""", - ) + # Test that visualization works + manager_code_agent.visualize() - agent = CodeAgent(tools=[], model=fake_code_model_final_answer) - output = agent.run("Count to 3") - assert output == "Correct!" +@pytest.fixture +def prompt_templates(): + return { + "system_prompt": "This is a test system prompt.", + "managed_agent": {"task": "Task for {{name}}: {{task}}", "report": "Report for {{name}}: {{final_answer}}"}, + } diff --git a/tests/test_all_docs.py b/tests/test_all_docs.py index 68a88d369..0786e9138 100644 --- a/tests/test_all_docs.py +++ b/tests/test_all_docs.py @@ -26,6 +26,8 @@ import pytest from dotenv import load_dotenv +from .utils.markers import require_run_all + class SubprocessCallException(Exception): pass @@ -78,6 +80,7 @@ def create_test_script(code_blocks: List[str], tmp_dir: str) -> Path: return tmp_file +@require_run_all class TestDocs: """Test case for documentation code testing.""" @@ -93,7 +96,7 @@ def setup_class(cls): load_dotenv() - cls.md_files = list(cls.docs_dir.rglob("*.md")) + cls.md_files = list(cls.docs_dir.rglob("*.mdx")) if not cls.md_files: raise ValueError(f"No markdown files found in {cls.docs_dir}") diff --git a/tests/test_default_tools.py b/tests/test_default_tools.py index 91c40c6a5..5ff436ef3 100644 --- a/tests/test_default_tools.py +++ b/tests/test_default_tools.py @@ -16,8 +16,8 @@ import pytest -from smolagents.default_tools import PythonInterpreterTool, VisitWebpageTool -from smolagents.types import _AGENT_TYPE_MAPPING +from smolagents.agent_types import _AGENT_TYPE_MAPPING +from smolagents.default_tools import DuckDuckGoSearchTool, PythonInterpreterTool, SpeechToTextTool, VisitWebpageTool from .test_tools import ToolTesterMixin @@ -29,6 +29,10 
@@ def test_visit_webpage(self): assert isinstance(result, str) assert "* [About Wikipedia](/wiki/Wikipedia:About)" in result # Proper wikipedia pages have an About + def test_ddgs_with_kwargs(self): + result = DuckDuckGoSearchTool(timeout=20)("DeepSeek parent company") + assert isinstance(result, str) + class PythonInterpreterToolTester(unittest.TestCase, ToolTesterMixin): def setUp(self): @@ -73,3 +77,13 @@ def test_unauthorized_imports_fail(self): with pytest.raises(Exception) as e: self.tool("import sympy as sp") assert "sympy" in str(e).lower() + + +class TestSpeechToTextTool: + def test_new_instance(self): + from transformers.models.whisper import WhisperForConditionalGeneration, WhisperProcessor + + tool = SpeechToTextTool() + assert tool is not None + assert tool.pre_processor_class == WhisperProcessor + assert tool.model_class == WhisperForConditionalGeneration diff --git a/tests/test_e2b_executor.py b/tests/test_e2b_executor.py new file mode 100644 index 000000000..5994a44be --- /dev/null +++ b/tests/test_e2b_executor.py @@ -0,0 +1,18 @@ +from unittest.mock import MagicMock, patch + +from smolagents.e2b_executor import E2BExecutor + + +class TestE2BExecutor: + def test_e2b_executor_instantiation(self): + logger = MagicMock() + with patch("e2b_code_interpreter.Sandbox") as mock_sandbox: + mock_sandbox.return_value.commands.run.return_value.error = None + mock_sandbox.return_value.run_code.return_value.error = None + executor = E2BExecutor(additional_imports=[], tools=[], logger=logger) + assert isinstance(executor, E2BExecutor) + assert executor.logger == logger + assert executor.final_answer is False + assert executor.custom_tools == {} + assert executor.final_answer_pattern.pattern == r"final_answer\((.*?)\)" + assert executor.sbx == mock_sandbox.return_value diff --git a/tests/test_final_answer.py b/tests/test_final_answer.py index 7bb1e5efe..fcfb02a3f 100644 --- a/tests/test_final_answer.py +++ b/tests/test_final_answer.py @@ -21,8 +21,8 @@ from 
transformers import is_torch_available from transformers.testing_utils import get_tests_dir, require_torch +from smolagents.agent_types import _AGENT_TYPE_MAPPING from smolagents.default_tools import FinalAnswerTool -from smolagents.types import _AGENT_TYPE_MAPPING from .test_tools import ToolTesterMixin diff --git a/tests/test_function_type_hints_utils.py b/tests/test_function_type_hints_utils.py index 9e5898516..3379237c6 100644 --- a/tests/test_function_type_hints_utils.py +++ b/tests/test_function_type_hints_utils.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import unittest -from typing import Optional, Tuple +from typing import List, Optional, Tuple -from smolagents._function_type_hints_utils import get_json_schema +import pytest +from smolagents._function_type_hints_utils import get_imports, get_json_schema -class AgentTextTests(unittest.TestCase): - def test_return_none(self): + +class TestJsonSchema(unittest.TestCase): + def test_get_json_schema(self): def fn(x: int, y: Optional[Tuple[str, str, float]] = None) -> None: """ Test function @@ -52,3 +54,65 @@ def fn(x: int, y: Optional[Tuple[str, str, float]] = None) -> None: schema["function"]["parameters"]["properties"]["y"], expected_schema["parameters"]["properties"]["y"] ) self.assertEqual(schema["function"], expected_schema) + + +class TestGetCode: + @pytest.mark.parametrize( + "code, expected", + [ + ( + """ + import numpy + import pandas + """, + ["numpy", "pandas"], + ), + # From imports + ( + """ + from torch import nn + from transformers import AutoModel + """, + ["torch", "transformers"], + ), + # Mixed case with nested imports + ( + """ + import numpy as np + from torch.nn import Linear + import os.path + """, + ["numpy", "torch", "os"], + ), + # Try/except block (should be filtered) + ( + """ + try: + import torch + except ImportError: + pass + import numpy + """, + ["numpy"], + ), + # Flash attention block (should be 
filtered) + ( + """ + if is_flash_attn_2_available(): + from flash_attn import flash_attn_func + import transformers + """, + ["transformers"], + ), + # Relative imports (should be excluded) + ( + """ + from .utils import helper + from ..models import transformer + """, + [], + ), + ], + ) + def test_get_imports(self, code: str, expected: List[str]): + assert sorted(get_imports(code)) == sorted(expected) diff --git a/tests/test_gradio_ui.py b/tests/test_gradio_ui.py new file mode 100644 index 000000000..0b337d29b --- /dev/null +++ b/tests/test_gradio_ui.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import shutil +import tempfile +import unittest +from unittest.mock import Mock, patch + +from smolagents.gradio_ui import GradioUI + + +class GradioUITester(unittest.TestCase): + def setUp(self): + """Initialize test environment""" + self.temp_dir = tempfile.mkdtemp() + self.mock_agent = Mock() + self.ui = GradioUI(agent=self.mock_agent, file_upload_folder=self.temp_dir) + self.allowed_types = [".pdf", ".docx", ".txt"] + + def tearDown(self): + """Clean up test environment""" + shutil.rmtree(self.temp_dir) + + def test_upload_file_default_types(self): + """Test default allowed file types""" + default_types = [".pdf", ".docx", ".txt"] + for file_type in default_types: + with tempfile.NamedTemporaryFile(suffix=file_type) as temp_file: + mock_file = Mock() + mock_file.name = temp_file.name + + textbox, uploads_log = self.ui.upload_file(mock_file, []) + + self.assertIn("File uploaded:", textbox.value) + self.assertEqual(len(uploads_log), 1) + self.assertTrue(os.path.exists(os.path.join(self.temp_dir, os.path.basename(temp_file.name)))) + + def test_upload_file_default_types_disallowed(self): + """Test default disallowed file types""" + disallowed_types = [".exe", ".sh", ".py", ".jpg"] + for file_type in disallowed_types: + with tempfile.NamedTemporaryFile(suffix=file_type) as temp_file: + mock_file = Mock() + mock_file.name = temp_file.name + + textbox, uploads_log = self.ui.upload_file(mock_file, []) + + self.assertEqual(textbox.value, "File type disallowed") + self.assertEqual(len(uploads_log), 0) + + def test_upload_file_success(self): + """Test successful file upload scenario""" + with tempfile.NamedTemporaryFile(suffix=".txt") as temp_file: + mock_file = Mock() + mock_file.name = temp_file.name + + textbox, uploads_log = self.ui.upload_file(mock_file, []) + + self.assertIn("File uploaded:", textbox.value) + self.assertEqual(len(uploads_log), 1) + self.assertTrue(os.path.exists(os.path.join(self.temp_dir, os.path.basename(temp_file.name)))) + 
self.assertEqual(uploads_log[0], os.path.join(self.temp_dir, os.path.basename(temp_file.name))) + + def test_upload_file_none(self): + """Test scenario when no file is selected""" + textbox, uploads_log = self.ui.upload_file(None, []) + + self.assertEqual(textbox.value, "No file uploaded") + self.assertEqual(len(uploads_log), 0) + + def test_upload_file_invalid_type(self): + """Test disallowed file type""" + with tempfile.NamedTemporaryFile(suffix=".exe") as temp_file: + mock_file = Mock() + mock_file.name = temp_file.name + + textbox, uploads_log = self.ui.upload_file(mock_file, []) + + self.assertEqual(textbox.value, "File type disallowed") + self.assertEqual(len(uploads_log), 0) + + def test_upload_file_special_chars(self): + """Test scenario with special characters in filename""" + with tempfile.NamedTemporaryFile(suffix=".txt") as temp_file: + # Create a new temporary file with special characters + special_char_name = os.path.join(os.path.dirname(temp_file.name), "test@#$%^&*.txt") + shutil.copy(temp_file.name, special_char_name) + try: + mock_file = Mock() + mock_file.name = special_char_name + + with patch("shutil.copy"): + textbox, uploads_log = self.ui.upload_file(mock_file, []) + + self.assertIn("File uploaded:", textbox.value) + self.assertEqual(len(uploads_log), 1) + self.assertIn("test_____", uploads_log[0]) + finally: + # Clean up the special character file + if os.path.exists(special_char_name): + os.remove(special_char_name) + + def test_upload_file_custom_types(self): + """Test custom allowed file types""" + with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file: + mock_file = Mock() + mock_file.name = temp_file.name + + textbox, uploads_log = self.ui.upload_file(mock_file, [], allowed_file_types=[".csv"]) + + self.assertIn("File uploaded:", textbox.value) + self.assertEqual(len(uploads_log), 1) diff --git a/tests/test_import.py b/tests/test_import.py new file mode 100644 index 000000000..aaa284d39 --- /dev/null +++ b/tests/test_import.py @@ 
-0,0 +1,15 @@ +import subprocess + + +def test_import_smolagents_without_extras(): + # Run the import statement in an isolated virtual environment + result = subprocess.run( + ["uv", "run", "--isolated", "--no-editable", "-"], input="import smolagents", text=True, capture_output=True + ) + # Check if the import was successful + assert result.returncode == 0, ( + "Import failed with error: " + + (result.stderr.splitlines()[-1] if result.stderr else "No error message") + + "\n" + + result.stderr + ) diff --git a/tests/test_python_interpreter.py b/tests/test_local_python_executor.py similarity index 71% rename from tests/test_python_interpreter.py rename to tests/test_local_python_executor.py index 8aec8fe31..29e1ec94c 100644 --- a/tests/test_python_interpreter.py +++ b/tests/test_local_python_executor.py @@ -13,17 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import ast +import types import unittest from textwrap import dedent import numpy as np +import pandas as pd import pytest from smolagents.default_tools import BASE_PYTHON_TOOLS from smolagents.local_python_executor import ( InterpreterError, + PrintContainer, + check_module_authorized, + evaluate_condition, + evaluate_delete, evaluate_python_code, fix_final_answer_code, + get_safe_module, ) @@ -33,19 +41,25 @@ def add_two(x): class PythonInterpreterTester(unittest.TestCase): + def assertDictEqualNoPrint(self, dict1, dict2): + return self.assertDictEqual( + {k: v for k, v in dict1.items() if k != "_print_outputs"}, + {k: v for k, v in dict2.items() if k != "_print_outputs"}, + ) + def test_evaluate_assign(self): code = "x = 3" state = {} result, _ = evaluate_python_code(code, {}, state=state) assert result == 3 - self.assertDictEqual(state, {"x": 3, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "_operations_count": 2}) code = "x = y" state = {"y": 5} result, _ = evaluate_python_code(code, {}, state=state) # evaluate returns 
the value of the last assignment. assert result == 5 - self.assertDictEqual(state, {"x": 5, "y": 5, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 5, "y": 5, "_operations_count": 2}) code = "a=1;b=None" result, _ = evaluate_python_code(code, {}, state={}) @@ -71,7 +85,7 @@ def test_evaluate_call(self): state = {"x": 3} result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state) assert result == 5 - self.assertDictEqual(state, {"x": 3, "y": 5, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "y": 5, "_operations_count": 3}) # Should not work without the tool with pytest.raises(InterpreterError) as e: @@ -83,14 +97,14 @@ def test_evaluate_constant(self): state = {} result, _ = evaluate_python_code(code, {}, state=state) assert result == 3 - self.assertDictEqual(state, {"x": 3, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "_operations_count": 2}) def test_evaluate_dict(self): code = "test_dict = {'x': x, 'y': add_two(x)}" state = {"x": 3} result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state) self.assertDictEqual(result, {"x": 3, "y": 5}) - self.assertDictEqual(state, {"x": 3, "test_dict": {"x": 3, "y": 5}, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "test_dict": {"x": 3, "y": 5}, "_operations_count": 7}) def test_evaluate_expression(self): code = "x = 3\ny = 5" @@ -98,7 +112,7 @@ def test_evaluate_expression(self): result, _ = evaluate_python_code(code, {}, state=state) # evaluate returns the value of the last assignment. assert result == 5 - self.assertDictEqual(state, {"x": 3, "y": 5, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "y": 5, "_operations_count": 4}) def test_evaluate_f_string(self): code = "text = f'This is x: {x}.'" @@ -106,7 +120,23 @@ def test_evaluate_f_string(self): result, _ = evaluate_python_code(code, {}, state=state) # evaluate returns the value of the last assignment. assert result == "This is x: 3." 
- self.assertDictEqual(state, {"x": 3, "text": "This is x: 3.", "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "text": "This is x: 3.", "_operations_count": 6}) + + def test_evaluate_f_string_with_format(self): + code = "text = f'This is x: {x:.2f}.'" + state = {"x": 3.336} + result, _ = evaluate_python_code(code, {}, state=state) + assert result == "This is x: 3.34." + self.assertDictEqualNoPrint(state, {"x": 3.336, "text": "This is x: 3.34.", "_operations_count": 8}) + + def test_evaluate_f_string_with_complex_format(self): + code = "text = f'This is x: {x:>{width}.{precision}f}.'" + state = {"x": 3.336, "width": 10, "precision": 2} + result, _ = evaluate_python_code(code, {}, state=state) + assert result == "This is x: 3.34." + self.assertDictEqualNoPrint( + state, {"x": 3.336, "width": 10, "precision": 2, "text": "This is x: 3.34.", "_operations_count": 14} + ) def test_evaluate_if(self): code = "if x <= 3:\n y = 2\nelse:\n y = 5" @@ -114,40 +144,40 @@ def test_evaluate_if(self): result, _ = evaluate_python_code(code, {}, state=state) # evaluate returns the value of the last assignment. assert result == 2 - self.assertDictEqual(state, {"x": 3, "y": 2, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "y": 2, "_operations_count": 6}) state = {"x": 8} result, _ = evaluate_python_code(code, {}, state=state) # evaluate returns the value of the last assignment. 
assert result == 5 - self.assertDictEqual(state, {"x": 8, "y": 5, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 8, "y": 5, "_operations_count": 6}) def test_evaluate_list(self): code = "test_list = [x, add_two(x)]" state = {"x": 3} result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state) self.assertListEqual(result, [3, 5]) - self.assertDictEqual(state, {"x": 3, "test_list": [3, 5], "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "test_list": [3, 5], "_operations_count": 5}) def test_evaluate_name(self): code = "y = x" state = {"x": 3} result, _ = evaluate_python_code(code, {}, state=state) assert result == 3 - self.assertDictEqual(state, {"x": 3, "y": 3, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "y": 3, "_operations_count": 2}) def test_evaluate_subscript(self): code = "test_list = [x, add_two(x)]\ntest_list[1]" state = {"x": 3} result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state) assert result == 5 - self.assertDictEqual(state, {"x": 3, "test_list": [3, 5], "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "test_list": [3, 5], "_operations_count": 9}) code = "test_dict = {'x': x, 'y': add_two(x)}\ntest_dict['y']" state = {"x": 3} result, _ = evaluate_python_code(code, {"add_two": add_two}, state=state) assert result == 5 - self.assertDictEqual(state, {"x": 3, "test_dict": {"x": 3, "y": 5}, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "test_dict": {"x": 3, "y": 5}, "_operations_count": 11}) code = "vendor = {'revenue': 31000, 'rent': 50312}; vendor['ratio'] = round(vendor['revenue'] / vendor['rent'], 2)" state = {} @@ -171,14 +201,14 @@ def test_evaluate_for(self): state = {} result, _ = evaluate_python_code(code, {"range": range}, state=state) assert result == 2 - self.assertDictEqual(state, {"x": 2, "i": 2, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 2, "i": 2, "_operations_count": 11}) def 
test_evaluate_binop(self): code = "y + x" state = {"x": 3, "y": 6} result, _ = evaluate_python_code(code, {}, state=state) assert result == 9 - self.assertDictEqual(state, {"x": 3, "y": 6, "print_outputs": ""}) + self.assertDictEqualNoPrint(state, {"x": 3, "y": 6, "_operations_count": 4}) def test_recursive_function(self): code = """ @@ -375,7 +405,7 @@ def test_if_conditions(self): print('2')""" state = {} evaluate_python_code(code, BASE_PYTHON_TOOLS, state=state) - assert state["print_outputs"] == "2\n" + assert state["_print_outputs"].value == "2\n" def test_imports(self): code = "import math\nmath.sqrt(4)" @@ -454,9 +484,9 @@ def test_print_output(self): state = {} result, _ = evaluate_python_code(code, BASE_PYTHON_TOOLS, state=state) assert result is None - assert state["print_outputs"] == "Hello world!\nOk no one cares\n" + assert state["_print_outputs"].value == "Hello world!\nOk no one cares\n" - # test print in function + # Test print in function (state copy) code = """ print("1") def function(): @@ -464,7 +494,17 @@ def function(): function()""" state = {} evaluate_python_code(code, {"print": print}, state=state) - assert state["print_outputs"] == "1\n2\n" + assert state["_print_outputs"].value == "1\n2\n" + + # Test print in list comprehension (state copy) + code = """ +print("1") +def function(): + print("2") +[function() for i in range(10)]""" + state = {} + evaluate_python_code(code, {"print": print, "range": range}, state=state) + assert state["_print_outputs"].value == "1\n2\n2\n2\n2\n2\n2\n2\n2\n2\n2\n" def test_tuple_target_in_iterator(self): code = "for a, b in [('Ralf Weikert', 'Austria'), ('Samuel Seungwon Lee', 'South Korea')]:res = a.split()[0]" @@ -586,7 +626,7 @@ def test_print(self): code = "print(min([1, 2, 3]))" state = {} evaluate_python_code(code, {"min": min, "print": print}, state=state) - assert state["print_outputs"] == "1\n" + assert state["_print_outputs"].value == "1\n" def test_types_as_objects(self): code = "type_a = float(2); 
type_b = str; type_c = int" @@ -907,7 +947,7 @@ def test_dangerous_subpackage_access_blocked(self): code = "import random;random._os.system('echo bad command passed')" with pytest.raises(InterpreterError) as e: evaluate_python_code(code) - assert "AttributeError:module 'random' has no attribute '_os'" in str(e) + assert "AttributeError: module 'random' has no attribute '_os'" in str(e) code = "import doctest;doctest.inspect.os.system('echo bad command passed')" with pytest.raises(InterpreterError): @@ -950,6 +990,14 @@ def test_dangerous_builtins_are_callable_if_explicitly_added(self): dangerous_code, static_tools={"tcompile": compile, "teval": eval, "texec": exec} | BASE_PYTHON_TOOLS ) + def test_can_import_os_if_explicitly_authorized(self): + dangerous_code = "import os; os.listdir('./')" + evaluate_python_code(dangerous_code, authorized_imports=["os"]) + + def test_can_import_os_if_all_imports_authorized(self): + dangerous_code = "import os; os.listdir('./')" + evaluate_python_code(dangerous_code, authorized_imports=["*"]) + @pytest.mark.parametrize( "code, expected_result", @@ -1065,3 +1113,284 @@ def __{operator_name}__(self, other): state = {} result, _ = evaluate_python_code(code, {}, state=state) assert result == expected_result + + +@pytest.mark.parametrize( + "code, expected_error_message", + [ + ( + dedent("""\ + x = 5 + del x + x + """), + "The variable `x` is not defined", + ), + ( + dedent("""\ + x = [1, 2, 3] + del x[2] + x[2] + """), + "Index 2 out of bounds for list of length 2", + ), + ( + dedent("""\ + x = {"key": "value"} + del x["key"] + x["key"] + """), + "Could not index {} with 'key'", + ), + ( + dedent("""\ + del x + """), + "Cannot delete name 'x': name is not defined", + ), + ], +) +def test_evaluate_python_code_with_evaluate_delete(code, expected_error_message): + state = {} + with pytest.raises(InterpreterError) as exception_info: + evaluate_python_code(code, {}, state=state) + assert expected_error_message in str(exception_info.value) 
+ + +@pytest.mark.parametrize( + "code, state, expectation", + [ + ("del x", {"x": 1}, {}), + ("del x[1]", {"x": [1, 2, 3]}, {"x": [1, 3]}), + ("del x['key']", {"x": {"key": "value"}}, {"x": {}}), + ("del x", {}, InterpreterError("Cannot delete name 'x': name is not defined")), + ], +) +def test_evaluate_delete(code, state, expectation): + delete_node = ast.parse(code).body[0] + if isinstance(expectation, Exception): + with pytest.raises(type(expectation)) as exception_info: + evaluate_delete(delete_node, state, {}, {}, []) + assert str(expectation) in str(exception_info.value) + else: + evaluate_delete(delete_node, state, {}, {}, []) + _ = state.pop("_operations_count", None) + assert state == expectation + + +@pytest.mark.parametrize( + "condition, state, expected_result", + [ + ("a == b", {"a": 1, "b": 1}, True), + ("a == b", {"a": 1, "b": 2}, False), + ("a != b", {"a": 1, "b": 1}, False), + ("a != b", {"a": 1, "b": 2}, True), + ("a < b", {"a": 1, "b": 1}, False), + ("a < b", {"a": 1, "b": 2}, True), + ("a < b", {"a": 2, "b": 1}, False), + ("a <= b", {"a": 1, "b": 1}, True), + ("a <= b", {"a": 1, "b": 2}, True), + ("a <= b", {"a": 2, "b": 1}, False), + ("a > b", {"a": 1, "b": 1}, False), + ("a > b", {"a": 1, "b": 2}, False), + ("a > b", {"a": 2, "b": 1}, True), + ("a >= b", {"a": 1, "b": 1}, True), + ("a >= b", {"a": 1, "b": 2}, False), + ("a >= b", {"a": 2, "b": 1}, True), + ("a is b", {"a": 1, "b": 1}, True), + ("a is b", {"a": 1, "b": 2}, False), + ("a is not b", {"a": 1, "b": 1}, False), + ("a is not b", {"a": 1, "b": 2}, True), + ("a in b", {"a": 1, "b": [1, 2, 3]}, True), + ("a in b", {"a": 4, "b": [1, 2, 3]}, False), + ("a not in b", {"a": 1, "b": [1, 2, 3]}, False), + ("a not in b", {"a": 4, "b": [1, 2, 3]}, True), + # Chained conditions: + ("a == b == c", {"a": 1, "b": 1, "c": 1}, True), + ("a == b == c", {"a": 1, "b": 2, "c": 1}, False), + ("a == b < c", {"a": 2, "b": 2, "c": 2}, False), + ("a == b < c", {"a": 0, "b": 0, "c": 1}, True), + ], +) +def 
test_evaluate_condition(condition, state, expected_result): + condition_ast = ast.parse(condition, mode="eval").body + result = evaluate_condition(condition_ast, state, {}, {}, []) + assert result == expected_result + + +@pytest.mark.parametrize( + "condition, state, expected_result", + [ + ("a == b", {"a": pd.Series([1, 2, 3]), "b": pd.Series([2, 2, 2])}, pd.Series([False, True, False])), + ("a != b", {"a": pd.Series([1, 2, 3]), "b": pd.Series([2, 2, 2])}, pd.Series([True, False, True])), + ("a < b", {"a": pd.Series([1, 2, 3]), "b": pd.Series([2, 2, 2])}, pd.Series([True, False, False])), + ("a <= b", {"a": pd.Series([1, 2, 3]), "b": pd.Series([2, 2, 2])}, pd.Series([True, True, False])), + ("a > b", {"a": pd.Series([1, 2, 3]), "b": pd.Series([2, 2, 2])}, pd.Series([False, False, True])), + ("a >= b", {"a": pd.Series([1, 2, 3]), "b": pd.Series([2, 2, 2])}, pd.Series([False, True, True])), + ( + "a == b", + {"a": pd.DataFrame({"x": [1, 2], "y": [3, 4]}), "b": pd.DataFrame({"x": [1, 2], "y": [3, 5]})}, + pd.DataFrame({"x": [True, True], "y": [True, False]}), + ), + ( + "a != b", + {"a": pd.DataFrame({"x": [1, 2], "y": [3, 4]}), "b": pd.DataFrame({"x": [1, 2], "y": [3, 5]})}, + pd.DataFrame({"x": [False, False], "y": [False, True]}), + ), + ( + "a < b", + {"a": pd.DataFrame({"x": [1, 2], "y": [3, 4]}), "b": pd.DataFrame({"x": [2, 2], "y": [2, 2]})}, + pd.DataFrame({"x": [True, False], "y": [False, False]}), + ), + ( + "a <= b", + {"a": pd.DataFrame({"x": [1, 2], "y": [3, 4]}), "b": pd.DataFrame({"x": [2, 2], "y": [2, 2]})}, + pd.DataFrame({"x": [True, True], "y": [False, False]}), + ), + ( + "a > b", + {"a": pd.DataFrame({"x": [1, 2], "y": [3, 4]}), "b": pd.DataFrame({"x": [2, 2], "y": [2, 2]})}, + pd.DataFrame({"x": [False, False], "y": [True, True]}), + ), + ( + "a >= b", + {"a": pd.DataFrame({"x": [1, 2], "y": [3, 4]}), "b": pd.DataFrame({"x": [2, 2], "y": [2, 2]})}, + pd.DataFrame({"x": [False, True], "y": [True, True]}), + ), + ], +) +def 
test_evaluate_condition_with_pandas(condition, state, expected_result): + condition_ast = ast.parse(condition, mode="eval").body + result = evaluate_condition(condition_ast, state, {}, {}, []) + if isinstance(result, pd.Series): + pd.testing.assert_series_equal(result, expected_result) + else: + pd.testing.assert_frame_equal(result, expected_result) + + +@pytest.mark.parametrize( + "condition, state, expected_exception", + [ + # Chained conditions: + ( + "a == b == c", + { + "a": pd.Series([1, 2, 3]), + "b": pd.Series([2, 2, 2]), + "c": pd.Series([3, 3, 3]), + }, + ValueError( + "The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()." + ), + ), + ( + "a == b == c", + { + "a": pd.DataFrame({"x": [1, 2], "y": [3, 4]}), + "b": pd.DataFrame({"x": [2, 2], "y": [2, 2]}), + "c": pd.DataFrame({"x": [3, 3], "y": [3, 3]}), + }, + ValueError( + "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()." + ), + ), + ], +) +def test_evaluate_condition_with_pandas_exceptions(condition, state, expected_exception): + condition_ast = ast.parse(condition, mode="eval").body + with pytest.raises(type(expected_exception)) as exception_info: + _ = evaluate_condition(condition_ast, state, {}, {}, []) + assert str(expected_exception) in str(exception_info.value) + + +def test_get_safe_module_handle_lazy_imports(): + class FakeModule(types.ModuleType): + def __init__(self, name): + super().__init__(name) + self.non_lazy_attribute = "ok" + + def __getattr__(self, name): + if name == "lazy_attribute": + raise ImportError("lazy import failure") + return super().__getattr__(name) + + def __dir__(self): + return super().__dir__() + ["lazy_attribute"] + + fake_module = FakeModule("fake_module") + safe_module = get_safe_module(fake_module, authorized_imports=set()) + assert not hasattr(safe_module, "lazy_attribute") + assert getattr(safe_module, "non_lazy_attribute") == "ok" + + +def test_non_standard_comparisons(): + 
code = dedent("""\ + class NonStdEqualsResult: + def __init__(self, left:object, right:object): + self._left = left + self._right = right + def __str__(self) -> str: + return f'{self._left} == {self._right}' + + class NonStdComparisonClass: + def __init__(self, value: str ): + self._value = value + def __str__(self): + return self._value + def __eq__(self, other): + return NonStdEqualsResult(self, other) + a = NonStdComparisonClass("a") + b = NonStdComparisonClass("b") + result = a == b + """) + result, _ = evaluate_python_code(code, state={}) + assert not isinstance(result, bool) + assert str(result) == "a == b" + + +class TestPrintContainer: + def test_initial_value(self): + pc = PrintContainer() + assert pc.value == "" + + def test_append(self): + pc = PrintContainer() + pc.append("Hello") + assert pc.value == "Hello" + + def test_iadd(self): + pc = PrintContainer() + pc += "World" + assert pc.value == "World" + + def test_str(self): + pc = PrintContainer() + pc.append("Hello") + assert str(pc) == "Hello" + + def test_repr(self): + pc = PrintContainer() + pc.append("Hello") + assert repr(pc) == "PrintContainer(Hello)" + + def test_len(self): + pc = PrintContainer() + pc.append("Hello") + assert len(pc) == 5 + + +@pytest.mark.parametrize( + "module,authorized_imports,expected", + [ + ("os", ["*"], True), + ("AnyModule", ["*"], True), + ("os", ["os"], True), + ("AnyModule", ["AnyModule"], True), + ("Module.os", ["Module"], False), + ("Module.os", ["Module", "os"], True), + ("os.path", ["os"], True), + ("os", ["os.path"], False), + ], +) +def test_check_module_authorized(module: str, authorized_imports: list[str], expected: bool): + assert check_module_authorized(module, authorized_imports) == expected diff --git a/tests/test_memory.py b/tests/test_memory.py new file mode 100644 index 000000000..c007a185c --- /dev/null +++ b/tests/test_memory.py @@ -0,0 +1,153 @@ +import pytest + +from smolagents.agents import ToolCall +from smolagents.memory import ( + ActionStep, 
+ AgentMemory, + ChatMessage, + MemoryStep, + Message, + MessageRole, + PlanningStep, + SystemPromptStep, + TaskStep, +) + + +class TestAgentMemory: + def test_initialization(self): + system_prompt = "This is a system prompt." + memory = AgentMemory(system_prompt=system_prompt) + assert memory.system_prompt.system_prompt == system_prompt + assert memory.steps == [] + + +class TestMemoryStep: + def test_initialization(self): + step = MemoryStep() + assert isinstance(step, MemoryStep) + + def test_dict(self): + step = MemoryStep() + assert step.dict() == {} + + def test_to_messages(self): + step = MemoryStep() + with pytest.raises(NotImplementedError): + step.to_messages() + + +def test_action_step_to_messages(): + action_step = ActionStep( + model_input_messages=[Message(role=MessageRole.USER, content="Hello")], + tool_calls=[ + ToolCall(id="id", name="get_weather", arguments={"location": "Paris"}), + ], + start_time=0.0, + end_time=1.0, + step_number=1, + error=None, + duration=1.0, + model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content="Hi"), + model_output="Hi", + observations="This is a nice observation", + observations_images=["image1.png"], + action_output="Output", + ) + messages = action_step.to_messages() + assert len(messages) == 4 + for message in messages: + assert isinstance(message, dict) + assert "role" in message + assert "content" in message + assert isinstance(message["role"], MessageRole) + assert isinstance(message["content"], list) + assistant_message = messages[0] + assert assistant_message["role"] == MessageRole.ASSISTANT + assert len(assistant_message["content"]) == 1 + for content in assistant_message["content"]: + assert isinstance(content, dict) + assert "type" in content + assert "text" in content + message = messages[1] + assert message["role"] == MessageRole.ASSISTANT + + assert len(message["content"]) == 1 + text_content = message["content"][0] + assert isinstance(text_content, dict) + assert "type" in text_content + 
assert "text" in text_content + + observation_message = messages[2] + assert observation_message["role"] == MessageRole.TOOL_RESPONSE + assert "Observation:\nThis is a nice observation" in observation_message["content"][0]["text"] + + image_message = messages[3] + image_content = image_message["content"][1] + assert isinstance(image_content, dict) + assert "type" in image_content + assert "image" in image_content + + +def test_planning_step_to_messages(): + planning_step = PlanningStep( + model_input_messages=[Message(role=MessageRole.USER, content="Hello")], + model_output_message_facts=ChatMessage(role=MessageRole.ASSISTANT, content="Facts"), + facts="These are facts.", + model_output_message_plan=ChatMessage(role=MessageRole.ASSISTANT, content="Plan"), + plan="This is a plan.", + ) + messages = planning_step.to_messages(summary_mode=False) + assert len(messages) == 2 + for message in messages: + assert isinstance(message, dict) + assert "role" in message + assert "content" in message + assert isinstance(message["role"], MessageRole) + assert message["role"] == MessageRole.ASSISTANT + assert isinstance(message["content"], list) + assert len(message["content"]) == 1 + for content in message["content"]: + assert isinstance(content, dict) + assert "type" in content + assert "text" in content + + +def test_task_step_to_messages(): + task_step = TaskStep(task="This is a task.", task_images=["task_image1.png"]) + messages = task_step.to_messages(summary_mode=False) + assert len(messages) == 1 + for message in messages: + assert isinstance(message, dict) + assert "role" in message + assert "content" in message + assert isinstance(message["role"], MessageRole) + assert message["role"] == MessageRole.USER + assert isinstance(message["content"], list) + assert len(message["content"]) == 2 + text_content = message["content"][0] + assert isinstance(text_content, dict) + assert "type" in text_content + assert "text" in text_content + for image_content in 
message["content"][1:]: + assert isinstance(image_content, dict) + assert "type" in image_content + assert "image" in image_content + + +def test_system_prompt_step_to_messages(): + system_prompt_step = SystemPromptStep(system_prompt="This is a system prompt.") + messages = system_prompt_step.to_messages(summary_mode=False) + assert len(messages) == 1 + for message in messages: + assert isinstance(message, dict) + assert "role" in message + assert "content" in message + assert isinstance(message["role"], MessageRole) + assert message["role"] == MessageRole.SYSTEM + assert isinstance(message["content"], list) + assert len(message["content"]) == 1 + for content in message["content"]: + assert isinstance(content, dict) + assert "type" in content + assert "text" in content diff --git a/tests/test_models.py b/tests/test_models.py index cd3c96f24..f663972a7 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -13,14 +13,31 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json +import sys import unittest from pathlib import Path from typing import Optional +from unittest.mock import MagicMock, patch +import pytest from transformers.testing_utils import get_tests_dir -from smolagents import ChatMessage, HfApiModel, TransformersModel, models, tool -from smolagents.models import parse_json_if_needed +from smolagents.models import ( + ChatMessage, + HfApiModel, + LiteLLMModel, + MessageRole, + MLXModel, + OpenAIServerModel, + TransformersModel, + get_clean_message_list, + get_tool_json_schema, + parse_json_if_needed, + parse_tool_args_if_needed, +) +from smolagents.tools import tool + +from .utils.markers import require_run_all class ModelTests(unittest.TestCase): @@ -37,27 +54,39 @@ def get_weather(location: str, celsius: Optional[bool] = False) -> str: """ return "The weather is UNGODLY with torrential rains and temperatures below -10°C" - assert ( - "nullable" in models.get_tool_json_schema(get_weather)["function"]["parameters"]["properties"]["celsius"] - ) + assert "nullable" in get_tool_json_schema(get_weather)["function"]["parameters"]["properties"]["celsius"] def test_chatmessage_has_model_dumps_json(self): message = ChatMessage("user", [{"type": "text", "text": "Hello!"}]) data = json.loads(message.model_dump_json()) assert data["content"] == [{"type": "text", "text": "Hello!"}] - def test_get_hfapi_message_no_tool(self): - model = HfApiModel(max_tokens=10) + @unittest.skipUnless(sys.platform.startswith("darwin"), "requires macOS") + def test_get_mlx_message_no_tool(self): + model = MLXModel(model_id="HuggingFaceTB/SmolLM2-135M-Instruct", max_tokens=10) messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}] - model(messages, stop_sequences=["great"]) + output = model(messages, stop_sequences=["great"]).content + assert output.startswith("Hello") + + @unittest.skipUnless(sys.platform.startswith("darwin"), "requires macOS") + def test_get_mlx_message_tricky_stop_sequence(self): + # In this test 
HuggingFaceTB/SmolLM2-135M-Instruct generates the token ">'" + # which is required to test capturing stop_sequences that have extra chars at the end. + model = MLXModel(model_id="HuggingFaceTB/SmolLM2-135M-Instruct", max_tokens=100) + stop_sequence = " print '>" + messages = [{"role": "user", "content": [{"type": "text", "text": f"Please{stop_sequence}'"}]}] + # check our assumption that that ">" is followed by "'" + assert model.tokenizer.vocab[">'"] + assert model(messages, stop_sequences=[]).content == f"I'm ready to help you{stop_sequence}'" + # check stop_sequence capture when output has trailing chars + assert model(messages, stop_sequences=[stop_sequence]).content == "I'm ready to help you" def test_transformers_message_no_tool(self): model = TransformersModel( model_id="HuggingFaceTB/SmolLM2-135M-Instruct", max_new_tokens=5, - device_map="auto", + device_map="cpu", do_sample=False, - flatten_messages_as_text=True, ) messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}] output = model(messages, stop_sequences=["great"]).content @@ -70,14 +99,18 @@ def test_transformers_message_vl_no_tool(self): model = TransformersModel( model_id="llava-hf/llava-interleave-qwen-0.5b-hf", max_new_tokens=5, - device_map="auto", + device_map="cpu", do_sample=False, - flatten_messages_as_text=False, ) messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}, {"type": "image", "image": img}]}] output = model(messages, stop_sequences=["great"]).content assert output == "Hello! 
How can" + def test_parse_tool_args_if_needed(self): + original_message = ChatMessage(role="user", content=[{"type": "text", "text": "Hello!"}]) + parsed_message = parse_tool_args_if_needed(original_message) + assert parsed_message == original_message + def test_parse_json_if_needed(self): args = "abc" parsed_args = parse_json_if_needed(args) @@ -94,3 +127,154 @@ def test_parse_json_if_needed(self): args = 3 parsed_args = parse_json_if_needed(args) assert parsed_args == 3 + + +class TestHfApiModel: + def test_call_with_custom_role_conversions(self): + custom_role_conversions = {MessageRole.USER: MessageRole.SYSTEM} + model = HfApiModel(model_id="test-model", custom_role_conversions=custom_role_conversions) + model.client = MagicMock() + messages = [{"role": "user", "content": "Test message"}] + _ = model(messages) + # Verify that the role conversion was applied + assert model.client.chat_completion.call_args.kwargs["messages"][0]["role"] == "system", ( + "role conversion should be applied" + ) + + @require_run_all + def test_get_hfapi_message_no_tool(self): + model = HfApiModel(model="Qwen/Qwen2.5-Coder-32B-Instruct", max_tokens=10) + messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}] + model(messages, stop_sequences=["great"]) + + @require_run_all + def test_get_hfapi_message_no_tool_external_provider(self): + model = HfApiModel(model="Qwen/Qwen2.5-Coder-32B-Instruct", provider="together", max_tokens=10) + messages = [{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}] + model(messages, stop_sequences=["great"]) + + +class TestLiteLLMModel: + @pytest.mark.parametrize( + "model_id, error_flag", + [ + ("groq/llama-3.3-70b", "Missing API Key"), + ("cerebras/llama-3.3-70b", "The api_key client option must be set"), + ("mistral/mistral-tiny", "The api_key client option must be set"), + ], + ) + def test_call_different_providers_without_key(self, model_id, error_flag): + model = LiteLLMModel(model_id=model_id) + messages = 
[{"role": "user", "content": [{"type": "text", "text": "Test message"}]}] + with pytest.raises(Exception) as e: + # This should raise 401 error because of missing API key, not fail for any "bad format" reason + model(messages) + assert error_flag in str(e) + + def test_passing_flatten_messages(self): + model = LiteLLMModel(model_id="groq/llama-3.3-70b", flatten_messages_as_text=False) + assert not model.flatten_messages_as_text + + model = LiteLLMModel(model_id="fal/llama-3.3-70b", flatten_messages_as_text=True) + assert model.flatten_messages_as_text + + +class TestOpenAIServerModel: + def test_client_kwargs_passed_correctly(self): + model_id = "gpt-3.5-turbo" + api_base = "https://api.openai.com/v1" + api_key = "test_api_key" + organization = "test_org" + project = "test_project" + client_kwargs = {"max_retries": 5} + + with patch("openai.OpenAI") as MockOpenAI: + _ = OpenAIServerModel( + model_id=model_id, + api_base=api_base, + api_key=api_key, + organization=organization, + project=project, + client_kwargs=client_kwargs, + ) + MockOpenAI.assert_called_once_with( + base_url=api_base, api_key=api_key, organization=organization, project=project, max_retries=5 + ) + + +def test_get_clean_message_list_basic(): + messages = [ + {"role": "user", "content": [{"type": "text", "text": "Hello!"}]}, + {"role": "assistant", "content": [{"type": "text", "text": "Hi there!"}]}, + ] + result = get_clean_message_list(messages) + assert len(result) == 2 + assert result[0]["role"] == "user" + assert result[0]["content"][0]["text"] == "Hello!" + assert result[1]["role"] == "assistant" + assert result[1]["content"][0]["text"] == "Hi there!" 
+ + +def test_get_clean_message_list_role_conversions(): + messages = [ + {"role": "tool-call", "content": [{"type": "text", "text": "Calling tool..."}]}, + {"role": "tool-response", "content": [{"type": "text", "text": "Tool response"}]}, + ] + result = get_clean_message_list(messages, role_conversions={"tool-call": "assistant", "tool-response": "user"}) + assert len(result) == 2 + assert result[0]["role"] == "assistant" + assert result[0]["content"][0]["text"] == "Calling tool..." + assert result[1]["role"] == "user" + assert result[1]["content"][0]["text"] == "Tool response" + + +@pytest.mark.parametrize( + "convert_images_to_image_urls, expected_clean_message", + [ + ( + False, + { + "role": "user", + "content": [ + {"type": "image", "image": "encoded_image"}, + {"type": "image", "image": "second_encoded_image"}, + ], + }, + ), + ( + True, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "data:image/png;base64,encoded_image"}}, + {"type": "image_url", "image_url": {"url": "data:image/png;base64,second_encoded_image"}}, + ], + }, + ), + ], +) +def test_get_clean_message_list_image_encoding(convert_images_to_image_urls, expected_clean_message): + messages = [ + { + "role": "user", + "content": [{"type": "image", "image": b"image_data"}, {"type": "image", "image": b"second_image_data"}], + } + ] + with patch("smolagents.models.encode_image_base64") as mock_encode: + mock_encode.side_effect = ["encoded_image", "second_encoded_image"] + result = get_clean_message_list(messages, convert_images_to_image_urls=convert_images_to_image_urls) + mock_encode.assert_any_call(b"image_data") + mock_encode.assert_any_call(b"second_image_data") + assert len(result) == 1 + assert result[0] == expected_clean_message + + +def test_get_clean_message_list_flatten_messages_as_text(): + messages = [ + {"role": "user", "content": [{"type": "text", "text": "Hello!"}]}, + {"role": "user", "content": [{"type": "text", "text": "How are you?"}]}, + ] + result 
= get_clean_message_list(messages, flatten_messages_as_text=True) + assert len(result) == 1 + assert result[0]["role"] == "user" + assert result[0]["content"] == "Hello!How are you?" diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py index 9fa30bb63..7483214b1 100644 --- a/tests/test_monitoring.py +++ b/tests/test_monitoring.py @@ -27,7 +27,7 @@ ChatMessageToolCall, ChatMessageToolCallDefinition, ) -from smolagents.utils import AgentLogger, LogLevel +from smolagents.monitoring import AgentLogger, LogLevel class FakeLLMModel: @@ -71,7 +71,7 @@ def test_code_agent_metrics(self): self.assertEqual(agent.monitor.total_input_token_count, 10) self.assertEqual(agent.monitor.total_output_token_count, 20) - def test_json_agent_metrics(self): + def test_toolcalling_agent_metrics(self): agent = ToolCallingAgent( tools=[], model=FakeLLMModel(), @@ -134,7 +134,7 @@ def test_streaming_agent_text_output(self): # Use stream_to_gradio to capture the output outputs = list(stream_to_gradio(agent, task="Test task")) - self.assertEqual(len(outputs), 4) + self.assertEqual(len(outputs), 7) final_message = outputs[-1] self.assertEqual(final_message.role, "assistant") self.assertIn("This is the final answer.", final_message.content) @@ -155,7 +155,7 @@ def test_streaming_agent_image_output(self): ) ) - self.assertEqual(len(outputs), 3) + self.assertEqual(len(outputs), 5) final_message = outputs[-1] self.assertEqual(final_message.role, "assistant") self.assertIsInstance(final_message.content, dict) @@ -177,7 +177,7 @@ def dummy_model(prompt, **kwargs): # Use stream_to_gradio to capture the output outputs = list(stream_to_gradio(agent, task="Test task")) - self.assertEqual(len(outputs), 5) + self.assertEqual(len(outputs), 9) final_message = outputs[-1] self.assertEqual(final_message.role, "assistant") self.assertIn("Simulated agent error", final_message.content) diff --git a/tests/test_search.py b/tests/test_search.py index 7fc6c26df..c146c6a67 100644 --- a/tests/test_search.py 
+++ b/tests/test_search.py @@ -18,6 +18,7 @@ from smolagents import DuckDuckGoSearchTool from .test_tools import ToolTesterMixin +from .utils.markers import require_run_all class DuckDuckGoSearchToolTester(unittest.TestCase, ToolTesterMixin): @@ -25,6 +26,7 @@ def setUp(self): self.tool = DuckDuckGoSearchTool() self.tool.setup() + @require_run_all def test_exact_match_arg(self): result = self.tool("Agents") assert isinstance(result, str) diff --git a/tests/test_tool_validation.py b/tests/test_tool_validation.py new file mode 100644 index 000000000..f3a94ded2 --- /dev/null +++ b/tests/test_tool_validation.py @@ -0,0 +1,102 @@ +import pytest + +from smolagents.default_tools import DuckDuckGoSearchTool, GoogleSearchTool, SpeechToTextTool, VisitWebpageTool +from smolagents.tool_validation import validate_tool_attributes +from smolagents.tools import Tool + + +UNDEFINED_VARIABLE = "undefined_variable" + + +@pytest.mark.parametrize("tool_class", [DuckDuckGoSearchTool, GoogleSearchTool, SpeechToTextTool, VisitWebpageTool]) +def test_validate_tool_attributes_with_default_tools(tool_class): + assert validate_tool_attributes(tool_class) is None, f"failed for {tool_class.name} tool" + + +class ValidTool(Tool): + name = "valid_tool" + description = "A valid tool" + inputs = {"input": {"type": "string", "description": "input"}} + output_type = "string" + simple_attr = "string" + dict_attr = {"key": "value"} + + def __init__(self, optional_param="default"): + super().__init__() + self.param = optional_param + + def forward(self, input: str) -> str: + return input.upper() + + +def test_validate_tool_attributes_valid(): + assert validate_tool_attributes(ValidTool) is None + + +class InvalidToolComplexAttrs(Tool): + name = "invalid_tool" + description = "Tool with complex class attributes" + inputs = {"input": {"type": "string", "description": "input"}} + output_type = "string" + complex_attr = [x for x in range(3)] # Complex class attribute + + def __init__(self): + 
super().__init__() + + def forward(self, input: str) -> str: + return input + + +class InvalidToolRequiredParams(Tool): + name = "invalid_tool" + description = "Tool with required params" + inputs = {"input": {"type": "string", "description": "input"}} + output_type = "string" + + def __init__(self, required_param, kwarg1=1): # No default value + super().__init__() + self.param = required_param + + def forward(self, input: str) -> str: + return input + + +class InvalidToolNonLiteralDefaultParam(Tool): + name = "invalid_tool" + description = "Tool with non-literal default parameter value" + inputs = {"input": {"type": "string", "description": "input"}} + output_type = "string" + + def __init__(self, default_param=UNDEFINED_VARIABLE): # UNDEFINED_VARIABLE as default is non-literal + super().__init__() + self.default_param = default_param + + def forward(self, input: str) -> str: + return input + + +class InvalidToolUndefinedNames(Tool): + name = "invalid_tool" + description = "Tool with undefined names" + inputs = {"input": {"type": "string", "description": "input"}} + output_type = "string" + + def forward(self, input: str) -> str: + return UNDEFINED_VARIABLE # Undefined name + + +@pytest.mark.parametrize( + "tool_class, expected_error", + [ + (InvalidToolComplexAttrs, "Complex attributes should be defined in __init__, not as class attributes"), + (InvalidToolRequiredParams, "Parameters in __init__ must have default values, found required parameters"), + ( + InvalidToolNonLiteralDefaultParam, + "Parameters in __init__ must have literal default values, found non-literal defaults", + ), + (InvalidToolUndefinedNames, "Name 'UNDEFINED_VARIABLE' is undefined"), + ], +) +def test_validate_tool_attributes_exceptions(tool_class, expected_error): + with pytest.raises(ValueError, match=expected_error): + validate_tool_attributes(tool_class) diff --git a/tests/test_tools.py b/tests/test_tools.py index e8d5a50ab..4ac48e07d 100644 --- a/tests/test_tools.py +++ 
b/tests/test_tools.py @@ -12,6 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os +import tempfile import unittest from pathlib import Path from textwrap import dedent @@ -25,8 +27,8 @@ from transformers import is_torch_available, is_vision_available from transformers.testing_utils import get_tests_dir +from smolagents.agent_types import _AGENT_TYPE_MAPPING, AgentAudio, AgentImage, AgentText from smolagents.tools import AUTHORIZED_TYPES, Tool, ToolCollection, tool -from smolagents.types import _AGENT_TYPE_MAPPING, AgentAudio, AgentImage, AgentText if is_torch_available(): @@ -214,8 +216,9 @@ def forward(self): return str(datetime.now()) - def test_saving_tool_allows_no_arg_in_init(self): - # Test one cannot save tool with additional args in init + def test_tool_to_dict_allows_no_arg_in_init(self): + """Test that a tool cannot be saved with required args in init""" + class FailTool(Tool): name = "specific" description = "test description" @@ -224,15 +227,31 @@ class FailTool(Tool): def __init__(self, url): super().__init__(self) - self.url = "none" + self.url = url def forward(self, string_input: str) -> str: return self.url + string_input fail_tool = FailTool("dummy_url") with pytest.raises(Exception) as e: - fail_tool.save("output") - assert "__init__" in str(e) + fail_tool.to_dict() + assert "Parameters in __init__ must have default values, found required parameters" in str(e) + + class PassTool(Tool): + name = "specific" + description = "test description" + inputs = {"string_input": {"type": "string", "description": "input description"}} + output_type = "string" + + def __init__(self, url: Optional[str] = "none"): + super().__init__(self) + self.url = url + + def forward(self, string_input: str) -> str: + return self.url + string_input + + pass_tool = PassTool() + pass_tool.to_dict() def 
test_saving_tool_allows_no_imports_from_outside_methods(self): # Test that using imports from outside functions fails @@ -399,7 +418,10 @@ def get_weather(location: Any) -> None: """ return + with tempfile.TemporaryDirectory() as tmp_dir: + get_weather.save(tmp_dir) assert get_weather.inputs["location"]["type"] == "any" + assert get_weather.output_type == "null" def test_tool_supports_array(self): @tool @@ -416,6 +438,46 @@ def get_weather(locations: List[str], months: Optional[Tuple[str, str]] = None) assert get_weather.inputs["locations"]["type"] == "array" assert get_weather.inputs["months"]["type"] == "array" + def test_saving_tool_produces_valid_python_code_with_multiline_description(self): + @tool + def get_weather(location: Any) -> None: + """ + Get weather in the next days at given location. + And works pretty well. + + Args: + location: The location to get the weather for. + """ + return + + with tempfile.TemporaryDirectory() as tmp_dir: + get_weather.save(tmp_dir) + with open(os.path.join(tmp_dir, "tool.py"), "r", encoding="utf-8") as f: + source_code = f.read() + compile(source_code, f.name, "exec") + + def test_saving_tool_produces_valid_python_code_with_complex_name(self): + # Test that a tool with a tricky name/description still saves to valid Python code + class FailTool(Tool): + name = 'spe"\rcific' + description = """test \n\r + description""" + inputs = {"string_input": {"type": "string", "description": "input description"}} + output_type = "string" + + def __init__(self): + super().__init__(self) + + def forward(self, string_input): + return "foo" + + fail_tool = FailTool() + with tempfile.TemporaryDirectory() as tmp_dir: + fail_tool.save(tmp_dir) + with open(os.path.join(tmp_dir, "tool.py"), "r", encoding="utf-8") as f: + source_code = f.read() + compile(source_code, f.name, "exec") + @pytest.fixture def mock_server_parameters(): diff --git a/tests/test_types.py b/tests/test_types.py index 9350da17a..73465d0ed 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ 
-25,7 +25,7 @@ require_vision, ) -from smolagents.types import AgentAudio, AgentImage, AgentText +from smolagents.agent_types import AgentAudio, AgentImage, AgentText def get_new_path(suffix="") -> str: diff --git a/tests/test_utils.py b/tests/test_utils.py index 31a8a68e0..16ba39141 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -109,17 +109,17 @@ def test_func(): ... def test_get_source_ipython_errors_empty_cells(ipython_shell): test_code = textwrap.dedent("""class TestClass:\n ...""").strip() ipython_shell.user_ns["In"] = [""] - exec(test_code) + ipython_shell.run_cell(test_code, store_history=True) with pytest.raises(ValueError, match="No code cells found in IPython session"): - get_source(locals()["TestClass"]) + get_source(ipython_shell.user_ns["TestClass"]) def test_get_source_ipython_errors_definition_not_found(ipython_shell): test_code = textwrap.dedent("""class TestClass:\n ...""").strip() ipython_shell.user_ns["In"] = ["", "print('No class definition here')"] - exec(test_code) + ipython_shell.run_cell(test_code, store_history=True) with pytest.raises(ValueError, match="Could not find source code for TestClass in IPython history"): - get_source(locals()["TestClass"]) + get_source(ipython_shell.user_ns["TestClass"]) def test_get_source_ipython_errors_type_error(): @@ -146,11 +146,12 @@ def forward(self, task: str): test_tool = TestTool() with tempfile.TemporaryDirectory() as tmp_dir: - test_tool.save(tmp_dir) + test_tool.save(tmp_dir, make_gradio_app=True) assert set(os.listdir(tmp_dir)) == {"requirements.txt", "app.py", "tool.py"} assert ( pathlib.Path(tmp_dir, "tool.py").read_text() - == """from smolagents.tools import Tool + == """from typing import Any, Optional +from smolagents.tools import Tool import IPython class TestTool(Tool): @@ -173,7 +174,6 @@ def __init__(self, *args, **kwargs): assert ( pathlib.Path(tmp_dir, "app.py").read_text() == """from smolagents import launch_gradio_demo -from typing import Optional from tool import 
TestTool tool = TestTool() @@ -201,13 +201,14 @@ def forward(self, task: str): import IPython # noqa: F401 return task - TestTool().save("{tmp_dir}") + TestTool().save("{tmp_dir}", make_gradio_app=True) """) assert shell.run_cell(code_blob, store_history=True).success assert set(os.listdir(tmp_dir)) == {"requirements.txt", "app.py", "tool.py"} assert ( pathlib.Path(tmp_dir, "tool.py").read_text() - == """from smolagents.tools import Tool + == """from typing import Any, Optional +from smolagents.tools import Tool import IPython class TestTool(Tool): @@ -230,7 +231,6 @@ def __init__(self, *args, **kwargs): assert ( pathlib.Path(tmp_dir, "app.py").read_text() == """from smolagents import launch_gradio_demo -from typing import Optional from tool import TestTool tool = TestTool() @@ -254,12 +254,12 @@ def test_tool(task: str) -> str: return task with tempfile.TemporaryDirectory() as tmp_dir: - test_tool.save(tmp_dir) + test_tool.save(tmp_dir, make_gradio_app=True) assert set(os.listdir(tmp_dir)) == {"requirements.txt", "app.py", "tool.py"} assert ( pathlib.Path(tmp_dir, "tool.py").read_text() == """from smolagents import Tool -from typing import Optional +from typing import Any, Optional class SimpleTool(Tool): name = "test_tool" @@ -283,7 +283,6 @@ def forward(self, task: str) -> str: assert ( pathlib.Path(tmp_dir, "app.py").read_text() == """from smolagents import launch_gradio_demo -from typing import Optional from tool import SimpleTool tool = SimpleTool() @@ -311,14 +310,14 @@ def test_tool(task: str) -> str: return task - test_tool.save("{tmp_dir}") + test_tool.save("{tmp_dir}", make_gradio_app=True) """) assert shell.run_cell(code_blob, store_history=True).success assert set(os.listdir(tmp_dir)) == {"requirements.txt", "app.py", "tool.py"} assert ( pathlib.Path(tmp_dir, "tool.py").read_text() == """from smolagents import Tool -from typing import Optional +from typing import Any, Optional class SimpleTool(Tool): name = "test_tool" @@ -342,7 +341,6 @@ def 
forward(self, task: str) -> str: assert ( pathlib.Path(tmp_dir, "app.py").read_text() == """from smolagents import launch_gradio_demo -from typing import Optional from tool import SimpleTool tool = SimpleTool() diff --git a/tests/utils/markers.py b/tests/utils/markers.py new file mode 100644 index 000000000..8901f5f25 --- /dev/null +++ b/tests/utils/markers.py @@ -0,0 +1,22 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Markers for tests .""" + +import os + +import pytest + + +require_run_all = pytest.mark.skipif(not os.getenv("RUN_ALL"), reason="requires RUN_ALL environment variable") diff --git a/utils/check_tests_in_ci.py b/utils/check_tests_in_ci.py index 65ebca729..b320e23e7 100644 --- a/utils/check_tests_in_ci.py +++ b/utils/check_tests_in_ci.py @@ -30,7 +30,7 @@ def check_tests_in_ci(): tests, hence this check. NOTE: current implementation is quite naive but should work for now. Must be updated if one want to ignore some - tests or if file naming is updated (currently only files starting by `test_*` are cheked) + tests or if file naming is updated (currently only files starting by `test_*` are checked) """ test_files = [ path.relative_to(TESTS_FOLDER).as_posix()