diff --git a/README.md b/README.md index 9d90637fb..37a5e8a3f 100644 --- a/README.md +++ b/README.md @@ -51,16 +51,16 @@ Full documentation can be found [here](https://huggingface.co/docs/smolagents/in ## Quick demo -First install the package. +First install the package with a default set of tools: ```bash -pip install smolagents +pip install smolagents[toolkit] ``` Then define your agent, give it the tools it needs and run it! ```py -from smolagents import CodeAgent, DuckDuckGoSearchTool, InferenceClientModel +from smolagents import CodeAgent, WebSearchTool, InferenceClientModel model = InferenceClientModel() -agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model) +agent = CodeAgent(tools=[WebSearchTool()], model=model) agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") ``` diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index c5c2a9a93..9f7d6ce6e 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -2,6 +2,8 @@ sections: - local: index title: 🤗 Agents + - local: installation + title: Installation - local: guided_tour title: Guided tour - title: Tutorials @@ -32,6 +34,8 @@ title: Orchestrate a multi-agent system - local: examples/web_browser title: Build a web browser agent using vision models + - local: examples/using_different_models + title: Using different models - title: Reference sections: - local: reference/agents diff --git a/docs/source/en/examples/multiagents.mdx b/docs/source/en/examples/multiagents.mdx index 4e43f99f5..231f8f0a8 100644 --- a/docs/source/en/examples/multiagents.mdx +++ b/docs/source/en/examples/multiagents.mdx @@ -25,7 +25,7 @@ Let's set up this system. Run the line below to install the required dependencies: ```py -! pip install markdownify duckduckgo-search smolagents --upgrade -q +!pip install smolagents[toolkit] --upgrade -q ``` Let's login to HF in order to call Inference Providers: @@ -46,9 +46,9 @@ model_id = "Qwen/Qwen2.5-Coder-32B-Instruct" ## 🔍 Create a web search tool -For web browsing, we can already use our pre-existing [`DuckDuckGoSearchTool`](https://github.com/huggingface/smolagents/blob/main/src/smolagents/default_tools.py#L151-L176) tool to provide a Google search equivalent. +For web browsing, we can already use our native [`WebSearchTool`] tool to provide a Google search equivalent. -But then we will also need to be able to peak into the page found by the `DuckDuckGoSearchTool`. +But then we will also need to be able to peak into the page found by the `WebSearchTool`. To do so, we could import the library's built-in `VisitWebpageTool`, but we will build it again to see how it's done. So let's create our `VisitWebpageTool` tool from scratch using `markdownify`. 
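A minimal sketch of such a `markdownify`-based page-reading tool, written with the library's `@tool` decorator, could look like the following (the guide's own implementation may differ in its details):

```py
import re

import requests
from markdownify import markdownify
from requests.exceptions import RequestException

from smolagents import tool


@tool
def visit_webpage(url: str) -> str:
    """Visits a webpage at the given url and returns its content as a markdown string.

    Args:
        url: The url of the webpage to visit.
    """
    try:
        # Fetch the page, then convert the HTML body to markdown
        response = requests.get(url, timeout=20)
        response.raise_for_status()
        markdown_content = markdownify(response.text).strip()
        # Collapse runs of blank lines left over from the HTML layout
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
        return markdown_content
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"
```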
@@ -109,14 +109,14 @@ from smolagents import ( CodeAgent, ToolCallingAgent, InferenceClientModel, - DuckDuckGoSearchTool, + WebSearchTool, LiteLLMModel, ) model = InferenceClientModel(model_id=model_id) web_agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), visit_webpage], + tools=[WebSearchTool(), visit_webpage], model=model, max_steps=10, name="web_search_agent", diff --git a/docs/source/en/examples/using_different_models.mdx b/docs/source/en/examples/using_different_models.mdx new file mode 100644 index 000000000..1c153d349 --- /dev/null +++ b/docs/source/en/examples/using_different_models.mdx @@ -0,0 +1,48 @@ +# Using different models + +[[open-in-colab]] + +`smolagents` provides a flexible framework that allows you to use various language models from different providers. +This guide will show you how to use different model types with your agents. + +## Available model types + +`smolagents` supports several model types out of the box: +1. [`InferenceClientModel`]: Uses Hugging Face's Inference API to access models +2. [`TransformersModel`]: Runs models locally using the Transformers library +3. [`VLLMModel`]: Uses vLLM for fast inference with optimized serving +4. [`MLXModel`]: Optimized for Apple Silicon devices using MLX +5. [`LiteLLMModel`]: Provides access to hundreds of LLMs through LiteLLM +6. [`LiteLLMRouterModel`]: Distributes requests among multiple models +7. [`OpenAIServerModel`]: Connects to OpenAI's API +8. [`AzureOpenAIServerModel`]: Uses Azure's OpenAI service +9. [`AmazonBedrockServerModel`]: Connects to AWS Bedrock's API + +## Using Google Gemini Models + +As explained in the Google Gemini API documentation (https://ai.google.dev/gemini-api/docs/openai), +Google provides an OpenAI-compatible API for Gemini models, allowing you to use the [`OpenAIServerModel`] +with Gemini models by setting the appropriate base URL. + +First, install the required dependencies: +```bash +pip install smolagents[openai] +``` + +Then, [get a Gemini API key](https://ai.google.dev/gemini-api/docs/api-key) and set it in your code: +```python +GEMINI_API_KEY = +``` + +Now, you can initialize the Gemini model using the `OpenAIServerModel` class +and setting the `api_base` parameter to the Gemini API base URL: +```python +from smolagents import OpenAIServerModel + +model = OpenAIServerModel( + model_id="gemini-2.0-flash", + api_key=GEMINI_API_KEY, + # Google Gemini OpenAI-compatible API base URL + api_base="https://generativelanguage.googleapis.com/v1beta/openai/", +) +``` diff --git a/docs/source/en/guided_tour.mdx b/docs/source/en/guided_tour.mdx index 01e247357..859ced0ac 100644 --- a/docs/source/en/guided_tour.mdx +++ b/docs/source/en/guided_tour.mdx @@ -281,7 +281,7 @@ When the agent is initialized, the tool attributes are used to generate a tool d ### Default toolbox -`smolagents` comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools=True`: +If you install `smolagents` with the "toolkit" extra, it comes with a default toolbox for empowering agents, that you can add to your agent upon initialization with argument `add_base_tools=True`: - **DuckDuckGo web search***: performs a web search using DuckDuckGo browser. - **Python code interpreter**: runs your LLM generated Python code in a secure environment. 
This tool will only be added to [`ToolCallingAgent`] if you initialize it with `add_base_tools=True`, since code-based agent can already natively execute Python code @@ -290,9 +290,10 @@ When the agent is initialized, the tool attributes are used to generate a tool d You can manually use a tool by calling it with its arguments. ```python -from smolagents import DuckDuckGoSearchTool +# !pip install smolagents[toolkit] +from smolagents import WebSearchTool -search_tool = DuckDuckGoSearchTool() +search_tool = WebSearchTool() print(search_tool("Who's the current president of Russia?")) ``` @@ -339,7 +340,7 @@ def model_download_tool(task: str) -> str: The function needs: - A clear name. The name should be descriptive enough of what this tool does to help the LLM brain powering the agent. Since this tool returns the model with the most downloads for a task, let's name it `model_download_tool`. - Type hints on both inputs and output -- A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint). Same as for the tool name, this description is an instruction manual for the LLM powering you agent, so do not neglect it. +- A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint). Same as for the tool name, this description is an instruction manual for the LLM powering your agent, so do not neglect it. All these elements will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible! @@ -364,7 +365,7 @@ class ModelDownloadTool(Tool): The subclass needs the following attributes: - A clear `name`. The name should be descriptive enough of what this tool does to help the LLM brain powering the agent. Since this tool returns the model with the most downloads for a task, let's name it `model_download_tool`. -- A `description`. Same as for the `name`, this description is an instruction manual for the LLM powering you agent, so do not neglect it. +- A `description`. Same as for the `name`, this description is an instruction manual for the LLM powering your agent, so do not neglect it. - Input types and descriptions - Output type All these attributes will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible! @@ -423,15 +424,15 @@ You can easily build hierarchical multi-agent systems with `smolagents`. To do so, just ensure your agent has `name` and`description` attributes, which will then be embedded in the manager agent's system prompt to let it know how to call this managed agent, as we also do for tools. Then you can pass this managed agent in the parameter managed_agents upon initialization of the manager agent. -Here's an example of making an agent that managed a specific web search agent using our [`DuckDuckGoSearchTool`]: +Here's an example of making an agent that managed a specific web search agent using our native [`WebSearchTool`]: ```py -from smolagents import CodeAgent, InferenceClientModel, DuckDuckGoSearchTool +from smolagents import CodeAgent, InferenceClientModel, WebSearchTool model = InferenceClientModel() web_agent = CodeAgent( - tools=[DuckDuckGoSearchTool()], + tools=[WebSearchTool()], model=model, name="web_search", description="Runs web searches for you. Give it your query as an argument." 
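As the paragraph above notes, the managed agent is then handed to the manager via the `managed_agents` parameter; a short sketch of that final step (the example task is illustrative):

```py
manager_agent = CodeAgent(
    tools=[],
    model=model,
    managed_agents=[web_agent],
)
manager_agent.run("Who is the CEO of Hugging Face?")
```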
diff --git a/docs/source/en/installation.mdx b/docs/source/en/installation.mdx new file mode 100644 index 000000000..53069e61e --- /dev/null +++ b/docs/source/en/installation.mdx @@ -0,0 +1,114 @@ +# Installation Guide + +The `smolagents` library can be installed using pip. Here are the different installation methods and options available. + +## Prerequisites +- Python 3.10 or newer +- pip + +## Basic Installation + +Install `smolagents` core library with: +```bash +pip install smolagents +``` + +## Installation with Extras + +`smolagents` provides several optional dependencies (extras) that can be installed based on your needs. +You can install these extras using the following syntax: +```bash +pip install "smolagents[extra1,extra2]" +``` + +### Tools +These extras include various tools and integrations: +- **toolkit**: Install a default set of tools for common tasks. + ```bash + pip install "smolagents[toolkit]" + ``` +- **mcp**: Add support for the Model Context Protocol (MCP) to integrate with external tools and services. + ```bash + pip install "smolagents[mcp]" + ``` + +### Model Integration +These extras enable integration with various AI models and frameworks: +- **openai**: Add support for OpenAI API models. + ```bash + pip install "smolagents[openai]" + ``` +- **transformers**: Enable Hugging Face Transformers models. + ```bash + pip install "smolagents[transformers]" + ``` +- **vllm**: Add VLLM support for efficient model inference. + ```bash + pip install "smolagents[vllm]" + ``` +- **mlx-lm**: Enable support for MLX-LM models. + ```bash + pip install "smolagents[mlx-lm]" + ``` +- **litellm**: Add LiteLLM support for lightweight model inference. + ```bash + pip install "smolagents[litellm]" + ``` +- **bedrock**: Enable support for AWS Bedrock models. + ```bash + pip install "smolagents[bedrock]" + ``` + +### Multimodal Capabilities +Extras for handling different types of media and input: +- **vision**: Add support for image processing and computer vision tasks. + ```bash + pip install "smolagents[vision]" + ``` +- **audio**: Enable audio processing capabilities. + ```bash + pip install "smolagents[audio]" + ``` + +### Remote Execution +Extras for executing code remotely: +- **docker**: Add support for executing code in Docker containers. + ```bash + pip install "smolagents[docker]" + ``` +- **e2b**: Enable E2B support for remote execution. + ```bash + pip install "smolagents[e2b]" + ``` + +### Telemetry and User Interface +Extras for telemetry, monitoring and user interface components: +- **telemetry**: Add support for monitoring and tracing. + ```bash + pip install "smolagents[telemetry]" + ``` +- **gradio**: Add support for interactive Gradio UI components. + ```bash + pip install "smolagents[gradio]" + ``` + +### Complete Installation +To install all available extras, you can use: +```bash +pip install "smolagents[all]" +``` + +## Verifying Installation +After installation, you can verify that `smolagents` is installed correctly by running: +```python +import smolagents +print(smolagents.__version__) +``` + +## Next Steps +Once you have successfully installed `smolagents`, you can: +- Follow the [guided tour](./guided_tour) to learn the basics. +- Explore the [how-to guides](./examples/text_to_sql) for practical examples. +- Read the [conceptual guides](./conceptual_guides/intro_agents) for high-level explanations. +- Check out the [tutorials](./tutorials/building_good_agents) for in-depth tutorials on building agents. 
+- Explore the [API reference](./reference/index) for detailed information on classes and functions. diff --git a/docs/source/en/reference/tools.mdx b/docs/source/en/reference/tools.mdx index a5d217bb8..ee96f71e8 100644 --- a/docs/source/en/reference/tools.mdx +++ b/docs/source/en/reference/tools.mdx @@ -42,6 +42,10 @@ contains the API docs for the underlying classes. [[autodoc]] UserInputTool +### WebSearchTool + +[[autodoc]] WebSearchTool + ### DuckDuckGoSearchTool [[autodoc]] DuckDuckGoSearchTool diff --git a/docs/source/en/tutorials/building_good_agents.mdx b/docs/source/en/tutorials/building_good_agents.mdx index 53bda8f92..a9c2a79ac 100644 --- a/docs/source/en/tutorials/building_good_agents.mdx +++ b/docs/source/en/tutorials/building_good_agents.mdx @@ -397,7 +397,7 @@ This also works with the [`ToolCallingAgent`]. We provide a model for a supplementary planning step, that an agent can run regularly in-between normal action steps. In this step, there is no tool call, the LLM is simply asked to update a list of facts it knows and to reflect on what steps it should take next based on those facts. ```py -from smolagents import load_tool, CodeAgent, InferenceClientModel, DuckDuckGoSearchTool +from smolagents import load_tool, CodeAgent, InferenceClientModel, WebSearchTool from dotenv import load_dotenv load_dotenv() @@ -405,7 +405,7 @@ load_dotenv() # Import tool from Hub image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True) -search_tool = DuckDuckGoSearchTool() +search_tool = WebSearchTool() agent = CodeAgent( tools=[search_tool, image_generation_tool], diff --git a/docs/source/en/tutorials/inspect_runs.mdx b/docs/source/en/tutorials/inspect_runs.mdx index 333db728b..dbc141595 100644 --- a/docs/source/en/tutorials/inspect_runs.mdx +++ b/docs/source/en/tutorials/inspect_runs.mdx @@ -30,7 +30,7 @@ Here's how it then looks like on the platform: First install the required packages. Here we install [Phoenix by Arize AI](https://github.com/Arize-ai/phoenix) because that's a good solution to collect and inspect the logs, but there are other OpenTelemetry-compatible platforms that you could use for this collection & inspection part. ```shell -pip install 'smolagents[telemetry]' +pip install 'smolagents[telemetry,toolkit]' ``` Then run the collector in the background. @@ -54,7 +54,7 @@ Then you can run your agents! 
from smolagents import ( CodeAgent, ToolCallingAgent, - DuckDuckGoSearchTool, + WebSearchTool, VisitWebpageTool, InferenceClientModel, ) @@ -62,7 +62,7 @@ from smolagents import ( model = InferenceClientModel() search_agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], + tools=[WebSearchTool(), VisitWebpageTool()], model=model, name="search_agent", description="This is an agent that can do web search.", @@ -143,7 +143,7 @@ SmolagentsInstrumentor().instrument(tracer_provider=trace_provider) from smolagents import ( CodeAgent, ToolCallingAgent, - DuckDuckGoSearchTool, + WebSearchTool, VisitWebpageTool, InferenceClientModel, ) @@ -153,7 +153,7 @@ model = InferenceClientModel( ) search_agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], + tools=[WebSearchTool(), VisitWebpageTool()], model=model, name="search_agent", description="This is an agent that can do web search.", diff --git a/docs/source/en/tutorials/memory.mdx b/docs/source/en/tutorials/memory.mdx index df982da82..ad35e337b 100644 --- a/docs/source/en/tutorials/memory.mdx +++ b/docs/source/en/tutorials/memory.mdx @@ -83,7 +83,7 @@ Then you should pass this function in the `step_callbacks` argument upon initial ```py CodeAgent( - tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f], + tools=[WebSearchTool(), go_back, close_popups, search_item_ctrl_f], model=model, additional_authorized_imports=["helium"], step_callbacks=[update_screenshot], diff --git a/docs/source/en/tutorials/tools.mdx b/docs/source/en/tutorials/tools.mdx index a6b24d280..cf2b38942 100644 --- a/docs/source/en/tutorials/tools.mdx +++ b/docs/source/en/tutorials/tools.mdx @@ -99,7 +99,7 @@ model_download_tool = load_tool( You can directly import a Gradio Space from the Hub as a tool using the [`Tool.from_space`] method! -You only need to provide the id of the Space on the Hub, its name, and a description that will help you agent understand what the tool does. Under the hood, this will use [`gradio-client`](https://pypi.org/project/gradio-client/) library to call the Space. +You only need to provide the id of the Space on the Hub, its name, and a description that will help your agent understand what the tool does. Under the hood, this will use [`gradio-client`](https://pypi.org/project/gradio-client/) library to call the Space. For instance, let's import the [FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) Space from the Hub and use it to generate an image. 
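A sketch of what that Space import can look like (the `name` and `description` values here are illustrative):

```py
from smolagents import Tool

image_generation_tool = Tool.from_space(
    "black-forest-labs/FLUX.1-dev",
    name="image_generator",
    description="Generate an image from a prompt",
)
image_generation_tool("A sunny beach")
```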
diff --git a/docs/source/hi/examples/multiagents.mdx b/docs/source/hi/examples/multiagents.mdx index 7ee85f92d..1c17312e9 100644 --- a/docs/source/hi/examples/multiagents.mdx +++ b/docs/source/hi/examples/multiagents.mdx @@ -49,9 +49,9 @@ model_id = "Qwen/Qwen2.5-Coder-32B-Instruct" ## 🔍 एक वेब सर्च टूल बनाएं -वेब ब्राउज़िंग के लिए, हम पहले से मौजूद [`DuckDuckGoSearchTool`](https://github.com/huggingface/smolagents/blob/main/src/smolagents/default_tools.py#L151-L176) टूल का उपयोग कर सकते हैं जो Google search के समान सुविधा प्रदान करता है। +वेब ब्राउज़िंग के लिए, हम पहले से मौजूद [`WebSearchTool`] टूल का उपयोग कर सकते हैं जो Google search के समान सुविधा प्रदान करता है। -लेकिन फिर हमें `DuckDuckGoSearchTool` द्वारा खोजे गए पेज को देखने में भी सक्षम होने की आवश्यकता होगी। +लेकिन फिर हमें `WebSearchTool` द्वारा खोजे गए पेज को देखने में भी सक्षम होने की आवश्यकता होगी। ऐसा करने के लिए, हम लाइब्रेरी के बिल्ट-इन `VisitWebpageTool` को इम्पोर्ट कर सकते हैं, लेकिन हम इसे फिर से बनाएंगे यह देखने के लिए कि यह कैसे किया जाता है। तो आइए `markdownify` का उपयोग करके शुरू से अपना `VisitWebpageTool` टूल बनाएं। @@ -113,14 +113,14 @@ from smolagents import ( ToolCallingAgent, InferenceClientModel, ManagedAgent, - DuckDuckGoSearchTool, + WebSearchTool, LiteLLMModel, ) model = InferenceClientModel(model_id=model_id) web_agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), visit_webpage], + tools=[WebSearchTool(), visit_webpage], model=model, max_steps=10, ) diff --git a/docs/source/hi/guided_tour.mdx b/docs/source/hi/guided_tour.mdx index 1c7f5742e..59f3c39db 100644 --- a/docs/source/hi/guided_tour.mdx +++ b/docs/source/hi/guided_tour.mdx @@ -152,9 +152,9 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co आप मैन्युअल रूप से एक टूल का उपयोग उसके आर्ग्यूमेंट्स के साथ कॉल करके कर सकते हैं। ```python -from smolagents import DuckDuckGoSearchTool +from smolagents import WebSearchTool -search_tool = DuckDuckGoSearchTool() +search_tool = WebSearchTool() print(search_tool("Who's the current president of Russia?")) ``` @@ -283,14 +283,14 @@ Microsoft के फ्रेमवर्क [Autogen](https://huggingface.co/pa ऐसा करने के लिए, एजेंट को [`ManagedAgent`] ऑब्जेक्ट में समाहित करें। यह ऑब्जेक्ट `agent`, `name`, और एक `description` जैसे तर्कों की आवश्यकता होती है, जो फिर मैनेजर एजेंट की सिस्टम प्रॉम्प्ट में एम्बेड किया जाता है -यहां एक एजेंट बनाने का उदाहरण दिया गया है जो हमारे [`DuckDuckGoSearchTool`] का उपयोग करके एक विशिष्ट वेब खोज एजेंट को प्रबंधित करता है। +यहां एक एजेंट बनाने का उदाहरण दिया गया है जो हमारे [`WebSearchTool`] का उपयोग करके एक विशिष्ट वेब खोज एजेंट को प्रबंधित करता है। ```py -from smolagents import CodeAgent, InferenceClientModel, DuckDuckGoSearchTool, ManagedAgent +from smolagents import CodeAgent, InferenceClientModel, WebSearchTool, ManagedAgent model = InferenceClientModel() -web_agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model) +web_agent = CodeAgent(tools=[WebSearchTool()], model=model) managed_web_agent = ManagedAgent( agent=web_agent, diff --git a/docs/source/hi/tutorials/building_good_agents.mdx b/docs/source/hi/tutorials/building_good_agents.mdx index 0baa206f6..e074f14ad 100644 --- a/docs/source/hi/tutorials/building_good_agents.mdx +++ b/docs/source/hi/tutorials/building_good_agents.mdx @@ -397,7 +397,7 @@ This also works with the [`ToolCallingAgent`]. 
हम पूरक योजना चरण के लिए एक मॉडल प्रदान करते हैं, जिसे एजेंट सामान्य क्रियाओं के चरणों के बीच नियमित रूप से चला सकता है। इस चरण में कोई टूल कॉल नहीं होती है, LLM से केवल उन तथ्यों की सूची को अपडेट करने के लिए कहा जाता है जो उसे ज्ञात हैं और इन तथ्यों के आधार पर उसे अगले कदमों के बारे में विचार करना होता है। ```py -from smolagents import load_tool, CodeAgent, InferenceClientModel, DuckDuckGoSearchTool +from smolagents import load_tool, CodeAgent, InferenceClientModel, WebSearchTool from dotenv import load_dotenv load_dotenv() @@ -405,7 +405,7 @@ load_dotenv() # Import tool from Hub image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True) -search_tool = DuckDuckGoSearchTool() +search_tool = WebSearchTool() agent = CodeAgent( tools=[search_tool], diff --git a/docs/source/hi/tutorials/inspect_runs.mdx b/docs/source/hi/tutorials/inspect_runs.mdx index 127bca148..a42ecde58 100644 --- a/docs/source/hi/tutorials/inspect_runs.mdx +++ b/docs/source/hi/tutorials/inspect_runs.mdx @@ -56,7 +56,7 @@ SmolagentsInstrumentor().instrument(tracer_provider=trace_provider) from smolagents import ( CodeAgent, ToolCallingAgent, - DuckDuckGoSearchTool, + WebSearchTool, VisitWebpageTool, InferenceClientModel, ) @@ -64,7 +64,7 @@ from smolagents import ( model = InferenceClientModel() managed_agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], + tools=[WebSearchTool(), VisitWebpageTool()], model=model, name="managed_agent", description="This is an agent that can do web search.", diff --git a/docs/source/zh/examples/multiagents.mdx b/docs/source/zh/examples/multiagents.mdx index 567e7573f..a30d9e2b3 100644 --- a/docs/source/zh/examples/multiagents.mdx +++ b/docs/source/zh/examples/multiagents.mdx @@ -50,8 +50,8 @@ model_id = "Qwen/Qwen2.5-Coder-32B-Instruct" ## 🔍 创建网络搜索工具 虽然我们可以使用已经存在的 -[`DuckDuckGoSearchTool`](https://github.com/huggingface/smolagents/blob/main/src/smolagents/default_tools.py#L151-L176) -工具作为谷歌搜索的平替进行网页浏览,然后我们也需要能够查看`DuckDuckGoSearchTool`找到的页面。为此,我 +[`WebSearchTool`] +工具作为谷歌搜索的平替进行网页浏览,然后我们也需要能够查看`WebSearchTool`找到的页面。为此,我 们可以直接导入库的内置 `VisitWebpageTool`。但是我们将重新构建它以了解其工作原理。 @@ -114,14 +114,14 @@ from smolagents import ( ToolCallingAgent, InferenceClientModel, ManagedAgent, - DuckDuckGoSearchTool, + WebSearchTool, LiteLLMModel, ) model = InferenceClientModel(model_id=model_id) web_agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), visit_webpage], + tools=[WebSearchTool(), visit_webpage], model=model, max_steps=10, name="search", diff --git a/docs/source/zh/guided_tour.mdx b/docs/source/zh/guided_tour.mdx index e851b79b8..f294e7a6e 100644 --- a/docs/source/zh/guided_tour.mdx +++ b/docs/source/zh/guided_tour.mdx @@ -221,9 +221,9 @@ agent.run("Could you get me the title of the page at url 'https://huggingface.co 您可以通过调用 [`load_tool`] 函数和要执行的任务手动使用工具。 ```python -from smolagents import DuckDuckGoSearchTool +from smolagents import WebSearchTool -search_tool = DuckDuckGoSearchTool() +search_tool = WebSearchTool() print(search_tool("Who's the current president of Russia?")) ``` @@ -352,14 +352,14 @@ Out[20]: 'ByteDance/AnimateDiff-Lightning' 为此,将 agent 封装在 [`ManagedAgent`] 对象中。此对象需要参数 `agent`、`name` 和 `description`,这些参数将嵌入到管理 agent 的系统提示中,以让它知道如何调用此托管 agent,就像我们对工具所做的那样。 -以下是一个使用我们的 [`DuckDuckGoSearchTool`] 制作一个管理特定网页搜索 agent 的 agent 的示例: +以下是一个使用我们的 [`WebSearchTool`] 制作一个管理特定网页搜索 agent 的 agent 的示例: ```py -from smolagents import CodeAgent, InferenceClientModel, DuckDuckGoSearchTool, ManagedAgent +from smolagents import CodeAgent, 
InferenceClientModel, WebSearchTool, ManagedAgent model = InferenceClientModel() -web_agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model) +web_agent = CodeAgent(tools=[WebSearchTool()], model=model) managed_web_agent = ManagedAgent( agent=web_agent, diff --git a/docs/source/zh/tutorials/building_good_agents.mdx b/docs/source/zh/tutorials/building_good_agents.mdx index a70d251ce..5fca995b7 100644 --- a/docs/source/zh/tutorials/building_good_agents.mdx +++ b/docs/source/zh/tutorials/building_good_agents.mdx @@ -395,7 +395,7 @@ agent.prompt_templates["system_prompt"] = agent.prompt_templates["system_prompt" 我们提供了一个用于补充规划步骤的模型,agent 可以在正常操作步骤之间定期运行。在此步骤中,没有工具调用,LLM 只是被要求更新它知道的事实列表,并根据这些事实反推它应该采取的下一步。 ```py -from smolagents import load_tool, CodeAgent, InferenceClientModel, DuckDuckGoSearchTool +from smolagents import load_tool, CodeAgent, InferenceClientModel, WebSearchTool from dotenv import load_dotenv load_dotenv() @@ -403,7 +403,7 @@ load_dotenv() # 从 Hub 导入工具 image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True) -search_tool = DuckDuckGoSearchTool() +search_tool = WebSearchTool() agent = CodeAgent( tools=[search_tool], diff --git a/docs/source/zh/tutorials/inspect_runs.mdx b/docs/source/zh/tutorials/inspect_runs.mdx index ea3eb659b..f6f4be8b8 100644 --- a/docs/source/zh/tutorials/inspect_runs.mdx +++ b/docs/source/zh/tutorials/inspect_runs.mdx @@ -56,7 +56,7 @@ SmolagentsInstrumentor().instrument() from smolagents import ( CodeAgent, ToolCallingAgent, - DuckDuckGoSearchTool, + WebSearchTool, VisitWebpageTool, InferenceClientModel, ) @@ -64,7 +64,7 @@ from smolagents import ( model = InferenceClientModel() search_agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], + tools=[WebSearchTool(), VisitWebpageTool()], model=model, name="search_agent", description="This is an agent that can do web search.", @@ -145,7 +145,7 @@ SmolagentsInstrumentor().instrument(tracer_provider=trace_provider) from smolagents import ( CodeAgent, ToolCallingAgent, - DuckDuckGoSearchTool, + WebSearchTool, VisitWebpageTool, InferenceClientModel, ) @@ -155,7 +155,7 @@ model = InferenceClientModel( ) search_agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], + tools=[WebSearchTool(), VisitWebpageTool()], model=model, name="search_agent", description="This is an agent that can do web search.", diff --git a/docs/source/zh/tutorials/memory.mdx b/docs/source/zh/tutorials/memory.mdx index de2bdc8c3..900128f37 100644 --- a/docs/source/zh/tutorials/memory.mdx +++ b/docs/source/zh/tutorials/memory.mdx @@ -82,7 +82,7 @@ def update_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None: ```py CodeAgent( - tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f], + tools=[WebSearchTool(), go_back, close_popups, search_item_ctrl_f], model=model, additional_authorized_imports=["helium"], step_callbacks=[update_screenshot], diff --git a/examples/agent_from_any_llm.py b/examples/agent_from_any_llm.py index bc421274c..d5e33f0a1 100644 --- a/examples/agent_from_any_llm.py +++ b/examples/agent_from_any_llm.py @@ -52,6 +52,6 @@ def get_weather(location: str, celsius: bool | None = False) -> str: print("ToolCallingAgent:", agent.run("What's the weather like in Paris?")) -agent = CodeAgent(tools=[get_weather], model=model, verbosity_level=2) +agent = CodeAgent(tools=[get_weather], model=model, verbosity_level=2, stream_outputs=True) print("CodeAgent:", agent.run("What's the weather like in Paris?")) diff --git 
a/examples/gradio_ui.py b/examples/gradio_ui.py index 81c56a1f2..87f532689 100644 --- a/examples/gradio_ui.py +++ b/examples/gradio_ui.py @@ -1,25 +1,15 @@ -from io import BytesIO - -import requests -from PIL import Image - from smolagents import CodeAgent, GradioUI, InferenceClientModel -def add_agent_image(memory_step, agent): - url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/smolagents.png" - response = requests.get(url) - memory_step.observations_images = [Image.open(BytesIO(response.content))] - - agent = CodeAgent( tools=[], model=InferenceClientModel(), verbosity_level=1, planning_interval=3, name="example_agent", - description="This is an example agent that has not tool but will always see an agent at the end of its step.", - step_callbacks=[add_agent_image], + description="This is an example agent.", + step_callbacks=[], + stream_outputs=False, ) GradioUI(agent, file_upload_folder="./data").launch() diff --git a/examples/inspect_multiagent_run.py b/examples/inspect_multiagent_run.py index 95032cd34..c68dccb75 100644 --- a/examples/inspect_multiagent_run.py +++ b/examples/inspect_multiagent_run.py @@ -8,10 +8,10 @@ from smolagents import ( CodeAgent, - DuckDuckGoSearchTool, InferenceClientModel, ToolCallingAgent, VisitWebpageTool, + WebSearchTool, ) @@ -19,7 +19,7 @@ model = InferenceClientModel() search_agent = ToolCallingAgent( - tools=[DuckDuckGoSearchTool(), VisitWebpageTool()], + tools=[WebSearchTool(), VisitWebpageTool()], model=model, name="search_agent", description="This is an agent that can do web search.", diff --git a/examples/multi_llm_agent.py b/examples/multi_llm_agent.py index 186fa06f8..6f44ff8b4 100644 --- a/examples/multi_llm_agent.py +++ b/examples/multi_llm_agent.py @@ -1,12 +1,9 @@ import os -from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMRouterModel +from smolagents import CodeAgent, LiteLLMRouterModel, WebSearchTool -os.environ["OPENAI_API_KEY"] = "" -os.environ["AWS_ACCESS_KEY_ID"] = "" -os.environ["AWS_SECRET_ACCESS_KEY"] = "" -os.environ["AWS_REGION"] = "" +# Make sure to setup the necessary environment variables! llm_loadbalancer_model_list = [ { @@ -42,6 +39,6 @@ model_list=llm_loadbalancer_model_list, client_kwargs={"routing_strategy": "simple-shuffle"}, ) -agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model) +agent = CodeAgent(tools=[WebSearchTool()], model=model, stream_outputs=True) agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") diff --git a/examples/multiple_tools.py b/examples/multiple_tools.py index a2685541f..1a56e3519 100644 --- a/examples/multiple_tools.py +++ b/examples/multiple_tools.py @@ -243,11 +243,12 @@ def search_wikipedia(query: str) -> str: search_wikipedia, ], model=model, + stream_outputs=True, ) # Uncomment the line below to run the agent with a specific query -agent.run("5000 dollars to Euros") +agent.run("Convert 5000 dollars to Euros") # agent.run("What is the weather in New York?") # agent.run("Give me the top news headlines") # agent.run("Tell me a joke") diff --git a/examples/open_deep_research/README.md b/examples/open_deep_research/README.md index c2c799616..bfd372faf 100644 --- a/examples/open_deep_research/README.md +++ b/examples/open_deep_research/README.md @@ -51,4 +51,14 @@ For example, to use the default `o1` model, you need to set the `OPENAI_API_KEY` Then you're good to go! Run the run.py script, as in: ```bash python run.py --model-id "o1" "Your question here!" 
-``` \ No newline at end of file +``` + +## Full reproducibility of results + +The data used in our submissions to GAIA was augmented in this way: + - For each single-page .pdf or .xls file, it was opened in a file reader (MacOS Sonoma Numbers or Preview), and a ".png" screenshot was taken and added to the folder. +- Then for any file used in a question, the file loading system checks if there is a ".png" extension version of the file, and loads it instead of the original if it exists. + +This process was done manually but could be automatized. + +After processing, the annotated was uploaded to a [new dataset](https://huggingface.co/datasets/smolagents/GAIA-annotated). You need to request access (granted instantly). \ No newline at end of file diff --git a/examples/open_deep_research/analysis.ipynb b/examples/open_deep_research/analysis.ipynb index ccb6a1d54..53e7bd430 100644 --- a/examples/open_deep_research/analysis.ipynb +++ b/examples/open_deep_research/analysis.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "# !pip install plotly kaleido datasets nbformat -U -q" + "!pip install plotly kaleido datasets nbformat -U -q" ] }, { @@ -28,7 +28,7 @@ "\n", "pd.set_option(\"max_colwidth\", None)\n", "\n", - "OUTPUT_DIR = \"../../output\"" + "OUTPUT_DIR = \"output\"" ] }, { @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -134,31 +134,6 @@ "# result_df[\"tool_calls\"] = result_df[\"intermediate_steps\"].apply(sum_tool_calls)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_thoughts(x):\n", - " try:\n", - " output = x[0][\"task\"]\n", - " for y in x[1:]:\n", - " try:\n", - " if \"observation\" in y:\n", - " output += y[\"llm_output\"] + \"\\nObservation:\" + y[\"observation\"]\n", - " else:\n", - " output += y[\"llm_output\"] + r\"\\Error:\" + str(y[\"error\"])\n", - " except Exception:\n", - " pass\n", - " return output\n", - " except Exception:\n", - " return None\n", - "\n", - "\n", - "result_df[\"thoughts\"] = result_df[\"intermediate_steps\"].apply(lambda x: get_thoughts(x))" - ] - }, { "cell_type": "code", "execution_count": null, @@ -460,9 +435,9 @@ ], "metadata": { "kernelspec": { - "display_name": "test", + "display_name": "agents", "language": "python", - "name": "test" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/examples/open_deep_research/requirements.txt b/examples/open_deep_research/requirements.txt index 4fe0e0e2a..fe6c98ef2 100644 --- a/examples/open_deep_research/requirements.txt +++ b/examples/open_deep_research/requirements.txt @@ -20,7 +20,6 @@ pypdf>=5.1.0 python-dotenv>=1.0.1 python_pptx>=1.0.2 Requests>=2.32.3 -serpapi>=0.1.5 tqdm>=4.66.4 torch>=2.2.2 torchvision>=0.17.2 @@ -37,4 +36,4 @@ PyPDF2 python-pptx torch xlrd -SpeechRecognition \ No newline at end of file +SpeechRecognition diff --git a/examples/open_deep_research/run_gaia.py b/examples/open_deep_research/run_gaia.py index 192081787..9c7bacd4e 100644 --- a/examples/open_deep_research/run_gaia.py +++ b/examples/open_deep_research/run_gaia.py @@ -1,4 +1,4 @@ -# EXAMPLE COMMAND: python examples/open_deep_research/run_gaia.py --concurrency 32 --run-name generate-traces-03-apr-noplanning --model-id gpt-4o +# EXAMPLE COMMAND: from folder examples/open_deep_research, run: python run_gaia.py --concurrency 32 --run-name generate-traces-03-apr-noplanning --model-id gpt-4o import argparse import json import os @@ 
-6,11 +6,12 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime from pathlib import Path +from typing import Any import datasets import pandas as pd from dotenv import load_dotenv -from huggingface_hub import login +from huggingface_hub import login, snapshot_download from scripts.reformulator import prepare_response from scripts.run_agents import ( get_single_file_description, @@ -49,35 +50,18 @@ def parse_args(): parser.add_argument("--concurrency", type=int, default=8) parser.add_argument("--model-id", type=str, default="o1") parser.add_argument("--run-name", type=str, required=True) + parser.add_argument("--set-to-run", type=str, default="validation") + parser.add_argument("--use-open-models", type=bool, default=False) + parser.add_argument("--use-raw-dataset", action="store_true") return parser.parse_args() ### IMPORTANT: EVALUATION SWITCHES -print("Make sure you deactivated Tailscale VPN, else some URLs will be blocked!") - -USE_OPEN_MODELS = False - -SET = "validation" +print("Make sure you deactivated any VPN like Tailscale, else some URLs will be blocked!") custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"} -### LOAD EVALUATION DATASET - -eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")[SET] -eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"}) - - -def preprocess_file_paths(row): - if len(row["file_name"]) > 0: - row["file_name"] = f"data/gaia/{SET}/" + row["file_name"] - return row - - -eval_ds = eval_ds.map(preprocess_file_paths) -eval_df = pd.DataFrame(eval_ds) -print("Loaded evaluation dataset:") -print(eval_df["task"].value_counts()) user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0" @@ -142,17 +126,54 @@ def create_agent_team(model: Model): return manager_agent +def load_gaia_dataset(use_raw_dataset: bool, set_to_run: str) -> datasets.Dataset: + if not os.path.exists("data/gaia"): + if use_raw_dataset: + snapshot_download( + repo_id="gaia-benchmark/GAIA", + repo_type="dataset", + local_dir="data/gaia", + ignore_patterns=[".gitattributes", "README.md"], + ) + else: + # WARNING: this dataset is gated: make sure you visit the repo to require access. + snapshot_download( + repo_id="smolagents/GAIA-annotated", + repo_type="dataset", + local_dir="data/gaia", + ignore_patterns=[".gitattributes", "README.md"], + ) + + def preprocess_file_paths(row): + if len(row["file_name"]) > 0: + row["file_name"] = f"data/gaia/{set_to_run}/" + row["file_name"] + return row + + eval_ds = datasets.load_dataset( + "data/gaia/GAIA.py", + name="2023_all", + split=set_to_run, + # data_files={"validation": "validation/metadata.jsonl", "test": "test/metadata.jsonl"}, + ) + + eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"}) + eval_ds = eval_ds.map(preprocess_file_paths) + return eval_ds + + def append_answer(entry: dict, jsonl_file: str) -> None: - jsonl_file = Path(jsonl_file) - jsonl_file.parent.mkdir(parents=True, exist_ok=True) + jsonl_path = Path(jsonl_file) + jsonl_path.parent.mkdir(parents=True, exist_ok=True) with append_answer_lock, open(jsonl_file, "a", encoding="utf-8") as fp: fp.write(json.dumps(entry) + "\n") - assert os.path.exists(jsonl_file), "File not found!" - print("Answer exported to file:", jsonl_file.resolve()) + assert jsonl_path.exists(), "File not found!" 
+ print("Answer exported to file:", jsonl_path.resolve()) -def answer_single_question(example, model_id, answers_file, visual_inspection_tool): - model_params = { +def answer_single_question( + example: dict, model_id: str, answers_file: str, visual_inspection_tool: TextInspectorTool +) -> None: + model_params: dict[str, Any] = { "model_id": model_id, "custom_role_conversions": custom_role_conversions, } @@ -162,15 +183,16 @@ def answer_single_question(example, model_id, answers_file, visual_inspection_to else: model_params["max_tokens"] = 4096 model = LiteLLMModel(**model_params) - # model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="together", max_tokens=4096) + # model = InferenceClientModel(model_id="Qwen/Qwen3-32B", provider="novita", max_tokens=4096) document_inspection_tool = TextInspectorTool(model, 100000) agent = create_agent_team(model) augmented_question = """You have one question to answer. It is paramount that you provide a correct answer. -Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded. -Run verification steps if that's needed, you must make sure you find the correct answer! -Here is the task: +Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). +Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded. +Run verification steps if that's needed, you must make sure you find the correct answer! Here is the task: + """ + example["question"] if example["file_name"]: @@ -180,7 +202,7 @@ def answer_single_question(example, model_id, answers_file, visual_inspection_to example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool ) else: - prompt_use_files = "\n\nTo solve the task above, you will have to use this attached file:" + prompt_use_files = "\n\nTo solve the task above, you will have to use this attached file:\n" prompt_use_files += get_single_file_description( example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool ) @@ -241,7 +263,7 @@ def answer_single_question(example, model_id, answers_file, visual_inspection_to append_answer(annotated_example, answers_file) -def get_examples_to_answer(answers_file, eval_ds) -> list[dict]: +def get_examples_to_answer(answers_file: str, eval_ds: datasets.Dataset) -> list[dict]: print(f"Loading answers from {answers_file}...") try: done_questions = pd.read_json(answers_file, lines=True)["question"].tolist() @@ -250,14 +272,18 @@ def get_examples_to_answer(answers_file, eval_ds) -> list[dict]: print("Error when loading records: ", e) print("No usable records! 
▶️ Starting new.") done_questions = [] - return [line for line in eval_ds.to_list() if line["question"] not in done_questions] + return [line for line in eval_ds.to_list() if line["question"] not in done_questions and line["file_name"]] def main(): args = parse_args() print(f"Starting run with arguments: {args}") - answers_file = f"output/{SET}/{args.run_name}.jsonl" + eval_ds = load_gaia_dataset(args.use_raw_dataset, args.set_to_run) + print("Loaded evaluation dataset:") + print(pd.DataFrame(eval_ds)["task"].value_counts()) + + answers_file = f"output/{args.set_to_run}/{args.run_name}.jsonl" tasks_to_run = get_examples_to_answer(answers_file, eval_ds) with ThreadPoolExecutor(max_workers=args.concurrency) as exe: diff --git a/examples/open_deep_research/scripts/run_agents.py b/examples/open_deep_research/scripts/run_agents.py index 37da8a40e..e2e020cb3 100644 --- a/examples/open_deep_research/scripts/run_agents.py +++ b/examples/open_deep_research/scripts/run_agents.py @@ -38,12 +38,13 @@ def get_single_file_description(file_path: str, question: str, visual_inspection ) return file_description elif file_extension in ["pdf", "xls", "xlsx", "docx", "doc", "xml"]: - file_description = f" - Attached document: {file_path}" image_path = file_path.split(".")[0] + ".png" if os.path.exists(image_path): description = get_image_description(image_path, question, visual_inspection_tool) + file_path = image_path else: description = get_document_description(file_path, question, document_inspection_tool) + file_description = f" - Attached document: {file_path}" file_description += f"\n -> File description: {description}" return file_description elif file_extension in ["mp3", "m4a", "wav"]: diff --git a/examples/open_deep_research/visual_vs_text_browser.ipynb b/examples/open_deep_research/visual_vs_text_browser.ipynb index 4a85a465a..1acc5c704 100644 --- a/examples/open_deep_research/visual_vs_text_browser.ipynb +++ b/examples/open_deep_research/visual_vs_text_browser.ipynb @@ -15,7 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install \"smolagents[litellm]\" -q" + "!pip install \"smolagents[litellm,toolkit]\" -q" ] }, { @@ -167,7 +167,7 @@ "source": [ "from scripts.visual_qa import VisualQAGPT4Tool\n", "\n", - "from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel\n", + "from smolagents import CodeAgent, LiteLLMModel, WebSearchTool\n", "from smolagents.vision_web_browser import (\n", " close_popups,\n", " go_back,\n", @@ -183,7 +183,7 @@ "### BUILD AGENTS & TOOLS\n", "\n", "CodeAgent(\n", - " tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],\n", + " tools=[WebSearchTool(), go_back, close_popups, search_item_ctrl_f],\n", " model=proprietary_model,\n", " additional_authorized_imports=[\"helium\"],\n", " step_callbacks=[save_screenshot],\n", diff --git a/examples/rag.py b/examples/rag.py index 3ff572fb3..6f378d11e 100644 --- a/examples/rag.py +++ b/examples/rag.py @@ -58,9 +58,10 @@ def forward(self, query: str) -> str: retriever_tool = RetrieverTool(docs_processed) agent = CodeAgent( tools=[retriever_tool], - model=InferenceClientModel(model_id="meta-llama/Llama-3.3-70B-Instruct"), + model=InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct"), max_steps=4, verbosity_level=2, + stream_outputs=True, ) agent_output = agent.run("For a transformers model training, which is slower, the forward or the backward pass?") diff --git a/examples/sandboxed_execution.py b/examples/sandboxed_execution.py index 25e4fb771..4bd81f029 100644 --- 
a/examples/sandboxed_execution.py +++ b/examples/sandboxed_execution.py @@ -1,12 +1,12 @@ -from smolagents import CodeAgent, DuckDuckGoSearchTool, InferenceClientModel +from smolagents import CodeAgent, InferenceClientModel, WebSearchTool model = InferenceClientModel() -agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model, executor_type="docker") +agent = CodeAgent(tools=[WebSearchTool()], model=model, executor_type="docker") output = agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") print("Docker executor result:", output) -agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model, executor_type="e2b") +agent = CodeAgent(tools=[WebSearchTool()], model=model, executor_type="e2b") output = agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") print("E2B executor result:", output) diff --git a/pyproject.toml b/pyproject.toml index 0db6ab2b8..6b3f2d111 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "smolagents" -version = "1.15.0.dev0" +version = "1.15.0" description = "🤗 smolagents: a barebones library for agents. Agents write python code to call tools or orchestrate other agents." authors = [ { name="Aymeric Roucher", email="aymeric@hf.co" }, @@ -12,13 +12,11 @@ authors = [ readme = "README.md" requires-python = ">=3.10" dependencies = [ - "huggingface-hub>=0.28.0", + "huggingface-hub>=0.30.0", "requests>=2.32.3", "rich>=13.9.4", "jinja2>=3.1.4", - "pillow>=11.0.0", - "markdownify>=0.14.1", - "duckduckgo-search>=6.3.7", + "pillow>=10.0.1", # Security fix for CVE-2023-4863: https://pillow.readthedocs.io/en/stable/releasenotes/10.0.1.html "python-dotenv" ] @@ -65,6 +63,10 @@ telemetry = [ "opentelemetry-exporter-otlp", "openinference-instrumentation-smolagents>=0.1.4" ] +toolkit = [ + "duckduckgo-search>=6.3.7", # DuckDuckGoSearchTool + "markdownify>=0.14.1", # VisitWebpageTool +] transformers = [ "accelerate", "transformers>=4.0.0", @@ -79,7 +81,7 @@ vllm = [ "torch" ] all = [ - "smolagents[audio,docker,e2b,gradio,litellm,mcp,mlx-lm,openai,telemetry,transformers,vision,bedrock]", + "smolagents[audio,docker,e2b,gradio,litellm,mcp,mlx-lm,openai,telemetry,toolkit,transformers,vision,bedrock]", ] quality = [ "ruff>=0.9.0", diff --git a/src/smolagents/__init__.py b/src/smolagents/__init__.py index be4c3c19e..a649b4572 100644 --- a/src/smolagents/__init__.py +++ b/src/smolagents/__init__.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.15.0.dev0" +__version__ = "1.15.0" from .agent_types import * # noqa: I001 from .agents import * # Above noqa avoids a circular dependency due to cli.py diff --git a/src/smolagents/_function_type_hints_utils.py b/src/smolagents/_function_type_hints_utils.py index e5a367c08..d3045420e 100644 --- a/src/smolagents/_function_type_hints_utils.py +++ b/src/smolagents/_function_type_hints_utils.py @@ -38,6 +38,24 @@ ) +IMPORT_TO_PACKAGE_MAPPING = { + "wikipediaapi": "wikipedia-api", +} + + +def get_package_name(import_name: str) -> str: + """ + Return the package name for a given import name. + + Args: + import_name (`str`): Import name to get the package name for. + + Returns: + `str`: Package name for the given import name. 
+ """ + return IMPORT_TO_PACKAGE_MAPPING.get(import_name, import_name) + + def get_imports(code: str) -> list[str]: """ Extracts all the libraries (not relative imports) that are imported in a code. @@ -65,7 +83,7 @@ def get_imports(code: str) -> list[str]: imports += re.findall(r"^\s*from\s+(\S+)\s+import", code, flags=re.MULTILINE) # Only keep the top-level module imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")] - return list(set(imports)) + return [get_package_name(import_name) for import_name in set(imports)] class TypeHintParsingException(Exception): @@ -290,6 +308,14 @@ def _convert_type_hints_to_json_schema(func: Callable, error_on_missing_type_hin else: properties[param_name]["nullable"] = True + # Return: multi‐type union -> treat as any + if ( + "return" in properties + and (return_type := properties["return"].get("type")) + and not isinstance(return_type, str) + ): + properties["return"]["type"] = "any" + schema = {"type": "object", "properties": properties} if required: schema["required"] = required diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py index 8ff8eb230..a276018f5 100644 --- a/src/smolagents/agents.py +++ b/src/smolagents/agents.py @@ -23,7 +23,6 @@ import textwrap import time from abc import ABC, abstractmethod -from collections import deque from collections.abc import Callable, Generator from logging import getLogger from pathlib import Path @@ -57,7 +56,7 @@ TaskStep, ToolCall, ) -from .models import ChatMessage, MessageRole, Model, parse_json_if_needed +from .models import ChatMessage, ChatMessageStreamDelta, MessageRole, Model, parse_json_if_needed from .monitoring import ( YELLOW_HEX, AgentLogger, @@ -177,7 +176,6 @@ class MultiStepAgent(ABC): model (`Callable[[list[dict[str, str]]], ChatMessage]`): Model that will generate the agent's actions. prompt_templates ([`~agents.PromptTemplates`], *optional*): Prompt templates. max_steps (`int`, default `20`): Maximum number of steps the agent can take to solve the task. - tool_parser (`Callable`, *optional*): Function used to parse the tool calls from the LLM output. add_base_tools (`bool`, default `False`): Whether to add the base tools to the agent's tools. verbosity_level (`LogLevel`, default `LogLevel.INFO`): Level of verbosity of the agent's logs. grammar (`dict[str, str]`, *optional*): Grammar used to parse the LLM output. @@ -249,6 +247,7 @@ def __init__( self.monitor = Monitor(self.model, self.logger) self.step_callbacks = step_callbacks if step_callbacks is not None else [] self.step_callbacks.append(self.monitor.update_metrics) + self.stream_outputs = False def _validate_name(self, name: str | None) -> str | None: if name is not None and not is_valid_name(name): @@ -347,11 +346,11 @@ def run( if stream: # The steps are returned as they are executed through a generator to iterate on. - return self._run(task=self.task, max_steps=max_steps, images=images) + return self._run_stream(task=self.task, max_steps=max_steps, images=images) # Outputs are returned only at the end. We only look at the last step. 
- return deque(self._run(task=self.task, max_steps=max_steps, images=images), maxlen=1)[0].final_answer + return list(self._run_stream(task=self.task, max_steps=max_steps, images=images))[-1].final_answer - def _run( + def _run_stream( self, task: str, max_steps: int, images: list["PIL.Image.Image"] | None = None ) -> Generator[ActionStep | PlanningStep | FinalAnswerStep]: final_answer = None @@ -363,16 +362,18 @@ def _run( if self.planning_interval is not None and ( self.step_number == 1 or (self.step_number - 1) % self.planning_interval == 0 ): - planning_step = self._generate_planning_step( + for element in self._generate_planning_step( task, is_first_step=(self.step_number == 1), step=self.step_number - ) - self.memory.steps.append(planning_step) - yield planning_step + ): + yield element + self.memory.steps.append(element) action_step = ActionStep( step_number=self.step_number, start_time=step_start_time, observations_images=images ) try: - final_answer = self._execute_step(task, action_step) + for el in self._execute_step(action_step): + yield el + final_answer = el except AgentGenerationError as e: # Agent generation errors are not caused by a Model error but an implementation error: so we should raise them and exit. raise e @@ -390,12 +391,15 @@ def _run( yield action_step yield FinalAnswerStep(handle_agent_output_types(final_answer)) - def _execute_step(self, task: str, memory_step: ActionStep) -> None | Any: + def _execute_step(self, memory_step: ActionStep) -> Generator[Any]: self.logger.log_rule(f"Step {self.step_number}", level=LogLevel.INFO) - final_answer = self.step(memory_step) + final_answer = None + for el in self._step_stream(memory_step): + final_answer = el + yield el if final_answer is not None and self.final_answer_checks: self._validate_final_answer(final_answer) - return final_answer + yield final_answer def _validate_final_answer(self, final_answer: Any): for check_function in self.final_answer_checks: @@ -428,7 +432,9 @@ def _handle_max_steps_reached(self, task: str, images: list["PIL.Image.Image"], ) return final_answer - def _generate_planning_step(self, task, is_first_step: bool, step: int) -> PlanningStep: + def _generate_planning_step( + self, task, is_first_step: bool, step: int + ) -> Generator[ChatMessageStreamDelta, PlanningStep]: if is_first_step: input_messages = [ { @@ -444,9 +450,15 @@ def _generate_planning_step(self, task, is_first_step: bool, step: int) -> Plann ], } ] - plan_message = self.model(input_messages, stop_sequences=[""]) + if self.stream_outputs and hasattr(self.model, "generate_stream"): + plan_message_content = "" + for completion_delta in self.model.generate_stream(input_messages, stop_sequences=[""]): # type: ignore + plan_message_content += completion_delta.content + yield completion_delta + else: + plan_message_content = self.model.generate(input_messages, stop_sequences=[""]).content plan = textwrap.dedent( - f"""Here are the facts I know and the plan of action that I will follow to solve the task:\n```\n{plan_message.content}\n```""" + f"""Here are the facts I know and the plan of action that I will follow to solve the task:\n```\n{plan_message_content}\n```""" ) else: # Summary mode removes the system prompt and previous planning messages output by the model. 
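For orientation, a short sketch (not part of this patch) of how the streamed elements surface to user code once `run` is called with `stream=True`:

```py
from smolagents import CodeAgent, InferenceClientModel

agent = CodeAgent(tools=[], model=InferenceClientModel())
# With stream=True, run() returns the generator built by _run_stream, so each
# intermediate element can be inspected as it is produced; the last one is a FinalAnswerStep.
for element in agent.run("What is 2 + 2?", stream=True):
    print(type(element).__name__)
```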
@@ -481,16 +493,22 @@ def _generate_planning_step(self, task, is_first_step: bool, step: int) -> Plann ], } input_messages = [plan_update_pre] + memory_messages + [plan_update_post] - plan_message = self.model(input_messages, stop_sequences=[""]) + if self.stream_outputs and hasattr(self.model, "generate_stream"): + plan_message_content = "" + for completion_delta in self.model.generate_stream(input_messages, stop_sequences=[""]): # type: ignore + plan_message_content += completion_delta.content + yield completion_delta + else: + plan_message_content = self.model.generate(input_messages, stop_sequences=[""]).content plan = textwrap.dedent( - f"""I still need to solve the task I was given:\n```\n{self.task}\n```\n\nHere are the facts I know and my new/updated plan of action to solve the task:\n```\n{plan_message.content}\n```""" + f"""I still need to solve the task I was given:\n```\n{self.task}\n```\n\nHere are the facts I know and my new/updated plan of action to solve the task:\n```\n{plan_message_content}\n```""" ) log_headline = "Initial plan" if is_first_step else "Updated plan" self.logger.log(Rule(f"[bold]{log_headline}", style="orange"), Text(plan), level=LogLevel.INFO) - return PlanningStep( + yield PlanningStep( model_input_messages=input_messages, plan=plan, - model_output_message=plan_message, + model_output_message=ChatMessage(role=MessageRole.ASSISTANT, content=plan_message_content), ) @property @@ -523,9 +541,19 @@ def write_memory_to_messages( messages.extend(memory_step.to_messages(summary_mode=summary_mode)) return messages - def visualize(self): - """Creates a rich tree visualization of the agent's structure.""" - self.logger.visualize_agent_tree(self) + def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: + """ + Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. + Yields either None if the step is not final, or the final answer. + """ + raise NotImplementedError("This method should be implemented in child classes") + + def step(self, memory_step: ActionStep) -> Any: + """ + Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. + Returns either None if the step is not final, or the final answer. + """ + return list(self._step_stream(memory_step))[-1] def extract_action(self, model_output: str, split_token: str) -> tuple[str, str]: """ @@ -592,10 +620,9 @@ def provide_final_answer(self, task: str, images: list["PIL.Image.Image"] | None except Exception as e: return f"Error in generating final LLM output:\n{e}" - @abstractmethod - def step(self, memory_step: ActionStep) -> None | Any: - """To be implemented in children classes. Should return either None if the step is not final.""" - pass + def visualize(self): + """Creates a rich tree visualization of the agent's structure.""" + self.logger.visualize_agent_tree(self) def replay(self, detailed: bool = False): """Prints a pretty replay of the agent's steps. @@ -1008,10 +1035,10 @@ def initialize_system_prompt(self) -> str: ) return system_prompt - def step(self, memory_step: ActionStep) -> None | Any: + def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. - Returns None if the step is not final. + Yields either None if the step is not final, or the final answer. 
""" memory_messages = self.write_memory_to_messages() @@ -1066,23 +1093,23 @@ def step(self, memory_step: ActionStep) -> None | Any: answer = tool_arguments else: answer = tool_arguments - if ( - isinstance(answer, str) and answer in self.state.keys() - ): # if the answer is a state variable, return the value + if isinstance(answer, str) and answer in self.state.keys(): + # if the answer is a state variable, return the value + # State variables are not JSON-serializable (AgentImage, AgentAudio) so can't be passed as arguments to execute_tool_call final_answer = self.state[answer] self.logger.log( f"[bold {YELLOW_HEX}]Final answer:[/bold {YELLOW_HEX}] Extracting key '{answer}' from state to return value '{final_answer}'.", level=LogLevel.INFO, ) else: - final_answer = answer + final_answer = self.execute_tool_call("final_answer", {"answer": answer}) self.logger.log( Text(f"Final answer: {final_answer}", style=f"bold {YELLOW_HEX}"), level=LogLevel.INFO, ) memory_step.action_output = final_answer - return final_answer + yield final_answer else: if tool_arguments is None: tool_arguments = {} @@ -1104,7 +1131,7 @@ def step(self, memory_step: ActionStep) -> None | Any: level=LogLevel.INFO, ) memory_step.observations = updated_information - return None + yield None def _substitute_state_variables(self, arguments: dict[str, str] | str) -> dict[str, Any] | str: """Replace string values in arguments with their corresponding state values if they exist.""" @@ -1272,10 +1299,10 @@ def initialize_system_prompt(self) -> str: ) return system_prompt - def step(self, memory_step: ActionStep) -> None | Any: + def _step_stream(self, memory_step: ActionStep) -> Generator[Any]: """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. - Returns None if the step is not final. + Yields either None if the step is not final, or the final answer. """ memory_messages = self.write_memory_to_messages() @@ -1296,13 +1323,14 @@ def step(self, memory_step: ActionStep) -> None | Any: if event.content is not None: output_text += event.content live.update(Markdown(output_text)) + yield event model_output = output_text chat_message = ChatMessage(role="assistant", content=model_output) memory_step.model_output_message = chat_message model_output = chat_message.content else: - chat_message: ChatMessage = self.model( + chat_message: ChatMessage = self.model.generate( input_messages, stop_sequences=["", "Observation:", "Calling tools:"], **additional_args, @@ -1382,7 +1410,7 @@ def step(self, memory_step: ActionStep) -> None | Any: ] self.logger.log(Group(*execution_outputs_console), level=LogLevel.INFO) memory_step.action_output = output - return output if is_final_answer else None + yield output if is_final_answer else None def to_dict(self) -> dict[str, Any]: """Convert the agent to a dictionary representation. diff --git a/src/smolagents/default_tools.py b/src/smolagents/default_tools.py index d12a38d5a..617dee2d1 100644 --- a/src/smolagents/default_tools.py +++ b/src/smolagents/default_tools.py @@ -210,6 +210,87 @@ def forward(self, query: str, filter_year: int | None = None) -> str: return "## Search Results\n" + "\n\n".join(web_snippets) +class WebSearchTool(Tool): + name = "web_search" + description = "Performs a web search for a query and returns a string of the top search results formatted as markdown with titles, links, and descriptions." 
+ inputs = {"query": {"type": "string", "description": "The search query to perform."}} + output_type = "string" + + def __init__(self, max_results=10): + super().__init__() + self.max_results = max_results + + def forward(self, query: str) -> str: + results = self.search_duckduckgo(query) + if len(results) == 0: + raise Exception("No results found! Try a less restrictive/shorter query.") + return self.parse_results(results) + + def parse_results(self, results: list) -> str: + return "## Search Results\n\n" + "\n\n".join( + [f"[{result['title']}]({result['link']})\n{result['description']}" for result in results] + ) + + def search_duckduckgo(self, query: str) -> list: + import requests + + response = requests.get( + "https://lite.duckduckgo.com/lite/", + params={"q": query}, + headers={"User-Agent": "Mozilla/5.0"}, + ) + response.raise_for_status() + parser = self._create_duckduckgo_parser() + parser.feed(response.text) + return parser.results + + def _create_duckduckgo_parser(self): + from html.parser import HTMLParser + + class SimpleResultParser(HTMLParser): + def __init__(self): + super().__init__() + self.results = [] + self.current = {} + self.capture_title = False + self.capture_description = False + self.capture_link = False + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + if tag == "a" and attrs.get("class") == "result-link": + self.capture_title = True + elif tag == "td" and attrs.get("class") == "result-snippet": + self.capture_description = True + elif tag == "span" and attrs.get("class") == "link-text": + self.capture_link = True + + def handle_endtag(self, tag): + if tag == "a" and self.capture_title: + self.capture_title = False + elif tag == "td" and self.capture_description: + self.capture_description = False + elif tag == "span" and self.capture_link: + self.capture_link = False + elif tag == "tr": + # Store current result if all parts are present + if {"title", "description", "link"} <= self.current.keys(): + self.current["description"] = " ".join(self.current["description"]) + self.results.append(self.current) + self.current = {} + + def handle_data(self, data): + if self.capture_title: + self.current["title"] = data.strip() + elif self.capture_description: + self.current.setdefault("description", []) + self.current["description"].append(data.strip()) + elif self.capture_link: + self.current["link"] = "https://" + data.strip() + + return SimpleResultParser() + + class VisitWebpageTool(Tool): name = "visit_webpage" description = ( @@ -227,6 +308,15 @@ def __init__(self, max_output_length: int = 40000): super().__init__() self.max_output_length = max_output_length + def _truncate_content(self, content: str, max_length: int) -> str: + if len(content) <= max_length: + return content + return ( + content[: max_length // 2] + + f"\n..._This content has been truncated to stay below {max_length} characters_...\n" + + content[-max_length // 2 :] + ) + def forward(self, url: str) -> str: try: import re @@ -234,8 +324,6 @@ def forward(self, url: str) -> str: import requests from markdownify import markdownify from requests.exceptions import RequestException - - from smolagents.utils import truncate_content except ImportError as e: raise ImportError( "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`." 
@@ -251,7 +339,7 @@ def forward(self, url: str) -> str: # Remove multiple line breaks markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content) - return truncate_content(markdown_content, self.max_output_length) + return self._truncate_content(markdown_content, self.max_output_length) except requests.exceptions.Timeout: return "The request timed out. Please try again later or check the URL." @@ -405,6 +493,7 @@ def decode(self, outputs): "PythonInterpreterTool", "FinalAnswerTool", "UserInputTool", + "WebSearchTool", "DuckDuckGoSearchTool", "GoogleSearchTool", "VisitWebpageTool", diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index 83fbaff3d..ec089b397 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -16,10 +16,12 @@ import os import re import shutil +from pathlib import Path from smolagents.agent_types import AgentAudio, AgentImage, AgentText from smolagents.agents import MultiStepAgent, PlanningStep from smolagents.memory import ActionStep, FinalAnswerStep, MemoryStep +from smolagents.models import ChatMessageStreamDelta from smolagents.utils import _is_package_available @@ -36,10 +38,14 @@ def get_step_footnote_content(step_log: MemoryStep, step_name: str) -> str: return step_footnote_content -def pull_messages_from_step( - step_log: MemoryStep, -): - """Extract ChatMessage objects from agent steps with proper nesting""" +def pull_messages_from_step(step_log: MemoryStep, skip_model_outputs: bool = False): + """Extract ChatMessage objects from agent steps with proper nesting. + + Args: + step_log: The step log to display as gr.ChatMessage objects. + skip_model_outputs: If True, skip the model outputs when creating the gr.ChatMessage objects: + This is used for instance when streaming model outputs have already been displayed. 
+ """ if not _is_package_available("gradio"): raise ModuleNotFoundError( "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" @@ -49,24 +55,23 @@ def pull_messages_from_step( if isinstance(step_log, ActionStep): # Output the step number step_number = f"Step {step_log.step_number}" if step_log.step_number is not None else "Step" - yield gr.ChatMessage(role="assistant", content=f"**{step_number}**") # First yield the thought/reasoning from the LLM - if hasattr(step_log, "model_output") and step_log.model_output is not None: - # Clean up the LLM output + if not skip_model_outputs: + yield gr.ChatMessage(role="assistant", content=f"**{step_number}**", metadata={"status": "done"}) + elif skip_model_outputs and hasattr(step_log, "model_output") and step_log.model_output is not None: model_output = step_log.model_output.strip() # Remove any trailing and extra backticks, handling multiple possible formats model_output = re.sub(r"```\s*", "```", model_output) # handles ``` model_output = re.sub(r"\s*```", "```", model_output) # handles ``` model_output = re.sub(r"```\s*\n\s*", "```", model_output) # handles ```\n model_output = model_output.strip() - yield gr.ChatMessage(role="assistant", content=model_output) + yield gr.ChatMessage(role="assistant", content=model_output, metadata={"status": "done"}) # For tool calls, create a parent message if hasattr(step_log, "tool_calls") and step_log.tool_calls is not None: first_tool_call = step_log.tool_calls[0] used_code = first_tool_call.name == "python_interpreter" - parent_id = f"call_{len(step_log.tool_calls)}" # Tool call becomes the parent message with timing info # First we will handle arguments based on type @@ -89,7 +94,6 @@ def pull_messages_from_step( content=content, metadata={ "title": f"🛠️ Used tool {first_tool_call.name}", - "id": parent_id, "status": "done", }, ) @@ -128,15 +132,21 @@ def pull_messages_from_step( # Handle standalone errors but not from tool calls if hasattr(step_log, "error") and step_log.error is not None: - yield gr.ChatMessage(role="assistant", content=str(step_log.error), metadata={"title": "💥 Error"}) + yield gr.ChatMessage( + role="assistant", content=str(step_log.error), metadata={"title": "💥 Error", "status": "done"} + ) - yield gr.ChatMessage(role="assistant", content=get_step_footnote_content(step_log, step_number)) + yield gr.ChatMessage( + role="assistant", content=get_step_footnote_content(step_log, step_number), metadata={"status": "done"} + ) yield gr.ChatMessage(role="assistant", content="-----", metadata={"status": "done"}) elif isinstance(step_log, PlanningStep): - yield gr.ChatMessage(role="assistant", content="**Planning step**") - yield gr.ChatMessage(role="assistant", content=step_log.plan) - yield gr.ChatMessage(role="assistant", content=get_step_footnote_content(step_log, "Planning step")) + yield gr.ChatMessage(role="assistant", content="**Planning step**", metadata={"status": "done"}) + yield gr.ChatMessage(role="assistant", content=step_log.plan, metadata={"status": "done"}) + yield gr.ChatMessage( + role="assistant", content=get_step_footnote_content(step_log, "Planning step"), metadata={"status": "done"} + ) yield gr.ChatMessage(role="assistant", content="-----", metadata={"status": "done"}) elif isinstance(step_log, FinalAnswerStep): @@ -145,19 +155,24 @@ def pull_messages_from_step( yield gr.ChatMessage( role="assistant", content=f"**Final answer:**\n{final_answer.to_string()}\n", + metadata={"status": "done"}, ) elif isinstance(final_answer, AgentImage): 
yield gr.ChatMessage( role="assistant", content={"path": final_answer.to_string(), "mime_type": "image/png"}, + metadata={"status": "done"}, ) elif isinstance(final_answer, AgentAudio): yield gr.ChatMessage( role="assistant", content={"path": final_answer.to_string(), "mime_type": "audio/wav"}, + metadata={"status": "done"}, ) else: - yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}") + yield gr.ChatMessage( + role="assistant", content=f"**Final answer:** {str(final_answer)}", metadata={"status": "done"} + ) else: raise ValueError(f"Unsupported step type: {type(step_log)}") @@ -174,6 +189,13 @@ def stream_to_gradio( total_input_tokens = 0 total_output_tokens = 0 + if not _is_package_available("gradio"): + raise ModuleNotFoundError( + "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" + ) + + intermediate_text = "" + for step_log in agent.run( task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args ): @@ -185,10 +207,17 @@ def stream_to_gradio( step_log.input_token_count = agent.model.last_input_token_count step_log.output_token_count = agent.model.last_output_token_count - for message in pull_messages_from_step( - step_log, - ): - yield message + if isinstance(step_log, MemoryStep): + intermediate_text = "" + for message in pull_messages_from_step( + step_log, + # If we're streaming model outputs, no need to display them twice + skip_model_outputs=getattr(agent, "stream_outputs", False), + ): + yield message + elif isinstance(step_log, ChatMessageStreamDelta): + intermediate_text += step_log.content or "" + yield intermediate_text class GradioUI: @@ -200,12 +229,12 @@ def __init__(self, agent: MultiStepAgent, file_upload_folder: str | None = None) "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" ) self.agent = agent - self.file_upload_folder = file_upload_folder + self.file_upload_folder = Path(file_upload_folder) if file_upload_folder is not None else None self.name = getattr(agent, "name") or "Agent interface" self.description = getattr(agent, "description", None) if self.file_upload_folder is not None: - if not os.path.exists(file_upload_folder): - os.mkdir(file_upload_folder) + if not self.file_upload_folder.exists(): + self.file_upload_folder.mkdir(parents=True, exist_ok=True) def interact_with_agent(self, prompt, messages, session_state): import gradio as gr @@ -215,11 +244,22 @@ def interact_with_agent(self, prompt, messages, session_state): session_state["agent"] = self.agent try: - messages.append(gr.ChatMessage(role="user", content=prompt)) + messages.append(gr.ChatMessage(role="user", content=prompt, metadata={"status": "done"})) yield messages for msg in stream_to_gradio(session_state["agent"], task=prompt, reset_agent_memory=False): - messages.append(msg) + if isinstance(msg, gr.ChatMessage): + messages.append(msg) + elif isinstance(msg, str): # Then it's only a completion delta + try: + if messages[-1].metadata["status"] == "pending": + messages[-1].content = msg + else: + messages.append( + gr.ChatMessage(role="assistant", content=msg, metadata={"status": "pending"}) + ) + except Exception as e: + raise e yield messages yield messages @@ -309,12 +349,9 @@ def create_app(self): [upload_status, file_uploads_log], ) - gr.HTML("

Powered by:")
-        with gr.Row():
-            gr.HTML("""""")
+        gr.HTML(
+            "Powered by smolagents
" + ) # Main chat interface chatbot = gr.Chatbot( diff --git a/src/smolagents/models.py b/src/smolagents/models.py index 433f8fbe5..6b9b40fff 100644 --- a/src/smolagents/models.py +++ b/src/smolagents/models.py @@ -139,7 +139,7 @@ def parse_json_if_needed(arguments: str | dict) -> str | dict: @dataclass -class CompletionDelta: +class ChatMessageStreamDelta: content: str | None = None tool_calls: list[ChatMessageToolCall] | None = None @@ -892,7 +892,7 @@ def generate_stream( grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, - ) -> Generator: + ) -> Generator[ChatMessageStreamDelta]: generation_kwargs = self._prepare_completion_args( messages=messages, stop_sequences=stop_sequences, @@ -909,7 +909,7 @@ def generate_stream( # Generate with streaming for new_text in self.streamer: - yield CompletionDelta(content=new_text, tool_calls=None) + yield ChatMessageStreamDelta(content=new_text, tool_calls=None) self.last_output_token_count += 1 self.last_input_token_count = count_prompt_tokens @@ -1044,7 +1044,7 @@ def generate_stream( grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, - ) -> Generator: + ) -> Generator[ChatMessageStreamDelta]: if tools_to_call_from: raise NotImplementedError("Streaming is not yet supported for tool calling") completion_kwargs = self._prepare_completion_kwargs( @@ -1063,7 +1063,7 @@ def generate_stream( if not getattr(event.choices[0], "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") else: - yield CompletionDelta( + yield ChatMessageStreamDelta( content=event.choices[0].delta.content, ) if getattr(event, "usage", None): @@ -1099,7 +1099,7 @@ class LiteLLMRouterModel(LiteLLMModel): Example: ```python >>> import os - >>> from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMRouterModel + >>> from smolagents import CodeAgent, WebSearchTool, LiteLLMRouterModel >>> os.environ["OPENAI_API_KEY"] = "" >>> os.environ["AWS_ACCESS_KEY_ID"] = "" >>> os.environ["AWS_SECRET_ACCESS_KEY"] = "" @@ -1129,7 +1129,7 @@ class LiteLLMRouterModel(LiteLLMModel): ... "routing_strategy":"simple-shuffle" ... } >>> ) - >>> agent = CodeAgent(tools=[DuckDuckGoSearchTool()], model=model) + >>> agent = CodeAgent(tools=[WebSearchTool()], model=model) >>> agent.run("How many seconds would it take for a leopard at full speed to run through Pont des Arts?") ``` """ @@ -1193,6 +1193,9 @@ class InferenceClientModel(ApiModel): api_key (`str`, *optional*): Token to use for authentication. This is a duplicated argument from `token` to make [`InferenceClientModel`] follow the same pattern as `openai.OpenAI` client. Cannot be used if `token` is set. Defaults to None. + bill_to (`str`, *optional*): + The billing account to use for the requests. By default the requests are billed on the user’s account. Requests can only be billed to + an organization the user is a member of, and which has subscribed to Enterprise Hub. **kwargs: Additional keyword arguments to pass to the Hugging Face API. 
@@ -1224,6 +1227,7 @@ def __init__( client_kwargs: dict[str, Any] | None = None, custom_role_conversions: dict[str, str] | None = None, api_key: str | None = None, + bill_to: str | None = None, **kwargs, ): if token is not None and api_key is not None: @@ -1241,6 +1245,7 @@ def __init__( "provider": provider, "token": token, "timeout": timeout, + "bill_to": bill_to, } super().__init__(model_id=model_id, custom_role_conversions=custom_role_conversions, **kwargs) @@ -1280,7 +1285,7 @@ def generate_stream( grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, - ) -> Generator: + ) -> Generator[ChatMessageStreamDelta]: if tools_to_call_from: raise NotImplementedError("Streaming is not yet supported for tool calling") completion_kwargs = self._prepare_completion_kwargs( @@ -1301,7 +1306,7 @@ def generate_stream( if not getattr(event.choices[0], "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") else: - yield CompletionDelta( + yield ChatMessageStreamDelta( content=event.choices[0].delta.content, ) if getattr(event, "usage", None): @@ -1386,7 +1391,7 @@ def generate_stream( grammar: str | None = None, tools_to_call_from: list[Tool] | None = None, **kwargs, - ) -> Generator: + ) -> Generator[ChatMessageStreamDelta]: if tools_to_call_from: raise NotImplementedError("Streaming is not yet supported for tool calling") completion_kwargs = self._prepare_completion_kwargs( @@ -1407,7 +1412,7 @@ def generate_stream( if not getattr(event.choices[0], "finish_reason", None): raise ValueError(f"No content or tool calls in event: {event}") else: - yield CompletionDelta( + yield ChatMessageStreamDelta( content=event.choices[0].delta.content, ) if getattr(event, "usage", None): diff --git a/src/smolagents/remote_executors.py b/src/smolagents/remote_executors.py index acfe70020..aed0445d1 100644 --- a/src/smolagents/remote_executors.py +++ b/src/smolagents/remote_executors.py @@ -53,19 +53,20 @@ def run_code_raise_errors(self, code: str, return_final_answer: bool = False) -> raise NotImplementedError def send_tools(self, tools: dict[str, Tool]): - tool_definition_code = get_tools_definition_code(tools) - - packages_to_install = set() - for tool in tools.values(): - for package in tool.to_dict()["requirements"]: - if package not in self.installed_packages: - packages_to_install.add(package) - self.installed_packages.append(package) - - execution = self.run_code_raise_errors( - f"!pip install {' '.join(packages_to_install)}\n" + tool_definition_code - ) - self.logger.log(execution[1]) + # Install tool packages + packages_to_install = { + pkg + for tool in tools.values() + for pkg in tool.to_dict()["requirements"] + if pkg not in self.installed_packages + ["smolagents"] + } + if packages_to_install: + self.installed_packages += self.install_packages(list(packages_to_install)) + # Get tool definitions + code = get_tools_definition_code(tools) + if code: + execution = self.run_code_raise_errors(code) + self.logger.log(execution[1]) def send_variables(self, variables: dict): """ @@ -86,9 +87,9 @@ def __call__(self, code_action: str) -> tuple[Any, str, bool]: return output[0], output[1], is_final_answer def install_packages(self, additional_imports: list[str]): - additional_imports = additional_imports + ["smolagents"] - _, execution_logs = self.run_code_raise_errors(f"!pip install {' '.join(additional_imports)}") - self.logger.log(execution_logs) + if additional_imports: + _, execution_logs = self.run_code_raise_errors(f"!pip install {' 
'.join(additional_imports)}") + self.logger.log(execution_logs) return additional_imports @@ -217,14 +218,18 @@ def __init__( dockerfile_path = Path(__file__).parent / "Dockerfile" if not dockerfile_path.exists(): with open(dockerfile_path, "w") as f: - f.write("""FROM python:3.12-slim - -RUN pip install jupyter_kernel_gateway requests numpy pandas -RUN pip install jupyter_client notebook - -EXPOSE 8888 -CMD ["jupyter", "kernelgateway", "--KernelGatewayApp.ip='0.0.0.0'", "--KernelGatewayApp.port=8888", "--KernelGatewayApp.allow_origin='*'"] -""") + f.write( + dedent( + """\ + FROM python:3.12-slim + + RUN pip install jupyter_kernel_gateway jupyter_client + + EXPOSE 8888 + CMD ["jupyter", "kernelgateway", "--KernelGatewayApp.ip='0.0.0.0'", "--KernelGatewayApp.port=8888", "--KernelGatewayApp.allow_origin='*'"] + """ + ) + ) _, build_logs = self.client.images.build( path=str(dockerfile_path.parent), dockerfile=str(dockerfile_path), tag=self.image_name ) diff --git a/src/smolagents/vision_web_browser.py b/src/smolagents/vision_web_browser.py index 8886ec97e..ace21d19d 100644 --- a/src/smolagents/vision_web_browser.py +++ b/src/smolagents/vision_web_browser.py @@ -9,7 +9,7 @@ from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys -from smolagents import CodeAgent, DuckDuckGoSearchTool, tool +from smolagents import CodeAgent, WebSearchTool, tool from smolagents.agents import ActionStep from smolagents.cli import load_model @@ -120,7 +120,7 @@ def initialize_driver(): def initialize_agent(model): """Initialize the CodeAgent with the specified model.""" return CodeAgent( - tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f], + tools=[WebSearchTool(), go_back, close_popups, search_item_ctrl_f], model=model, additional_authorized_imports=["helium"], step_callbacks=[save_screenshot], diff --git a/tests/test_agents.py b/tests/test_agents.py index 826c2f2e4..0aca8543e 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -15,11 +15,10 @@ import io import os import tempfile -import unittest import uuid +from collections.abc import Generator from contextlib import nullcontext as does_not_raise from pathlib import Path -from typing import Any from unittest.mock import MagicMock, patch import pytest @@ -620,8 +619,8 @@ def __init__(self, name, tools, description="Mock agent description"): class DummyMultiStepAgent(MultiStepAgent): - def step(self, memory_step: ActionStep) -> None | Any: - return super().step(memory_step) + def step(self, memory_step: ActionStep) -> Generator[None]: + yield None def initialize_system_prompt(self): pass @@ -682,7 +681,7 @@ def test_step_number(self): fake_model.last_input_token_count = 10 fake_model.last_output_token_count = 20 max_steps = 2 - agent = DummyMultiStepAgent(tools=[], model=fake_model, max_steps=max_steps) + agent = CodeAgent(tools=[], model=fake_model, max_steps=max_steps) assert hasattr(agent, "step_number"), "step_number attribute should be defined" assert agent.step_number == 0, "step_number should be initialized to 0" agent.run("Test task") @@ -719,7 +718,8 @@ def test_planning_step(self, step, expected_messages_list): model=fake_model, ) task = "Test task" - planning_step = agent._generate_planning_step(task, is_first_step=(step == 1), step=step) + + planning_step = list(agent._generate_planning_step(task, is_first_step=(step == 1), step=step))[-1] expected_message_texts = { "INITIAL_PLAN_USER_PROMPT": populate_template( agent.prompt_templates["planning"]["initial_plan"], @@ -764,8 +764,8 
@@ def test_planning_step(self, step, expected_messages_list): for content, expected_content in zip(message["content"], expected_message["content"]): assert content == expected_content # Test calls to model - assert len(fake_model.call_args_list) == 1 - for call_args, expected_messages in zip(fake_model.call_args_list, expected_messages_list): + assert len(fake_model.generate.call_args_list) == 1 + for call_args, expected_messages in zip(fake_model.generate.call_args_list, expected_messages_list): assert len(call_args.args) == 1 messages = call_args.args[0] assert isinstance(messages, list) @@ -962,7 +962,7 @@ def test_from_dict(self): assert agent.max_steps == 30 -class TestToolCallingAgent(unittest.TestCase): +class TestToolCallingAgent: @patch("huggingface_hub.InferenceClient") def test_toolcalling_agent_api(self, mock_inference_client): mock_client = mock_inference_client.return_value @@ -1039,6 +1039,57 @@ def test_toolcalling_agent_api_misformatted_output(self, mock_inference_client): assert "Error while parsing" in capture.get() assert len(agent.memory.steps) == 4 + def test_change_tools_after_init(self): + from smolagents import tool + + @tool + def fake_tool_1() -> str: + """Fake tool""" + return "1" + + @tool + def fake_tool_2() -> str: + """Fake tool""" + return "2" + + class FakeCodeModel(Model): + def generate(self, messages, tools_to_call_from=None, stop_sequences=None, grammar=None): + if len(messages) < 3: + return ChatMessage( + role="assistant", + content="", + tool_calls=[ + ChatMessageToolCall( + id="call_0", + type="function", + function=ChatMessageToolCallDefinition(name="fake_tool_1", arguments={}), + ) + ], + ) + else: + tool_result = messages[-1]["content"][0]["text"].removeprefix("Observation:\n") + return ChatMessage( + role="assistant", + content="", + tool_calls=[ + ChatMessageToolCall( + id="call_1", + type="function", + function=ChatMessageToolCallDefinition( + name="final_answer", arguments={"answer": tool_result} + ), + ) + ], + ) + + agent = ToolCallingAgent(tools=[fake_tool_1], model=FakeCodeModel()) + + agent.tools["final_answer"] = CustomFinalAnswerTool() + agent.tools["fake_tool_1"] = fake_tool_2 + + answer = agent.run("Fake task.") + assert answer == "2CUSTOM" + class TestCodeAgent: @pytest.mark.parametrize("provide_run_summary", [False, True]) diff --git a/tests/test_function_type_hints_utils.py b/tests/test_function_type_hints_utils.py index fdb55f200..13b279069 100644 --- a/tests/test_function_type_hints_utils.py +++ b/tests/test_function_type_hints_utils.py @@ -383,8 +383,8 @@ def test_union_types(self, union_types_func): return_prop = schema["function"]["return"] # Check union in parameter assert len(value_prop["type"]) == 2 - # Check union in return type - assert len(return_prop["type"]) == 2 + # Check union in return type: should be converted to "any" + assert return_prop["type"] == "any" def test_nested_types(self, nested_types_func): """Test schema generation for nested complex types.""" diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py index 41bbc8b8e..8d2f50c0a 100644 --- a/tests/test_monitoring.py +++ b/tests/test_monitoring.py @@ -137,7 +137,7 @@ def test_streaming_agent_text_output(self): # Use stream_to_gradio to capture the output outputs = list(stream_to_gradio(agent, task="Test task")) - self.assertEqual(len(outputs), 11) + self.assertEqual(len(outputs), 10) plan_message = outputs[1] self.assertEqual(plan_message.role, "assistant") self.assertIn("Code:", plan_message.content) @@ -161,7 +161,7 @@ def 
test_streaming_agent_image_output(self): ) ) - self.assertEqual(len(outputs), 6) + self.assertEqual(len(outputs), 5) final_message = outputs[-1] self.assertEqual(final_message.role, "assistant") self.assertIsInstance(final_message.content, dict) @@ -182,7 +182,7 @@ def generate(self, prompt, **kwargs): # Use stream_to_gradio to capture the output outputs = list(stream_to_gradio(agent, task="Test task")) - self.assertEqual(len(outputs), 13) + self.assertEqual(len(outputs), 11) final_message = outputs[-1] self.assertEqual(final_message.role, "assistant") self.assertIn("Malformed call", final_message.content) diff --git a/tests/test_remote_executors.py b/tests/test_remote_executors.py index f7fe05ed2..20f9fbae8 100644 --- a/tests/test_remote_executors.py +++ b/tests/test_remote_executors.py @@ -7,13 +7,33 @@ import pytest from rich.console import Console +from smolagents.default_tools import WikipediaSearchTool from smolagents.monitoring import AgentLogger, LogLevel -from smolagents.remote_executors import DockerExecutor, E2BExecutor +from smolagents.remote_executors import DockerExecutor, E2BExecutor, RemotePythonExecutor from smolagents.utils import AgentError from .utils.markers import require_run_all +class TestRemotePythonExecutor: + def test_send_tools_empty_tools(self): + executor = RemotePythonExecutor(additional_imports=[], logger=MagicMock()) + executor.run_code_raise_errors = MagicMock() + executor.send_tools({}) + assert executor.run_code_raise_errors.call_count == 1 + # No new packages should be installed + assert "!pip install" not in executor.run_code_raise_errors.call_args.args[0] + + @require_run_all + def test_send_tools_with_default_wikipedia_search_tool(self): + tool = WikipediaSearchTool() + executor = RemotePythonExecutor(additional_imports=[], logger=MagicMock()) + executor.run_code_raise_errors = MagicMock() + executor.send_tools({"wikipedia_search": tool}) + assert executor.run_code_raise_errors.call_count == 1 + assert "!pip install wikipedia-api" in executor.run_code_raise_errors.call_args.args[0] + + class TestE2BExecutorMock: def test_e2b_executor_instantiation(self): logger = MagicMock() diff --git a/tests/test_tools.py b/tests/test_tools.py index f82c08753..e9f842f6c 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -551,6 +551,20 @@ def test_function(items: list[str]) -> str: # Original function should not have 'self' parameter assert "self" not in original_signature.parameters + def test_tool_with_union_type_return(self): + @tool + def union_type_return_tool_function(param: int) -> str | bool: + """ + Tool with output union type. + + Args: + param: Input parameter. + """ + return str(param) if param > 0 else False + + assert isinstance(union_type_return_tool_function, Tool) + assert union_type_return_tool_function.output_type == "any" + @pytest.fixture def mock_server_parameters():
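The new `test_tool_with_union_type_return` test pins down the behavior for union return annotations: instead of surfacing a two-element type list, the decorated tool now advertises the generic `"any"` output type. A small illustration of that behavior (the function itself is only for demonstration):

```python
from smolagents import tool

@tool
def flexible_result(value: int) -> str | bool:
    """Return the value as a string when positive, otherwise False.

    Args:
        value: The number to inspect.
    """
    return str(value) if value > 0 else False

# Union return annotations are mapped to the generic "any" output type.
print(flexible_result.output_type)  # "any"
print(flexible_result(3))           # "3"
```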