diff --git a/config/agent_gaia-validation-gpt5.yaml b/config/agent_gaia-validation-gpt5.yaml new file mode 100644 index 00000000..f0d78f18 --- /dev/null +++ b/config/agent_gaia-validation-gpt5.yaml @@ -0,0 +1,79 @@ +defaults: + - benchmark: gaia-validation + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "GPT5OpenAIClient" + model_name: "gpt-5" + async_client: true + temperature: 1.0 + top_p: 1.0 + min_p: 0.0 + top_k: -1 + max_tokens: 128000 + reasoning_effort: "high" + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}" + openrouter_provider: "" + disable_cache_control: true + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reasoning + + max_turns: -1 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: true + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: true + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: + agent-worker: + prompt_class: SubAgentWorkerPrompt + llm: + provider_class: "GPT5OpenAIClient" + model_name: "gpt-5" + async_client: true + temperature: 1.0 + top_p: 1.0 + min_p: 0.0 + top_k: -1 + max_tokens: 128000 + reasoning_effort: "medium" + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}" + openrouter_provider: "" + disable_cache_control: true + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - 
tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + + max_turns: -1 # Maximum number of turns for sub-agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + diff --git a/config/tool/tool-reading.yaml b/config/tool/tool-reading.yaml index 78fcb0e1..5038c4c6 100644 --- a/config/tool/tool-reading.yaml +++ b/config/tool/tool-reading.yaml @@ -2,4 +2,7 @@ name: "tool-reading" tool_command: "python" args: - "-m" - - "src.tool.mcp_servers.reading_mcp_server" \ No newline at end of file + - "src.tool.mcp_servers.reading_mcp_server" +env: + SERPER_API_KEY: "${oc.env:SERPER_API_KEY}" + JINA_API_KEY: "${oc.env:JINA_API_KEY}" \ No newline at end of file diff --git a/docs/mkdocs/docs/gaia_validation_gpt5.md b/docs/mkdocs/docs/gaia_validation_gpt5.md new file mode 100644 index 00000000..1607e477 --- /dev/null +++ b/docs/mkdocs/docs/gaia_validation_gpt5.md @@ -0,0 +1,56 @@ +# GAIA Validation - GPT5 + +MiroFlow now supports GPT-5 with MCP tool invocation, providing a unified workflow for multi-step reasoning, information integration, and scalable tool coordination. + +!!! info "Prerequisites" + Before proceeding, please review the [GAIA Validation Prerequisites](gaia_validation_prerequisites.md) document, which covers common setup requirements, dataset preparation, and API key configuration. + +--- + +## Running the Evaluation + +### Step 1: Dataset Preparation + +Follow the [dataset preparation instructions](gaia_validation_prerequisites.md#dataset-preparation) in the prerequisites document. 
+ +### Step 2: API Keys Configuration + +Configure the following API keys in your `.env` file: + +```env title="GPT-5 .env Configuration" +# Search and web scraping capabilities +SERPER_API_KEY="your-serper-api-key" +JINA_API_KEY="your-jina-api-key" + +# Code execution environment +E2B_API_KEY="your-e2b-api-key" + +# Vision understanding capabilities +ANTHROPIC_API_KEY="your-anthropic-api-key" +GEMINI_API_KEY="your-gemini-api-key" + +# Primary LLM provider, LLM judge, reasoning, and hint generation +OPENAI_API_KEY="your-openai-api-key" +OPENAI_BASE_URL="https://api.openai.com/v1" + +``` + +### Step 3: Run the Evaluation + +Execute the evaluation using the GPT-5 configuration: + +```bash title="Run GAIA Validation with GPT-5" +uv run main.py common-benchmark \ + --config_file_name=agent_gaia-validation-gpt5 \ + output_dir="logs/gaia-validation-gpt5/$(date +"%Y%m%d_%H%M")" +``` + +### Step 4: Monitor Progress + +Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#progress-monitoring-and-resume) in the prerequisites document. + + +--- + +!!! info "Documentation Info" + **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI \ No newline at end of file diff --git a/docs/mkdocs/docs/llm_clients_overview.md b/docs/mkdocs/docs/llm_clients_overview.md index b9894c58..08f3064d 100644 --- a/docs/mkdocs/docs/llm_clients_overview.md +++ b/docs/mkdocs/docs/llm_clients_overview.md @@ -9,6 +9,7 @@ MiroFlow supports multiple LLM providers through a unified client interface. 
Eac | `ClaudeAnthropicClient` | Anthropic Direct | claude-3-7-sonnet | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL` | | `ClaudeOpenRouterClient` | OpenRouter | anthropic/claude-3.7-sonnet, and other [supported models](https://openrouter.ai/models) | `OPENROUTER_API_KEY`, `OPENROUTER_BASE_URL` | | `GPTOpenAIClient` | OpenAI | gpt-4, gpt-3.5 | `OPENAI_API_KEY`, `OPENAI_BASE_URL` | +| `GPT5OpenAIClient` | OpenAI | gpt-5 | `OPENAI_API_KEY`, `OPENAI_BASE_URL` | | `MiroThinkerSGLangClient` | SGLang | MiroThinker series | `OAI_MIROTHINKER_API_KEY`, `OAI_MIROTHINKER_BASE_URL` | ## Basic Configuration @@ -31,4 +32,4 @@ main_agent: --- !!! info "Documentation Info" - **Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI \ No newline at end of file + **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI \ No newline at end of file diff --git a/docs/mkdocs/docs/openai-gpt.md b/docs/mkdocs/docs/openai-gpt.md index c3636913..5e181271 100644 --- a/docs/mkdocs/docs/openai-gpt.md +++ b/docs/mkdocs/docs/openai-gpt.md @@ -1,8 +1,45 @@ # OpenAI GPT Models -OpenAI's latest models including GPT-4o and advanced reasoning models with strong coding, vision, and reasoning capabilities. +OpenAI's latest models including GPT-5, GPT-4o and advanced reasoning models with strong coding, vision, and reasoning capabilities. -## Client Used +## Client Used for GPT-5 + +`GPT5OpenAIClient` + +## Environment Setup + +```bash title="Environment Variables" +export OPENAI_API_KEY="your-openai-key" +export OPENAI_BASE_URL="https://api.openai.com/v1" # optional +``` + +## Configuration + +```yaml title="Agent Configuration" +main_agent: + llm: + provider_class: "GPT5OpenAIClient" + model_name: "gpt-5" + async_client: true + temperature: 1.0 + top_p: 1.0 + min_p: 0.0 + top_k: -1 + max_tokens: 128000 + reasoning_effort: "high" # Use high in the main agent, and use the default medium in the sub-agent. 
+ openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}" +``` + +## Usage + +```bash title="Example Command" +# Create custom OpenAI config +uv run main.py trace --config_file_name=your_config_file \ + --task="Your task" --task_file_name="data/file.txt" +``` + +## Client Used for GPT-4o `GPTOpenAIClient` @@ -32,7 +69,10 @@ uv run main.py trace --config_file_name=your_config_file \ --task="Your task" --task_file_name="data/file.txt" ``` +!!! note "Configuration Notes" + - `GPTOpenAIClient` also supports GPT-5, but it has not been fully validated on MiroFlow yet. We recommend using `GPT5OpenAIClient`. + --- !!! info "Documentation Info" - **Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI \ No newline at end of file + **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI \ No newline at end of file diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index a6db0940..7f9f7840 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -52,6 +52,7 @@ nav: - GAIA-Validation: - Prerequisites: gaia_validation_prerequisites.md - Claude-3.7-Sonnet: gaia_validation_claude37sonnet.md + - GPT-5: gaia_validation_gpt5.md - MiroThinker: gaia_validation_mirothinker.md - GAIA-Validation-Text-Only: gaia_validation_text_only.md - GAIA-Test: gaia_test.md diff --git a/src/llm/provider_client_base.py b/src/llm/provider_client_base.py index 5d07fcec..9333054e 100644 --- a/src/llm/provider_client_base.py +++ b/src/llm/provider_client_base.py @@ -43,6 +43,7 @@ def __post_init__(self): self.top_p: float = self.cfg.llm.top_p self.min_p: float = self.cfg.llm.min_p self.top_k: int = self.cfg.llm.top_k + self.reasoning_effort: str = self.cfg.llm.get("reasoning_effort", "medium") self.repetition_penalty: float = self.cfg.llm.get("repetition_penalty", 1.0) self.max_tokens: int = self.cfg.llm.max_tokens self.max_context_length: int = self.cfg.llm.get("max_context_length", -1) 
class ContextLimitError(Exception):
    """Signal that a request or response hit the model's context/token limit.

    ``GPT5OpenAIClient._create_message`` raises this error and excludes it
    from its retry policy, so it reaches the caller on the first occurrence.
    """


@dataclasses.dataclass
class GPT5OpenAIClient(LLMProviderClientBase):
    """LLM provider client that talks to GPT-5 via the chat-completions API.

    Tool calls are not sent through the API's native ``tools`` parameter:
    the assistant's text is parsed for tool calls
    (see ``extract_tool_calls_info``) and tool results are fed back as plain
    ``user`` messages (see ``update_message_history``).
    """

    def _create_client(self, config: DictConfig):
        """Create the OpenAI client selected by ``self.async_client``.

        :param config: Unused; credentials are read from ``self.cfg.llm``.
        :return: ``AsyncOpenAI`` when ``self.async_client`` is truthy,
            otherwise ``OpenAI``.
        """
        client_cls = AsyncOpenAI if self.async_client else OpenAI
        return client_cls(
            api_key=self.cfg.llm.openai_api_key,
            base_url=self.cfg.llm.openai_base_url,
            # Generous request timeout (seconds); presumably sized for long
            # high-effort reasoning calls.
            timeout=1800,
        )

    @retry(
        wait=wait_exponential(multiplier=5),
        stop=stop_after_attempt(5),
        # tenacity also intercepts BaseException subclasses, so cancellation
        # must be excluded explicitly or a cancelled call would be retried.
        retry=retry_if_not_exception_type(
            (ContextLimitError, asyncio.CancelledError)
        ),
    )
    async def _create_message(
        self,
        system_prompt: str,
        messages: List[Dict[str, Any]],
        tools_definitions,
        keep_tool_result: int = -1,
    ):
        """Send the conversation to the chat-completions API.

        :param system_prompt: System prompt string. When non-empty it is
            written into ``messages`` in place as the first message
            (replacing an existing leading system/developer message).
        :param messages: Message history list (mutated as described above).
        :param tools_definitions: Unused here; tool calls are parsed from the
            response text instead of being passed to the API.
        :param keep_tool_result: Forwarded to
            ``_remove_tool_result_from_messages``.
        :return: The chat-completions response object.
        :raises ContextLimitError: If the response was truncated
            (``finish_reason == "length"``) or the API error text indicates
            that the context limit was exceeded. Never retried.
        :raises asyncio.CancelledError: Propagated unchanged on cancellation.
            Never retried.
        :raises ValueError: If ``self.model_name`` is not a supported GPT-5
            model name.
        :raises Exception: Any other failure (including an empty response);
            these are retried by the ``@retry`` policy.
        """
        logger.debug(f" Calling LLM ({'async' if self.async_client else 'sync'})")
        # The chat-completions API has no separate system-prompt field, so the
        # system prompt travels as the first message of the conversation.
        if system_prompt:
            system_message = {
                "role": "system",
                "content": [dict(type="text", text=system_prompt)],
            }
            if messages and messages[0]["role"] in ("system", "developer"):
                # Replace the existing message so the role is always "system".
                messages[0] = system_message
            else:
                messages.insert(0, system_message)

        messages_copy = self._remove_tool_result_from_messages(
            messages, keep_tool_result
        )

        # Apply cache control
        if self.disable_cache_control:
            processed_messages = messages_copy
        else:
            processed_messages = self._apply_cache_control(messages_copy)

        params = None
        try:
            # Build the provider routing part of extra_body from
            # self.openrouter_provider (empty string means "no routing").
            provider_config = (self.openrouter_provider or "").strip().lower()
            logger.info(f"provider_config: {provider_config}")
            if provider_config == "google":
                extra_body = {
                    "provider": {
                        "only": [
                            "google-vertex/us",
                            "google-vertex/europe",
                            "google-vertex/global",
                        ]
                    }
                }
            elif provider_config == "anthropic":
                extra_body = {"provider": {"only": ["anthropic"]}}
            elif provider_config == "amazon":
                extra_body = {"provider": {"only": ["amazon-bedrock"]}}
            elif provider_config != "":
                extra_body = {"provider": {"only": [provider_config]}}
            else:
                extra_body = {}

            # Sampling parameters without a first-class request field are
            # passed through extra_body, and only when set to non-defaults.
            if self.top_k != -1:
                extra_body["top_k"] = self.top_k
            if self.min_p != 0.0:
                extra_body["min_p"] = self.min_p
            if self.repetition_penalty != 1.0:
                extra_body["repetition_penalty"] = self.repetition_penalty

            # Explicit check instead of `assert`, which is stripped under -O.
            if self.model_name not in ("gpt-5-2025-08-07", "gpt-5"):
                raise ValueError(
                    f"GPT5OpenAIClient does not support model: {self.model_name!r}"
                )

            params = {
                "model": self.model_name,
                "temperature": self.temperature,
                "max_completion_tokens": self.max_tokens,
                "messages": processed_messages,
                "stream": False,
                "extra_body": extra_body,
                "reasoning_effort": self.reasoning_effort,
            }

            # Add optional parameters only if they have non-default values
            if self.top_p != 1.0:
                params["top_p"] = self.top_p

            response = await self._create_completion(params, self.async_client)

            if response is None or not response.choices:
                logger.debug(f"LLM call failed: response = {response}")
                raise Exception(f"LLM call failed [rare case]: response = {response}")

            first_choice = response.choices[0]

            if first_choice.finish_reason == "length":
                logger.debug(
                    "LLM finish_reason is 'length', triggering ContextLimitError"
                )
                # Caught by the generic handler below, which recognises the
                # "maximum context length" marker and re-wraps the error.
                raise ContextLimitError(
                    "(finish_reason=length) Response truncated due to maximum context length"
                )

            # `content` may be None, so normalise before stripping.
            if (
                first_choice.finish_reason == "stop"
                and (first_choice.message.content or "").strip() == ""
            ):
                logger.debug(
                    "LLM finish_reason is 'stop', but content is empty, triggering Error"
                )
                raise Exception("LLM finish_reason is 'stop', but content is empty")

            logger.debug(
                f"LLM call finish_reason: {getattr(first_choice, 'finish_reason', 'N/A')}"
            )
            return response
        except asyncio.CancelledError:
            # Cancellation must propagate unchanged; converting it into a
            # regular Exception would make the retry policy re-issue the call
            # from a task that has already been cancelled.
            logger.debug("[WARNING] LLM API call was cancelled during execution")
            raise
        except Exception as e:
            error_str = str(e)
            context_limit_markers = (
                "Input is too long for requested model",
                "input length and `max_tokens` exceed context limit",
                "maximum context length",
                "prompt is too long",
                "exceeds the maximum length",
                "exceeds the maximum allowed length",
                "Input tokens exceed the configured limit",
            )
            if any(marker in error_str for marker in context_limit_markers):
                logger.debug(f"GPT-5 LLM context limit exceeded: {error_str}")
                raise ContextLimitError(f"Context limit exceeded: {error_str}")

            # default=str keeps a non-serialisable payload from raising inside
            # the handler and hiding the original error.
            logger.error(
                f"GPT-5 LLM call failed: {error_str}, input = {json.dumps(params, default=str)}",
                exc_info=True,
            )
            raise

    async def _create_completion(self, params: Dict[str, Any], is_async: bool):
        """Create a chat completion with ``self.client``.

        :param params: Keyword arguments for ``chat.completions.create``.
        :param is_async: Whether ``self.client`` is the async client and the
            call therefore has to be awaited.
        :return: The chat-completions response object.
        """
        if is_async:
            return await self.client.chat.completions.create(**params)
        return self.client.chat.completions.create(**params)

    def _clean_user_content_from_response(self, text: str) -> str:
        """Strip a generated ``User:`` turn from an assistant response.

        Removes everything from a ``"\\n\\nUser:"`` marker up to (but not
        including) the next ``<use_mcp_tool>`` tag, or up to the end of the
        text when no such tag follows.

        :param text: Raw assistant response text.
        :return: The text with the matched spans removed.
        """
        # NOTE(review): the delimiter tag is assumed to be `<use_mcp_tool>`
        # (the tool-call tag handled by parse_llm_response_for_tool_calls) -
        # confirm against src.utils.parsing_utils.
        # `\Z` without MULTILINE anchors at the true end of the text; `$` with
        # MULTILINE stopped the match at the end of the first line instead.
        pattern = r"\n\nUser:.*?(?=<use_mcp_tool>|\Z)"
        return re.sub(pattern, "", text, flags=re.DOTALL)

    def process_llm_response(
        self, llm_response, message_history, agent_type="main"
    ) -> tuple[str, bool]:
        """Extract the assistant text and append it to ``message_history``.

        :param llm_response: Chat-completions response object.
        :param message_history: Message list; an assistant message is appended
            in place whenever the response has at least one choice.
        :param agent_type: Unused in this client.
        :return: ``(assistant_text, should_exit)``. ``should_exit`` is ``True``
            only when the response is missing or has no choices.
        """
        if not llm_response or not llm_response.choices:
            error_msg = "LLM did not return a valid response."
            logger.error(f"Should never happen: {error_msg}")
            return "", True  # Exit loop

        choice = llm_response.choices[0]
        finish_reason = choice.finish_reason

        if finish_reason == "stop":
            # Remove any "User: ..." continuation produced by the model.
            assistant_response_text = self._clean_user_content_from_response(
                choice.message.content or ""
            )
        elif finish_reason == "length":
            assistant_response_text = choice.message.content or ""
            if assistant_response_text == "":
                assistant_response_text = "LLM response is empty. This is likely due to thinking block used up all tokens."
            else:
                assistant_response_text = self._clean_user_content_from_response(
                    assistant_response_text
                )
        else:
            logger.error(f"Unsupported finish reason: {finish_reason}")
            # f-string instead of `+` so a None finish_reason cannot raise.
            assistant_response_text = (
                f"Successful response, but unsupported finish reason: {finish_reason}"
            )

        message_history.append(
            {"role": "assistant", "content": assistant_response_text}
        )
        logger.debug(f"LLM Response: {assistant_response_text}")

        return assistant_response_text, False

    def extract_tool_calls_info(self, llm_response, assistant_response_text):
        """Extract tool call information from the assistant's response text.

        :param llm_response: Unused; tool calls are not read from the response
            object.
        :param assistant_response_text: Text to scan for tool calls.
        :return: Whatever ``parse_llm_response_for_tool_calls`` returns.
        """
        from src.utils.parsing_utils import parse_llm_response_for_tool_calls

        # This client does not use native tool calling; tool calls are
        # embedded in, and parsed from, the response text.
        return parse_llm_response_for_tool_calls(assistant_response_text)

    def update_message_history(
        self, message_history, tool_call_info, tool_calls_exceeded=False
    ):
        """Append the tool results of one turn as a single ``user`` message.

        :param message_history: Message list, mutated in place.
        :param tool_call_info: Iterable of ``(tool_id, content)`` pairs where
            ``content`` is a dict with ``"type"`` and ``"text"`` keys. Only
            ``"text"`` results are used; ``tool_id == "FAILED"`` marks a bad
            call.
        :param tool_calls_exceeded: Whether the model requested more tool
            calls than were processed; only affects the multi-call header.
        :return: The same ``message_history`` list.
        """
        # Filter tool call results with type "text"
        text_results = [item for item in tool_call_info if item[1]["type"] == "text"]

        # Separate valid tool calls and bad tool calls
        valid_tool_calls = [
            (tool_id, content)
            for tool_id, content in text_results
            if tool_id != "FAILED"
        ]
        bad_tool_calls = [
            (tool_id, content)
            for tool_id, content in text_results
            if tool_id == "FAILED"
        ]

        output_parts = []

        if len(text_results) > 1:
            # Multiple tool calls: prepend a summary, then label each result.
            if tool_calls_exceeded:
                output_parts.append(
                    f"You made too many tool calls. I can only afford to process {len(valid_tool_calls)} valid tool calls in this turn."
                )
            else:
                output_parts.append(
                    f"I have processed {len(valid_tool_calls)} valid tool calls in this turn."
                )

            for i, (_, content) in enumerate(valid_tool_calls, 1):
                output_parts.append(f"Valid tool call {i} result:\n{content['text']}")

            for i, (_, content) in enumerate(bad_tool_calls, 1):
                output_parts.append(f"Failed tool call {i} result:\n{content['text']}")
        else:
            # Single (or no) tool call: output the result text directly.
            output_parts.extend(content["text"] for _, content in valid_tool_calls)
            output_parts.extend(content["text"] for _, content in bad_tool_calls)

        merged_text = "\n\n".join(output_parts)

        message_history.append(
            {
                "role": "user",
                "content": [{"type": "text", "text": merged_text}],
            }
        )
        return message_history

    def parse_llm_response(self, llm_response) -> str:
        """Return the text content of the first choice.

        :param llm_response: Chat-completions response object.
        :return: The message content, or ``""`` when the content is ``None``.
        :raises ValueError: If the response is missing or has no choices.
        """
        if not llm_response or not llm_response.choices:
            raise ValueError("LLM did not return a valid response.")
        # Content can be None; honour the declared `str` return type.
        return llm_response.choices[0].message.content or ""

    def _estimate_tokens(self, text: str) -> int:
        """Estimate the token count of ``text`` with tiktoken.

        The encoder is created on first use and cached on the instance.

        :param text: Text to measure.
        :return: Number of tokens, or ``len(text) // 4`` if encoding fails.
        """
        if not hasattr(self, "encoding"):
            # Initialize tiktoken encoder
            try:
                self.encoding = tiktoken.get_encoding("o200k_base")
            except Exception:
                # If o200k_base is not available, use cl100k_base as fallback
                self.encoding = tiktoken.get_encoding("cl100k_base")

        try:
            return len(self.encoding.encode(text))
        except Exception:
            # If encoding fails, use simple estimation: about 1 token per 4 characters
            return len(text) // 4

    def handle_max_turns_reached_summary_prompt(self, message_history, summary_prompt):
        """Merge a trailing user message into the summary prompt.

        If the last message in ``message_history`` has role ``"user"`` it is
        popped (mutating the list) and its text is prepended to
        ``summary_prompt``; otherwise ``summary_prompt`` is returned as is.

        :param message_history: Message list; may lose its last element.
        :param summary_prompt: Prompt asking the model to summarise.
        :return: The prompt text to send next.
        """
        # Guard against an empty history instead of raising IndexError.
        if not message_history or message_history[-1]["role"] != "user":
            return summary_prompt

        last_user_message = message_history.pop()
        content = last_user_message["content"]
        # Content is normally a list of parts, but accept a plain string too.
        last_text = content if isinstance(content, str) else content[0]["text"]
        return last_text + "\n\n-----------------\n\n" + summary_prompt

    def _apply_cache_control(self, messages):
        """Mark the last user message and system messages as cacheable.

        Walks ``messages`` from the end; for the most recent ``user`` message
        and every ``system`` message, the first non-empty text part gets
        ``cache_control = {"type": "ephemeral"}``. Input messages are not
        modified: marked messages are rebuilt from copies of their parts.

        :param messages: Message list to process.
        :return: A new list in the original order.
        """
        cached_messages = []
        user_turns_processed = 0
        for turn in reversed(messages):
            is_last_user_turn = turn["role"] == "user" and user_turns_processed < 1
            if not (is_last_user_turn or turn["role"] == "system"):
                # Other messages add directly
                cached_messages.append(turn)
                continue

            if isinstance(turn.get("content"), list):
                # see example here
                # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
                new_content = []
                processed_text = False
                for item in turn["content"]:
                    # Truthiness also covers a missing/None text, on which
                    # len() would raise.
                    if (
                        item.get("type") == "text"
                        and item.get("text")
                        and not processed_text
                    ):
                        # Copy and add cache control
                        text_item = item.copy()
                        text_item["cache_control"] = {"type": "ephemeral"}
                        new_content.append(text_item)
                        processed_text = True
                    else:
                        # Other types of content (like image) copy directly
                        new_content.append(item.copy())
                cached_messages.append({"role": turn["role"], "content": new_content})
            else:
                # Content is not a list (e.g., plain text): keep the message
                # as is, without cache control.
                logger.debug(
                    "Warning: User message content is not in expected list format, cache control not applied."
                )
                cached_messages.append(turn)
            user_turns_processed += 1
        return list(reversed(cached_messages))
self.model_name, - "temperature": temperature, - "max_completion_tokens": self.max_tokens, - "messages": messages_copy, - "tools": tool_list, - "stream": False, - } + # Set temperature and reasoning_effort for reasoning models + if self.model_name in OPENAI_REASONING_MODEL_SET: + temperature = 1.0 + params = { + "model": self.model_name, + "temperature": temperature, + "max_completion_tokens": self.max_tokens, + "messages": messages_copy, + "reasoning_effort": self.reasoning_effort, + "tools": tool_list, + "stream": False, + } + else: + temperature = self.temperature + params = { + "model": self.model_name, + "temperature": temperature, + "max_completion_tokens": self.max_tokens, + "messages": messages_copy, + "tools": tool_list, + "stream": False, + } if self.top_p != 1.0: params["top_p"] = self.top_p diff --git a/src/tool/mcp_servers/vision_mcp_server.py b/src/tool/mcp_servers/vision_mcp_server.py index 8b816f60..24e1c775 100755 --- a/src/tool/mcp_servers/vision_mcp_server.py +++ b/src/tool/mcp_servers/vision_mcp_server.py @@ -87,6 +87,10 @@ async def call_claude_vision(image_path_or_url: str, question: str) -> str: ] try: + from urllib.parse import urlparse, unquote + parsed = urlparse(image_path_or_url) + if parsed.scheme == "file": + image_path_or_url = unquote(parsed.path) if os.path.exists(image_path_or_url): # Check if the file exists locally with open(image_path_or_url, "rb") as image_file: image_data = base64.b64encode(image_file.read()).decode("utf-8")