diff --git a/config/config_loader.py b/config/config_loader.py index 6feb04382..219522ff0 100644 --- a/config/config_loader.py +++ b/config/config_loader.py @@ -45,6 +45,7 @@ import logging import os +import re from pathlib import Path from typing import Any, Dict, List, Optional @@ -225,6 +226,7 @@ def _load_yaml(self, path: Path) -> Optional[Dict[str, Any]]: try: with open(path, "r", encoding="utf-8") as f: data = yaml.safe_load(f) or {} + data = self._expand_env_vars(data) self._cache[cache_key] = data return data except Exception as e: @@ -251,6 +253,29 @@ def _deep_merge(self, target: Dict[str, Any], source: Dict[str, Any]) -> None: else: target[key] = value + def _expand_env_vars(self, value: Any) -> Any: + """ + Expand ${VAR} and $VAR placeholders in YAML values using environment variables. + + Only string values are expanded; all other types are returned as-is. + Unset variables are left untouched. + """ + if isinstance(value, dict): + return {k: self._expand_env_vars(v) for k, v in value.items()} + if isinstance(value, list): + return [self._expand_env_vars(v) for v in value] + if isinstance(value, str): + # Expand ${VAR} and $VAR while leaving unknown variables intact. + def replacer(match: re.Match[str]) -> str: + var_name = match.group(1) or match.group(2) + if not var_name: + return match.group(0) + env_val = os.getenv(var_name) + return env_val if env_val is not None else match.group(0) + + return re.sub(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}|\$([A-Za-z_][A-Za-z0-9_]*)", replacer, value) + return value + def _discover_yaml_files(self, directory: Path) -> List[Path]: """ Discover all YAML files in a directory. 
@@ -520,11 +545,12 @@ def _update_api_base(config: Dict[str, Any], agent_key: str) -> None: return api_type = agent_config.get("API_TYPE", "").lower() + use_responses = bool(agent_config.get("USE_RESPONSES", False)) if api_type == "aoai": # Azure OpenAI - construct deployment URL api_base = agent_config.get("API_BASE", "") - if api_base and "deployments" not in api_base: + if api_base and "deployments" not in api_base and not use_responses: deployment_id = agent_config.get("API_DEPLOYMENT_ID", "") api_version = agent_config.get("API_VERSION", "") if deployment_id: diff --git a/config/config_schemas.py b/config/config_schemas.py index 200473747..db381f572 100644 --- a/config/config_schemas.py +++ b/config/config_schemas.py @@ -89,6 +89,31 @@ def get(self, key: str, default: Any = None) -> Any: except KeyError: return default + def to_dict(self) -> Dict[str, Any]: + """ + Convert AgentConfig to dictionary with uppercase keys plus extras. + """ + data = { + "VISUAL_MODE": self.visual_mode, + "REASONING_MODEL": self.reasoning_model, + "API_TYPE": self.api_type, + "API_BASE": self.api_base, + "API_KEY": self.api_key, + "API_VERSION": self.api_version, + "API_MODEL": self.api_model, + "AAD_TENANT_ID": self.aad_tenant_id, + "AAD_API_SCOPE": self.aad_api_scope, + "AAD_API_SCOPE_BASE": self.aad_api_scope_base, + "API_DEPLOYMENT_ID": self.api_deployment_id, + "PROMPT": self.prompt, + "EXAMPLE_PROMPT": self.example_prompt, + } + # Merge extras (do not overwrite fixed fields if already set) + for key, value in self._extras.items(): + if key not in data: + data[key] = value + return data + @classmethod def from_dict(cls, data: Dict[str, Any]) -> "AgentConfig": """ diff --git a/config/ufo/agents.yaml.template b/config/ufo/agents.yaml.template index dc48414ad..f8edae804 100644 --- a/config/ufo/agents.yaml.template +++ b/config/ufo/agents.yaml.template @@ -18,6 +18,7 @@ HOST_AGENT: # API_VERSION: "2024-02-15-preview" # API_MODEL: "gpt-4o" # API_DEPLOYMENT_ID: "YOUR_DEPLOYMENT_ID" 
# The deployment id for the AOAI API + # USE_RESPONSES: True # Use Responses API instead of Chat Completions ### For Azure AD authentication (azure_ad) # API_TYPE: "azure_ad" @@ -45,6 +46,7 @@ APP_AGENT: # API_VERSION: "2024-02-15-preview" # API_MODEL: "gpt-4o" # API_DEPLOYMENT_ID: "YOUR_DEPLOYMENT_ID" + # USE_RESPONSES: True # Use Responses API instead of Chat Completions ### For Azure AD authentication (azure_ad) # API_TYPE: "azure_ad" @@ -72,6 +74,7 @@ BACKUP_AGENT: # API_VERSION: "2024-02-15-preview" # API_MODEL: "gpt-4-vision-preview" # API_DEPLOYMENT_ID: "gpt-4-visual-preview" + # USE_RESPONSES: True # Use Responses API instead of Chat Completions ### For Azure AD authentication (azure_ad) # API_TYPE: "azure_ad" @@ -95,6 +98,7 @@ EVALUATION_AGENT: # API_VERSION: "2024-02-15-preview" # API_MODEL: "gpt-4o" # API_DEPLOYMENT_ID: "YOUR_DEPLOYMENT_ID" + # USE_RESPONSES: True # Use Responses API instead of Chat Completions ### For Azure AD authentication (azure_ad) # API_TYPE: "azure_ad" diff --git a/ufo/agents/presenters/rich_presenter.py b/ufo/agents/presenters/rich_presenter.py index b6529f2ba..c596bcc57 100644 --- a/ufo/agents/presenters/rich_presenter.py +++ b/ufo/agents/presenters/rich_presenter.py @@ -88,6 +88,15 @@ def __init__(self, console: Optional[Console] = None): """ self.console = console or Console() + def _safe_text(self, text: str) -> str: + """ + Avoid UnicodeEncodeError on legacy Windows consoles by stripping non-ASCII. + """ + encoding = (self.console.encoding or "").lower() + if "utf" in encoding: + return text + return text.encode("ascii", "ignore").decode("ascii") + def present_response(self, response: Any, **kwargs) -> None: """ Present the complete agent response. 
@@ -109,8 +118,8 @@ def present_thought(self, thought: str) -> None: if thought: self.console.print( Panel( - thought, - title=self.STYLES["thought"]["title"], + self._safe_text(thought), + title=self._safe_text(self.STYLES["thought"]["title"]), style=self.STYLES["thought"]["style"], ) ) @@ -124,8 +133,8 @@ def present_observation(self, observation: str) -> None: if observation: self.console.print( Panel( - observation, - title=self.STYLES["observation"]["title"], + self._safe_text(observation), + title=self._safe_text(self.STYLES["observation"]["title"]), style=self.STYLES["observation"]["style"], ) ) @@ -154,8 +163,8 @@ def present_status(self, status: str, **kwargs) -> None: self.console.print( Panel( - status_upper, - title=title, + self._safe_text(status_upper), + title=self._safe_text(title), style=style, ) ) @@ -180,8 +189,8 @@ def present_plan(self, plan: List[str]) -> None: plan_str = "\n".join(plan) if isinstance(plan, list) else str(plan) self.console.print( Panel( - plan_str, - title=self.STYLES["next_plan"]["title"], + self._safe_text(plan_str), + title=self._safe_text(self.STYLES["next_plan"]["title"]), style=self.STYLES["next_plan"]["style"], ) ) @@ -208,8 +217,8 @@ def present_results(self, results: Any) -> None: self.console.print( Panel( - results_content, - title=self.STYLES["results"]["title"], + self._safe_text(results_content), + title=self._safe_text(self.STYLES["results"]["title"]), style=self.STYLES["results"]["style"], ) ) @@ -226,12 +235,17 @@ def _print_response_header(self, agent_type: str) -> None: """ from rich.rule import Rule + start_char = self.STYLES["separator"]["start"]["char"] + encoding = (self.console.encoding or "").lower() + if "utf" not in encoding: + start_char = "=" + self.console.print() self.console.print( Rule( - f"πŸ€– {agent_type} Response", + self._safe_text(f"πŸ€– {agent_type} Response"), style=self.STYLES["separator"]["start"]["style"], - characters=self.STYLES["separator"]["start"]["char"], + 
characters=start_char, ) ) @@ -241,10 +255,15 @@ def _print_response_footer(self) -> None: """ from rich.rule import Rule + end_char = self.STYLES["separator"]["end"]["char"] + encoding = (self.console.encoding or "").lower() + if "utf" not in encoding: + end_char = "-" + self.console.print( Rule( style=self.STYLES["separator"]["end"]["style"], - characters=self.STYLES["separator"]["end"]["char"], + characters=end_char, ) ) self.console.print() @@ -299,8 +318,10 @@ def present_app_agent_response( reason = screenshot_saving.get("reason") self.console.print( Panel( - f"πŸ“Έ Screenshot saved to the blackboard.\nReason: {reason}", - title=self.STYLES["notice"]["title"], + self._safe_text( + f"πŸ“Έ Screenshot saved to the blackboard.\nReason: {reason}" + ), + title=self._safe_text(self.STYLES["notice"]["title"]), style=self.STYLES["notice"]["style"], ) ) @@ -386,8 +407,8 @@ def present_host_agent_response( self.console.print( Panel( - action_str, - title=self.STYLES["action_applied"]["title"], + self._safe_text(action_str), + title=self._safe_text(self.STYLES["action_applied"]["title"]), style=self.STYLES["action_applied"]["style"], ) ) @@ -395,8 +416,8 @@ def present_host_agent_response( # Plan self.console.print( Panel( - plan_str, - title=self.STYLES["plan"]["title"], + self._safe_text(plan_str), + title=self._safe_text(self.STYLES["plan"]["title"]), style=self.STYLES["plan"]["style"], ) ) @@ -405,8 +426,8 @@ def present_host_agent_response( if application: self.console.print( Panel( - application, - title=self.STYLES["next_application"]["title"], + self._safe_text(application), + title=self._safe_text(self.STYLES["next_application"]["title"]), style=self.STYLES["next_application"]["style"], ) ) @@ -415,8 +436,8 @@ def present_host_agent_response( if message: self.console.print( Panel( - message, - title=self.STYLES["message"]["title"], + self._safe_text(message), + title=self._safe_text(self.STYLES["message"]["title"]), style=self.STYLES["message"]["style"], ) ) @@ 
-424,8 +445,8 @@ def present_host_agent_response( # Status self.console.print( Panel( - status, - title=self.STYLES["status_default"]["title"], + self._safe_text(status), + title=self._safe_text(self.STYLES["status_default"]["title"]), style=self.STYLES["status_default"]["style"], ) ) @@ -472,8 +493,8 @@ def present_constellation_agent_response( if response.thought: self.console.print( Panel( - response.thought, - title="🧠 Constellation Agent Thoughts", + self._safe_text(response.thought), + title=self._safe_text("🧠 Constellation Agent Thoughts"), style="green", ) ) @@ -493,8 +514,8 @@ def present_constellation_agent_response( self.console.print( Panel( - response.status.upper(), - title=f"{status_emoji} Processing Status", + self._safe_text(response.status.upper()), + title=self._safe_text(f"{status_emoji} Processing Status"), style=status_style, ) ) @@ -515,7 +536,7 @@ def present_constellation_agent_response( self.console.print( Panel( actions_text, - title="βš’οΈ Planned Actions", + title=self._safe_text("βš’οΈ Planned Actions"), style="blue", ) ) @@ -541,21 +562,36 @@ def _present_constellation_info(self, constellation: Any) -> None: constellation_state = constellation.state constellation_info = Text() - constellation_info.append(f"πŸ†” ID: ", style="bold cyan") + if "utf" in (self.console.encoding or "").lower(): + constellation_info.append(f"πŸ†” ID: ", style="bold cyan") + else: + constellation_info.append("ID: ", style="bold cyan") constellation_info.append(f"{constellation.constellation_id}\n", style="white") - constellation_info.append(f"🌟 Name: ", style="bold cyan") + if "utf" in (self.console.encoding or "").lower(): + constellation_info.append("🌟 Name: ", style="bold cyan") + else: + constellation_info.append("Name: ", style="bold cyan") constellation_info.append(f"{constellation_name}\n", style="white") - constellation_info.append(f"πŸ“Š State: ", style="bold cyan") + if "utf" in (self.console.encoding or "").lower(): + 
constellation_info.append("πŸ“Š State: ", style="bold cyan") + else: + constellation_info.append("State: ", style="bold cyan") constellation_info.append(f"{constellation_state}\n", style="white") - constellation_info.append(f"πŸ“‹ Tasks: ", style="bold cyan") + if "utf" in (self.console.encoding or "").lower(): + constellation_info.append("πŸ“‹ Tasks: ", style="bold cyan") + else: + constellation_info.append("Tasks: ", style="bold cyan") constellation_info.append(f"{task_count}\n", style="white") - constellation_info.append(f"πŸ”— Dependencies: ", style="bold cyan") + if "utf" in (self.console.encoding or "").lower(): + constellation_info.append("πŸ”— Dependencies: ", style="bold cyan") + else: + constellation_info.append("Dependencies: ", style="bold cyan") constellation_info.append(f"{dependency_count}", style="white") self.console.print( Panel( constellation_info, - title=self.STYLES["constellation_info"]["title"], + title=self._safe_text(self.STYLES["constellation_info"]["title"]), style=self.STYLES["constellation_info"]["style"], ) ) @@ -578,12 +614,15 @@ def _present_constellation_info(self, constellation: Any) -> None: # Show tips if available if task.tips: for tip in task.tips: - tasks_text.append(f" πŸ’‘ {tip}\n", style="green") + if "utf" in (self.console.encoding or "").lower(): + tasks_text.append(f" πŸ’‘ {tip}\n", style="green") + else: + tasks_text.append(f" Tip: {tip}\n", style="green") self.console.print( Panel( tasks_text, - title=self.STYLES["task_details"]["title"], + title=self._safe_text(self.STYLES["task_details"]["title"]), style=self.STYLES["task_details"]["style"], ) ) @@ -603,7 +642,7 @@ def _present_constellation_info(self, constellation: Any) -> None: self.console.print( Panel( deps_text, - title=self.STYLES["dependencies"]["title"], + title=self._safe_text(self.STYLES["dependencies"]["title"]), style=self.STYLES["dependencies"]["style"], ) ) @@ -623,7 +662,9 @@ def present_action_list(self, actions: Any, success_only: bool = False) -> 
None: from aip.messages import ResultStatus if not actions or not actions.actions: - self.console.print("ℹ️ No actions to display", style="dim") + self.console.print( + self._safe_text("ℹ️ No actions to display"), style="dim" + ) return # Filter actions based on success_only @@ -634,7 +675,9 @@ def present_action_list(self, actions: Any, success_only: bool = False) -> None: ] if not filtered_actions: - self.console.print("ℹ️ No actions to display", style="dim") + self.console.print( + self._safe_text("ℹ️ No actions to display"), style="dim" + ) return # Count successful and failed actions @@ -646,11 +689,13 @@ def present_action_list(self, actions: Any, success_only: bool = False) -> None: # Print header self.console.print() header_text = f"βš’οΈ Action Execution Results ({len(filtered_actions)} action{'s' if len(filtered_actions) != 1 else ''})" + encoding = (self.console.encoding or "").lower() + header_char = "═" if "utf" in encoding else "=" self.console.print( Rule( - header_text, + self._safe_text(header_text), style="bright_blue bold", - characters="═", + characters=header_char, ) ) @@ -662,10 +707,11 @@ def present_action_list(self, actions: Any, success_only: bool = False) -> None: self._print_action_summary(success_count, failed_count, actions.status) # Print footer + footer_char = "─" if "utf" in encoding else "-" self.console.print( Rule( style="dim", - characters="─", + characters=footer_char, ) ) self.console.print() @@ -679,7 +725,9 @@ def present_constellation_editing_actions(self, actions: Any) -> None: from aip.messages import ResultStatus if not actions or not actions.actions: - self.console.print("ℹ️ No actions to display", style="dim") + self.console.print( + self._safe_text("ℹ️ No actions to display"), style="dim" + ) return # Count successful and failed actions @@ -690,7 +738,10 @@ def present_constellation_editing_actions(self, actions: Any) -> None: # Create header header = Text() - header.append("πŸ”§ Constellation Editing Operations", 
style="bold cyan") + if "utf" in (self.console.encoding or "").lower(): + header.append("πŸ”§ Constellation Editing Operations", style="bold cyan") + else: + header.append("Constellation Editing Operations", style="bold cyan") header.append( f" ({len(actions.actions)} action{'s' if len(actions.actions) > 1 else ''})", style="dim", @@ -874,8 +925,8 @@ def _display_agent_comment(self, comment: str) -> None: if comment: self.console.print( Panel( - comment, - title=self.STYLES["comment"]["title"], + self._safe_text(comment), + title=self._safe_text(self.STYLES["comment"]["title"]), style=self.STYLES["comment"]["style"], ) ) @@ -891,16 +942,17 @@ def _print_single_action(self, idx: int, action: Any) -> None: from aip.messages import ResultStatus # Determine status icon and color + encoding = (self.console.encoding or "").lower() if action.result.status == ResultStatus.SUCCESS: - status_icon = "βœ…" + status_icon = "βœ…" if "utf" in encoding else "OK" status_color = "green" border_style = "green" elif action.result.status == ResultStatus.FAILURE: - status_icon = "❌" + status_icon = "❌" if "utf" in encoding else "FAIL" status_color = "red" border_style = "red" else: - status_icon = "⏸️" + status_icon = "⏸️" if "utf" in encoding else "WAIT" status_color = "yellow" border_style = "yellow" @@ -953,21 +1005,29 @@ def _print_single_action(self, idx: int, action: Any) -> None: # Show result details if available if action.result.result and str(action.result.result).strip(): result_text = Text() - result_text.append(" └─ Result: ", style="dim") + if "utf" in (self.console.encoding or "").lower(): + result_prefix = " └─ Result: " + else: + result_prefix = " - Result: " + result_text.append(result_prefix, style="dim") result_str = str(action.result.result) if len(result_str) > 500: result_str = result_str[:497] + "..." 
- result_text.append(result_str, style="bright_black") + result_text.append(self._safe_text(result_str), style="bright_black") self.console.print(result_text) # Show error if failed if action.result.status == ResultStatus.FAILURE and action.result.error: error_text = Text() - error_text.append(" └─ Error: ", style="red dim") + if "utf" in (self.console.encoding or "").lower(): + error_prefix = " └─ Error: " + else: + error_prefix = " - Error: " + error_text.append(error_prefix, style="red dim") error_str = str(action.result.error) if len(error_str) > 100: error_str = error_str[:97] + "..." - error_text.append(error_str, style="red") + error_text.append(self._safe_text(error_str), style="red") self.console.print(error_text) def _print_action_summary( @@ -986,25 +1046,33 @@ def _print_action_summary( summary.add_column("Label", style="bold") summary.add_column("Value") + encoding = (self.console.encoding or "").lower() + success_label = "βœ… Successful:" if "utf" in encoding else "Successful:" + failed_label = "❌ Failed:" if "utf" in encoding else "Failed:" + status_label = "πŸ“Š Status:" if "utf" in encoding else "Status:" + # Success count if success_count > 0: success_text = Text() success_text.append(str(success_count), style="green bold") success_text.append(" succeeded", style="green") - summary.add_row("βœ… Successful:", success_text) + summary.add_row(success_label, success_text) # Failed count if failed_count > 0: failed_text = Text() failed_text.append(str(failed_count), style="red bold") failed_text.append(" failed", style="red") - summary.add_row("❌ Failed:", failed_text) + summary.add_row(failed_label, failed_text) # Final status status_style = "green" if status in ["FINISH", "COMPLETED"] else "yellow" - status_emoji = "🏁" if status == "FINISH" else "πŸ”„" - status_text = Text(f"{status_emoji} {status}", style=f"{status_style} bold") - summary.add_row("πŸ“Š Status:", status_text) + if "utf" in encoding: + status_emoji = "🏁" if status == "FINISH" else 
"πŸ”„" + status_text = Text(f"{status_emoji} {status}", style=f"{status_style} bold") + else: + status_text = Text(f"{status}", style=f"{status_style} bold") + summary.add_row(status_label, status_text) self.console.print() self.console.print( @@ -1028,11 +1096,18 @@ def present_evaluation_agent_response( # Print response header self._print_response_header("EvaluationAgent") - emoji_map = { - "yes": "βœ…", - "no": "❌", - "unsure": "❓", - } + if "utf" in (self.console.encoding or "").lower(): + emoji_map = { + "yes": "βœ…", + "no": "❌", + "unsure": "❓", + } + else: + emoji_map = { + "yes": "YES", + "no": "NO", + "unsure": "UNSURE", + } complete = emoji_map.get(response.complete, response.complete) sub_scores = response.sub_scores or [] @@ -1041,7 +1116,7 @@ def present_evaluation_agent_response( # Sub-scores table if sub_scores: table = Table( - title=self.STYLES["evaluation"]["sub_scores"]["title"], + title=self._safe_text(self.STYLES["evaluation"]["sub_scores"]["title"]), show_lines=True, style=self.STYLES["evaluation"]["sub_scores"]["style"], ) @@ -1056,8 +1131,8 @@ def present_evaluation_agent_response( # Task complete self.console.print( Panel( - f"{complete}", - title=self.STYLES["evaluation"]["task_complete"]["title"], + self._safe_text(f"{complete}"), + title=self._safe_text(self.STYLES["evaluation"]["task_complete"]["title"]), style=self.STYLES["evaluation"]["task_complete"]["style"], ) ) @@ -1066,8 +1141,8 @@ def present_evaluation_agent_response( if reason: self.console.print( Panel( - reason, - title=self.STYLES["evaluation"]["reason"]["title"], + self._safe_text(reason), + title=self._safe_text(self.STYLES["evaluation"]["reason"]["title"]), style=self.STYLES["evaluation"]["reason"]["style"], ) ) diff --git a/ufo/agents/processors/app_agent_processor.py b/ufo/agents/processors/app_agent_processor.py index 1c16aa9c8..09fd18a90 100644 --- a/ufo/agents/processors/app_agent_processor.py +++ b/ufo/agents/processors/app_agent_processor.py @@ -36,6 +36,16 @@ 
console = Console() + +def _safe_console_text(text: str) -> str: + """ + Avoid UnicodeEncodeError on legacy Windows consoles by stripping emoji. + """ + encoding = (console.encoding or "").lower() + if "utf" in encoding: + return text + return text.encode("ascii", "ignore").decode("ascii") + if TYPE_CHECKING: from ufo.agents.agent.app_agent import AppAgent from ufo.agents.processors.core.processor_framework import ProcessingResult @@ -155,8 +165,10 @@ async def before_process( round_step = context.get("round_step") request = context.get("request") - panel_title = f"πŸš€ Round {round_num + 1}, Step {round_step + 1}, Agent: {processor.agent.name}" - panel_content = self.starting_message(context) + panel_title = _safe_console_text( + f"πŸš€ Round {round_num + 1}, Step {round_step + 1}, Agent: {processor.agent.name}" + ) + panel_content = _safe_console_text(self.starting_message(context)) console.print(Panel(panel_content, title=panel_title, style="magenta")) @@ -220,8 +232,10 @@ async def after_process( # Display user-friendly completion message (maintaining original UX) if subtask and application_process_name: console.print( - f"βœ… AppAgent: Successfully completed subtask '{subtask}' " - f"on application '{application_process_name}'", + _safe_console_text( + f"βœ… AppAgent: Successfully completed subtask '{subtask}' " + f"on application '{application_process_name}'" + ), style="green", ) else: @@ -233,7 +247,10 @@ async def after_process( # Display user-friendly error message (maintaining original UX) console.print( - f"❌ AppAgent: Processing failed - {result.error}", style="red" + _safe_console_text( + f"❌ AppAgent: Processing failed - {result.error}" + ), + style="red", ) async def on_error(self, processor: ProcessorTemplate, error: Exception) -> None: @@ -250,4 +267,7 @@ async def on_error(self, processor: ProcessorTemplate, error: Exception) -> None await super().on_error(processor, error) # Display user-friendly error message (maintaining original UX) - 
console.print(f"❌ AppAgent: Encountered error - {str(error)}", style="red") + console.print( + _safe_console_text(f"❌ AppAgent: Encountered error - {str(error)}"), + style="red", + ) diff --git a/ufo/agents/processors/host_agent_processor.py b/ufo/agents/processors/host_agent_processor.py index 234332f29..dd3bce24e 100644 --- a/ufo/agents/processors/host_agent_processor.py +++ b/ufo/agents/processors/host_agent_processor.py @@ -49,6 +49,16 @@ # Load configuration ufo_config = get_ufo_config() + +def _safe_console_text(text: str) -> str: + """ + Avoid UnicodeEncodeError on legacy Windows consoles by stripping emoji. + """ + encoding = (console.encoding or "").lower() + if "utf" in encoding: + return text + return text.encode("ascii", "ignore").decode("ascii") + if TYPE_CHECKING: from ufo.agents.agent.host_agent import HostAgent @@ -214,12 +224,16 @@ async def before_process( # Display colored progress message for user feedback (maintaining original UX) # This has been replaced with Rich Panel display below - panel_title = f"πŸš€ Round {round_num + 1}, Step {round_step + 1}, Agent: {processor.agent.name}" + panel_title = _safe_console_text( + f"πŸš€ Round {round_num + 1}, Step {round_step + 1}, Agent: {processor.agent.name}" + ) panel_content = ( f"Analyzing user intent and decomposing request of `{request}`..." 
) - console.print(Panel(panel_content, title=panel_title, style="magenta")) + console.print( + Panel(_safe_console_text(panel_content), title=panel_title, style="magenta") + ) # Log available context data for debugging if self.logger.isEnabledFor(logging.DEBUG): @@ -260,8 +274,8 @@ async def after_process( target_name = selected_app or assigned_agent console.print( Panel( - f"Successfully selected target '{target_name}'", - title="βœ… HostAgent", + _safe_console_text(f"Successfully selected target '{target_name}'"), + title=_safe_console_text("βœ… HostAgent"), style="green", ) ) @@ -275,8 +289,8 @@ async def after_process( # Display user-friendly error message (maintaining original UX) console.print( Panel( - f"Processing failed - {result.error}", - title="❌ HostAgent", + _safe_console_text(f"Processing failed - {result.error}"), + title=_safe_console_text("❌ HostAgent"), style="red", ) ) @@ -292,8 +306,8 @@ async def on_error(self, processor: ProcessorTemplate, error: Exception) -> None console.print( Panel( - f"Encountered error - {str(error)}", - title="❌ HostAgent", + _safe_console_text(f"Encountered error - {str(error)}"), + title=_safe_console_text("❌ HostAgent"), style="red", ) ) diff --git a/ufo/agents/processors/strategies/app_agent_processing_strategy.py b/ufo/agents/processors/strategies/app_agent_processing_strategy.py index 78087eb3a..2e641e641 100644 --- a/ufo/agents/processors/strategies/app_agent_processing_strategy.py +++ b/ufo/agents/processors/strategies/app_agent_processing_strategy.py @@ -210,13 +210,41 @@ async def _capture_app_screenshot( raise ValueError("Failed to capture window screenshot") clean_screenshot_url = result[0].result - utils.save_image_string(clean_screenshot_url, save_path) + if ( + not isinstance(clean_screenshot_url, str) + or not clean_screenshot_url.startswith("data:image/") + ): + self.logger.warning( + "Window screenshot capture returned invalid data; falling back to desktop capture." 
+ ) + clean_screenshot_url = await self._capture_desktop_screenshot( + save_path, command_dispatcher + ) + return clean_screenshot_url + + saved_image = utils.save_image_string(clean_screenshot_url, save_path) + if ( + not saved_image + or saved_image.size[0] <= 1 + or saved_image.size[1] <= 1 + ): + self.logger.warning( + "Window screenshot capture produced a tiny image; falling back to desktop capture." + ) + clean_screenshot_url = await self._capture_desktop_screenshot( + save_path, command_dispatcher + ) + return clean_screenshot_url + self.logger.info(f"Clean screenshot saved to: {save_path}") return clean_screenshot_url except Exception as e: - raise Exception(f"Failed to capture app screenshot: {str(e)}") + self.logger.error(f"Failed to capture app screenshot: {str(e)}; using empty placeholder") + # Return the empty placeholder instead of crashing the whole pipeline + from ufo.automator.ui_control.screenshot import PhotographerFacade + return PhotographerFacade._empty_image_string async def _get_application_window_info( self, command_dispatcher: BasicCommandDispatcher @@ -318,6 +346,7 @@ async def _capture_desktop_screenshot( # include_last_screenshot = configs.get("INCLUDE_LAST_SCREENSHOT", False) # if include_last_screenshot: + desktop_screenshot_url = "" if command_dispatcher: # Execute desktop screenshot command result = await command_dispatcher.execute_commands( @@ -332,14 +361,65 @@ async def _capture_desktop_screenshot( if result and result[0].result: desktop_screenshot_url = result[0].result - utils.save_image_string(desktop_screenshot_url, save_path) + if not isinstance(desktop_screenshot_url, str) or not desktop_screenshot_url.startswith("data:image/"): + raise RuntimeError("Desktop screenshot capture returned invalid image") + saved_image = utils.save_image_string( + desktop_screenshot_url, save_path + ) + if ( + not saved_image + or saved_image.size[0] <= 1 + or saved_image.size[1] <= 1 + ): + self.logger.warning( + "Desktop screenshot capture 
produced a tiny image; retrying with primary screen only." + ) + result = await command_dispatcher.execute_commands( + [ + Command( + tool_name="capture_desktop_screenshot", + parameters={"all_screens": False}, + tool_type="data_collection", + ) + ] + ) + if ( + not result + or not result[0].result + or result[0].status != ResultStatus.SUCCESS + ): + raise RuntimeError( + "Desktop screenshot retry returned empty result" + ) + desktop_screenshot_url = result[0].result + if not isinstance(desktop_screenshot_url, str) or not desktop_screenshot_url.startswith("data:image/"): + raise RuntimeError( + "Desktop screenshot retry returned invalid image" + ) + saved_image = utils.save_image_string( + desktop_screenshot_url, save_path + ) + if ( + not saved_image + or saved_image.size[0] <= 1 + or saved_image.size[1] <= 1 + ): + raise RuntimeError( + "Desktop screenshot retry produced a tiny image" + ) self.logger.info(f"Desktop screenshot saved to: {save_path}") + else: + raise RuntimeError("Desktop screenshot capture returned empty result") return desktop_screenshot_url except Exception as e: - self.logger.warning(f"Desktop screenshot capture failed: {str(e)}") - return "" + self.logger.warning( + f"Desktop screenshot capture failed, using empty image: {str(e)}" + ) + desktop_screenshot_url = utils._empty_image_string + utils.save_image_string(desktop_screenshot_url, save_path) + return desktop_screenshot_url @depends_on("clean_screenshot_path", "application_window_info") diff --git a/ufo/agents/processors/strategies/host_agent_processing_strategy.py b/ufo/agents/processors/strategies/host_agent_processing_strategy.py index 2f6c1edb4..dd8da981c 100644 --- a/ufo/agents/processors/strategies/host_agent_processing_strategy.py +++ b/ufo/agents/processors/strategies/host_agent_processing_strategy.py @@ -169,15 +169,63 @@ async def _capture_desktop_screenshot( raise RuntimeError("Screenshot capture returned empty result") desktop_screenshot_url = result[0].result + if not 
isinstance(desktop_screenshot_url, str) or not desktop_screenshot_url.startswith("data:image/"): + raise RuntimeError("Screenshot capture returned invalid image data") # Save screenshot to file - utils.save_image_string(desktop_screenshot_url, save_path) + saved_image = utils.save_image_string(desktop_screenshot_url, save_path) + if ( + not saved_image + or saved_image.size[0] <= 1 + or saved_image.size[1] <= 1 + ): + self.logger.warning( + "Desktop screenshot capture produced a tiny image; retrying with primary screen only." + ) + result = await command_dispatcher.execute_commands( + [ + Command( + tool_name="capture_desktop_screenshot", + parameters={"all_screens": False}, + tool_type="data_collection", + ) + ] + ) + if ( + not result + or not result[0].result + or result[0].status != ResultStatus.SUCCESS + ): + raise RuntimeError("Desktop screenshot retry returned empty result") + + desktop_screenshot_url = result[0].result + if not isinstance(desktop_screenshot_url, str) or not desktop_screenshot_url.startswith("data:image/"): + raise RuntimeError( + "Desktop screenshot retry returned invalid image data" + ) + saved_image = utils.save_image_string( + desktop_screenshot_url, save_path + ) + if ( + not saved_image + or saved_image.size[0] <= 1 + or saved_image.size[1] <= 1 + ): + raise RuntimeError( + "Desktop screenshot retry produced a tiny image" + ) + self.logger.info(f"Desktop screenshot saved to: {save_path}") return desktop_screenshot_url except Exception as e: - raise Exception(f"Failed to capture desktop screenshot: {str(e)}") + self.logger.warning( + f"Failed to capture desktop screenshot, using empty image: {str(e)}" + ) + desktop_screenshot_url = utils._empty_image_string + utils.save_image_string(desktop_screenshot_url, save_path) + return desktop_screenshot_url async def _get_desktop_application_info( self, command_dispatcher: BasicCommandDispatcher diff --git a/ufo/automator/ui_control/controller.py b/ufo/automator/ui_control/controller.py index 
24ba147d0..9391b9f30 100644 --- a/ufo/automator/ui_control/controller.py +++ b/ufo/automator/ui_control/controller.py @@ -205,34 +205,56 @@ def set_edit_text(self, params: Dict[str, str]) -> str: args = {"keys": text, "pause": inter_key_pause, "with_spaces": True} try: result = self.atomic_execution(method_name, args) - if ( - method_name == "set_text" - and args["text"] not in self.control.window_text() - ): - raise Exception(f"Failed to use set_text: {args['text']}") + if isinstance(result, str) and result.startswith("An error occurred"): + raise Exception(result) + if method_name in ["set_text", "set_edit_text"]: + expected_text = args.get("text", "") + if expected_text and expected_text not in self.control.window_text(): + raise Exception( + f"Failed to use {method_name}: {expected_text}" + ) if ufo_config.system.input_text_enter and method_name in [ "type_keys", "set_text", + "set_edit_text", ]: self.atomic_execution("type_keys", params={"keys": "{ENTER}"}) return result except Exception as e: - if method_name == "set_text": + if method_name == "set_text" or method_name == "set_edit_text": logger.warning( f"{self.control} doesn't have a method named {method_name}, trying default input method" ) - method_name = "type_keys" clear_text_keys = "^a{BACKSPACE}" - text_to_type = args["text"] - keys_to_send = clear_text_keys + text_to_type - method_name = "type_keys" - args = { - "keys": keys_to_send, - "pause": inter_key_pause, - "with_spaces": True, - } - return self.atomic_execution(method_name, args) + text_to_type = args.get("text", "") + keys_to_send = clear_text_keys + TextTransformer.transform_text( + text_to_type, "all" + ) + try: + args = { + "keys": keys_to_send, + "pause": inter_key_pause, + "with_spaces": True, + } + type_keys_result = self.atomic_execution("type_keys", args) + if ( + isinstance(type_keys_result, str) + and type_keys_result.startswith("An error occurred") + ): + raise RuntimeError(type_keys_result) + return type_keys_result + except 
Exception: + # Last-resort fallback: use pyautogui typing + try: + if self.control: + self.control.set_focus() + pyautogui.hotkey("ctrl", "a") + pyautogui.press("backspace") + pyautogui.write(text_to_type, interval=inter_key_pause) + return f"Typed text via fallback: {text_to_type}" + except Exception as fallback_error: + return f"An error occurred: {fallback_error}" else: return f"An error occurred: {e}" @@ -249,9 +271,20 @@ def keyboard_input(self, params: Dict[str, str]) -> str: if control_focus: self.control.set_focus() - self.atomic_execution("type_keys", {"keys": keys}) + result = self.atomic_execution("type_keys", {"keys": keys}) else: - self.application.type_keys(keys=keys) + try: + self.application.type_keys(keys=keys) + result = "" + except Exception as e: + result = f"An error occurred: {e}" + if isinstance(result, str) and result.startswith("An error occurred"): + try: + if control_focus and self.control: + self.control.set_focus() + pyautogui.write(keys, interval=ufo_config.system.input_text_inter_key_pause) + except Exception as fallback_error: + return f"An error occurred: {fallback_error}" return keys def key_press(self, params: Dict[str, str]) -> str: diff --git a/ufo/automator/ui_control/screenshot.py b/ufo/automator/ui_control/screenshot.py index 34af17ee3..817f1a1fa 100644 --- a/ufo/automator/ui_control/screenshot.py +++ b/ufo/automator/ui_control/screenshot.py @@ -81,12 +81,50 @@ def __init__(self, control: UIAWrapper): def capture(self, save_path: str = None, scalar: List[int] = None) -> Image.Image: """ - Capture a screenshot. + Capture a screenshot of the control window. + Falls back through: pywinauto -> PrintWindow -> desktop screenshot. :param save_path: The path to save the screenshot. :return: The screenshot. 
""" - # Capture single window screenshot - screenshot = self.control.capture_as_image() + screenshot = None + + # Attempt 1: capture via pywinauto + try: + screenshot = self.control.capture_as_image() + except Exception as e: + logger.warning(f"control.capture_as_image() failed: {e}") + + # Validate the captured image + if screenshot is not None: + try: + w, h = screenshot.size + if w <= 1 or h <= 1: + logger.warning("control.capture_as_image() returned a tiny image, treating as invalid") + screenshot = None + except Exception: + screenshot = None + + # Attempt 2: PrintWindow API (works on disconnected RDP sessions) + if screenshot is None: + try: + hwnd = self.control.handle + if hwnd: + logger.info("Trying PrintWindow for window capture (RDP-safe)") + screenshot = _win32_print_window(hwnd) + if screenshot is not None: + w, h = screenshot.size + if w <= 1 or h <= 1 or screenshot.getbbox() is None: + logger.warning("PrintWindow returned empty/tiny image") + screenshot = None + except Exception as e: + logger.warning(f"PrintWindow fallback failed: {e}") + + # Attempt 3: fall back to desktop capture + if screenshot is None: + logger.info("Falling back to desktop screenshot for window capture") + desktop = DesktopPhotographer(all_screens=False) + screenshot = desktop.capture() + if scalar is not None: screenshot = self.rescale_image(screenshot, scalar) @@ -95,6 +133,160 @@ def capture(self, save_path: str = None, scalar: List[int] = None) -> Image.Imag return screenshot +def _win32_print_window(hwnd: int) -> Optional[Image.Image]: + """ + Capture a window using the PrintWindow API. + This works even on disconnected RDP sessions because PrintWindow asks the + window to paint itself to a device context rather than reading from the + screen buffer (which doesn't exist when RDP is disconnected). + :param hwnd: The window handle to capture. + :return: A PIL Image of the window, or None on failure. 
+ """ + try: + import ctypes + import win32gui + import win32ui + import win32con + + # Get window dimensions + rect = win32gui.GetWindowRect(hwnd) + width = rect[2] - rect[0] + height = rect[3] - rect[1] + + if width <= 0 or height <= 0: + return None + + hwnd_dc = win32gui.GetWindowDC(hwnd) + mfc_dc = win32ui.CreateDCFromHandle(hwnd_dc) + save_dc = mfc_dc.CreateCompatibleDC() + + bmp = win32ui.CreateBitmap() + bmp.CreateCompatibleBitmap(mfc_dc, width, height) + save_dc.SelectObject(bmp) + + # PW_RENDERFULLCONTENT = 2 — works on Windows 8.1+ and captures + # the full content even when the window is occluded or off-screen. + PW_RENDERFULLCONTENT = 2 + result = ctypes.windll.user32.PrintWindow(hwnd, save_dc.GetSafeHdc(), PW_RENDERFULLCONTENT) + + if not result: + # Fallback to PW_CLIENTONLY = 1 + result = ctypes.windll.user32.PrintWindow(hwnd, save_dc.GetSafeHdc(), 1) + + if not result: + save_dc.DeleteDC() + mfc_dc.DeleteDC() + win32gui.ReleaseDC(hwnd, hwnd_dc) + win32gui.DeleteObject(bmp.GetHandle()) + return None + + bmpinfo = bmp.GetInfo() + bmpstr = bmp.GetBitmapBits(True) + screenshot = Image.frombuffer( + "RGB", + (bmpinfo["bmWidth"], bmpinfo["bmHeight"]), + bmpstr, + "raw", + "BGRX", + 0, + 1, + ) + + # Cleanup GDI objects + save_dc.DeleteDC() + mfc_dc.DeleteDC() + win32gui.ReleaseDC(hwnd, hwnd_dc) + win32gui.DeleteObject(bmp.GetHandle()) + + return screenshot + except Exception as e: + logger.warning(f"PrintWindow capture failed for hwnd={hwnd}: {e}") + return None + + +def _win32_grab_screen() -> Optional[Image.Image]: + """ + Fallback screen capture using win32 APIs when PIL ImageGrab fails. + Tries BitBlt first (fast), then PrintWindow on the desktop window + (works on disconnected RDP sessions). + :return: A PIL Image of the screen, or None on failure. 
+ """ + # Attempt 1: BitBlt from desktop DC (fast, but fails on disconnected RDP) + try: + import win32gui + import win32ui + import win32con + import win32api + + width = win32api.GetSystemMetrics(win32con.SM_CXSCREEN) + height = win32api.GetSystemMetrics(win32con.SM_CYSCREEN) + + hdesktop = win32gui.GetDesktopWindow() + desktop_dc = win32gui.GetWindowDC(hdesktop) + img_dc = win32ui.CreateDCFromHandle(desktop_dc) + mem_dc = img_dc.CreateCompatibleDC() + + screenshot_bmp = win32ui.CreateBitmap() + screenshot_bmp.CreateCompatibleBitmap(img_dc, width, height) + mem_dc.SelectObject(screenshot_bmp) + mem_dc.BitBlt((0, 0), (width, height), img_dc, (0, 0), win32con.SRCCOPY) + + bmpinfo = screenshot_bmp.GetInfo() + bmpstr = screenshot_bmp.GetBitmapBits(True) + screenshot = Image.frombuffer( + "RGB", + (bmpinfo["bmWidth"], bmpinfo["bmHeight"]), + bmpstr, + "raw", + "BGRX", + 0, + 1, + ) + + mem_dc.DeleteDC() + img_dc.DeleteDC() + win32gui.ReleaseDC(hdesktop, desktop_dc) + win32gui.DeleteObject(screenshot_bmp.GetHandle()) + + # Validate: check it's not all-black (common on disconnected RDP) + if screenshot.getbbox() is not None: + return screenshot + else: + logger.warning("BitBlt returned all-black image (likely disconnected RDP)") + except Exception as e: + logger.warning(f"win32 BitBlt screen grab failed: {e}") + + # Attempt 2: PrintWindow on the desktop window + try: + import win32gui + hdesktop = win32gui.GetDesktopWindow() + screenshot = _win32_print_window(hdesktop) + if screenshot is not None and screenshot.getbbox() is not None: + return screenshot + else: + logger.warning("PrintWindow on desktop returned empty image") + except Exception as e: + logger.warning(f"PrintWindow desktop capture failed: {e}") + + # Attempt 3: PrintWindow on the foreground window as a best-effort + # desktop substitute (works on disconnected RDP for GUI windows) + try: + import win32gui + fg_hwnd = win32gui.GetForegroundWindow() + if fg_hwnd and fg_hwnd != 0: + logger.info("Trying 
PrintWindow on foreground window as desktop fallback") + screenshot = _win32_print_window(fg_hwnd) + if screenshot is not None and screenshot.getbbox() is not None: + return screenshot + else: + logger.warning("PrintWindow on foreground window returned empty image") + except Exception as e: + logger.warning(f"PrintWindow foreground window capture failed: {e}") + + logger.error("All win32 screen grab methods failed") + return None + + class DesktopPhotographer(Photographer): """ Class to capture the desktop screenshot. @@ -109,11 +301,36 @@ def __init__(self, all_screens=True) -> None: def capture(self, save_path: str = None, scalar: List[int] = None) -> Image.Image: """ - Capture a screenshot. + Capture a screenshot with fallbacks. + Tries: ImageGrab(all_screens) -> ImageGrab(primary only) -> win32 API. :param save_path: The path to save the screenshot. :return: The screenshot. """ - screenshot = ImageGrab.grab(all_screens=self.all_screens) + screenshot = None + + # Attempt 1: ImageGrab with requested all_screens setting + try: + screenshot = ImageGrab.grab(all_screens=self.all_screens) + except Exception as e: + logger.warning(f"ImageGrab.grab(all_screens={self.all_screens}) failed: {e}") + + # Attempt 2: If all_screens was True, retry with primary screen only + if screenshot is None and self.all_screens: + try: + logger.info("Retrying screenshot with primary screen only") + screenshot = ImageGrab.grab(all_screens=False) + except Exception as e: + logger.warning(f"ImageGrab.grab(all_screens=False) also failed: {e}") + + # Attempt 3: win32 API fallback + if screenshot is None: + logger.info("Falling back to win32 API screen capture") + screenshot = _win32_grab_screen() + + if screenshot is None: + logger.error("All screenshot capture methods failed; returning 1x1 placeholder image") + screenshot = Image.new("RGB", (1, 1), (0, 0, 0)) + if scalar is not None: screenshot = self.rescale_image(screenshot, scalar) if save_path is not None and screenshot is not None: diff 
--git a/ufo/client/mcp/local_servers/ui_mcp_server.py b/ufo/client/mcp/local_servers/ui_mcp_server.py index d3879d71b..665a83efc 100644 --- a/ufo/client/mcp/local_servers/ui_mcp_server.py +++ b/ufo/client/mcp/local_servers/ui_mcp_server.py @@ -48,6 +48,8 @@ CONTROL_BACKEND = configs.get("CONTROL_BACKEND", ["uia"]) if configs else ["uia"] BACKEND = "win32" if "win32" in CONTROL_BACKEND else "uia" +logger = logging.getLogger(__name__) + def _get_control_rectangle(control: UIAWrapper) -> Optional[Rect]: """ @@ -815,13 +817,32 @@ def capture_window_screenshot() -> str: return "Error: No window selected" try: + screenshot = None + + # Attempt 1: capture the selected app window if ui_state.selected_app_window: - screenshot = ui_state.photographer.capture_app_window_screenshot( - ui_state.selected_app_window - ) - else: + try: + screenshot = ui_state.photographer.capture_app_window_screenshot( + ui_state.selected_app_window + ) + except Exception as win_err: + logger.warning(f"App window screenshot failed: {win_err}") + + # Validate screenshot + if screenshot is not None: + try: + w, h = screenshot.size + if w <= 1 or h <= 1: + logger.warning("App window screenshot too small, treating as invalid") + screenshot = None + except Exception: + screenshot = None + + # Attempt 2: fall back to desktop screenshot + if screenshot is None: + logger.info("Falling back to desktop screenshot") screenshot = ui_state.photographer.capture_desktop_screen_screenshot( - all_screens=True + all_screens=False ) # Encode as base64 @@ -841,18 +862,26 @@ def capture_desktop_screenshot(all_screens: bool = True) -> str: """ try: - # Capture desktop screenshot + # Capture desktop screenshot (DesktopPhotographer now has built-in fallback) screenshot = ui_state.photographer.capture_desktop_screen_screenshot( all_screens=all_screens ) + # Validate the result + if screenshot is not None: + w, h = screenshot.size + if w <= 1 or h <= 1: + logger.warning("Desktop screenshot returned placeholder image") + # 
Encode as base64 desktop_screen_data = ui_state.photographer.encode_image(screenshot) return desktop_screen_data except Exception as e: - raise ToolError(f"Failed to capture screenshot: {str(e)}") + logger.error(f"Desktop screenshot failed: {e}, returning empty image") + # Return the empty placeholder image instead of crashing + return ui_state.photographer._empty_image_string @data_mcp.tool() def get_ui_tree() -> Dict[str, Any]: diff --git a/ufo/llm/openai.py b/ufo/llm/openai.py index 2bdcf9a46..613ee7b4f 100644 --- a/ufo/llm/openai.py +++ b/ufo/llm/openai.py @@ -45,6 +45,7 @@ def __init__( self.json_schema_enabled = False self.logger = logging.getLogger(__name__) assert api_provider in ["openai", "aoai", "azure_ad"], "Invalid API Provider" + self.use_responses = bool(self.config_llm.get("USE_RESPONSES", False)) self.client: OpenAI = OpenAIService.get_openai_client( api_provider, @@ -55,30 +56,39 @@ def __init__( self.config_llm.get("API_VERSION", ""), aad_api_scope_base=self.config_llm.get("AAD_API_SCOPE_BASE", ""), aad_tenant_id=self.config_llm.get("AAD_TENANT_ID", ""), + use_responses=self.use_responses, ) self.model = self.config_llm["API_MODEL"] - # Try to automatically fix some config errors - while True: - try: - response = self.client.beta.chat.completions.parse( - model=self.model, - messages=[{"role": "user", "content": "Hello"}], - n=1, - response_format=HostAgentResponse, - ) - except openai.BadRequestError as e: - if ( - "'response_format' of type 'json_schema' is not supported" - in e.message - ): - self.logger.info( - f"Model {self.model} does not support Structured JSON Output feature. 
Switching to text mode.", + # Try to automatically fix some config errors (chat completions only) + if not self.use_responses: + while True: + try: + self.client.beta.chat.completions.parse( + model=self.model, + messages=[{"role": "user", "content": "Hello"}], + n=1, + response_format=HostAgentResponse, + ) + except openai.BadRequestError as e: + if ( + "'response_format' of type 'json_schema' is not supported" + in e.message + ): + self.logger.info( + f"Model {self.model} does not support Structured JSON Output feature. Switching to text mode.", + ) + self.config_llm["JSON_SCHEMA"] = False + self.json_schema_enabled = False + except (openai.NotFoundError, openai.AuthenticationError, openai.APIConnectionError, openai.APITimeoutError, openai.APIStatusError) as e: + self.logger.warning( + f"Startup probe for model {self.model} failed with {type(e).__name__}: {e}. " + f"Continuing without JSON schema validation." ) self.config_llm["JSON_SCHEMA"] = False self.json_schema_enabled = False - break # Exit the loop if no exception is raised + break # Exit the loop if no exception is raised def _chat_completion( self, @@ -108,6 +118,13 @@ def _chat_completion( top_p = top_p if top_p is not None else self.config["TOP_P"] try: + if self.use_responses: + return self._responses_completion( + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + top_p=top_p, + ) # Build base parameters base_params = { "model": self.model, @@ -211,6 +228,136 @@ def _chat_completion( # Handle API error, e.g. retry or log raise Exception(f"OpenAI API returned an API Error: {e}") + def _responses_completion( + self, + messages: List[Dict[str, str]], + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + top_p: Optional[float] = None, + ) -> Tuple[List[str], Optional[float]]: + """ + Generate a completion using the Responses API. 
+ """ + inputs = self._messages_to_responses_input(messages) + + base_params: Dict[str, Any] = { + "model": self.model, + "input": inputs, + } + + # Apply generation parameters for non-reasoning models + if not self.config_llm.get("REASONING_MODEL", False): + base_params.update( + { + "temperature": temperature, + "top_p": top_p, + } + ) + + if max_tokens is not None: + base_params["max_output_tokens"] = max_tokens + + # Add response format if JSON schema is enabled + if self.json_schema_enabled: + response_format_mapping = { + AgentType.HOST: HostAgentResponse, + AgentType.APP: AppAgentResponse, + AgentType.EVALUATION: EvaluationResponse, + } + response_format = response_format_mapping.get(AgentType(self.agent_type)) + if response_format: + base_params["response_format"] = type_to_response_format_param( + response_format + ) + + try: + response = self.client.responses.create(**base_params) + except openai.BadRequestError as e: + # Fallback if response_format isn't supported on Responses API + if "response_format" in str(e).lower(): + base_params.pop("response_format", None) + response = self.client.responses.create(**base_params) + else: + raise + + response_dict = response.model_dump() if hasattr(response, "model_dump") else response + content_text = self._extract_responses_text(response_dict) + + usage = response_dict.get("usage", {}) if isinstance(response_dict, dict) else {} + input_tokens = usage.get("input_tokens", 0) + output_tokens = usage.get("output_tokens", 0) + + cost = self.get_cost_estimator( + self.api_type, + self.model, + self.prices, + input_tokens, + output_tokens, + ) + + return [content_text], cost + + @staticmethod + def _messages_to_responses_input( + messages: List[Dict[str, str]], + ) -> List[Dict[str, Any]]: + """ + Convert chat-style messages to Responses API input format. 
+ """ + inputs: List[Dict[str, Any]] = [] + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + if isinstance(content, list): + converted_parts: List[Dict[str, Any]] = [] + for part in content: + if not isinstance(part, dict): + continue + part_type = part.get("type") + if part_type == "text": + converted_parts.append( + {"type": "input_text", "text": part.get("text", "")} + ) + elif part_type in ["image_url", "input_image"]: + image_url = part.get("image_url", "") + if isinstance(image_url, dict): + image_url = image_url.get("url", "") + converted_parts.append( + {"type": "input_image", "image_url": image_url} + ) + else: + # Pass through other types (e.g., computer_screenshot) if already valid + converted_parts.append(part) + inputs.append({"role": role, "content": converted_parts}) + else: + inputs.append( + { + "role": role, + "content": [{"type": "input_text", "text": str(content)}], + } + ) + return inputs + + @staticmethod + def _extract_responses_text(response: Dict[str, Any]) -> str: + """ + Extract text content from a Responses API payload. + """ + output = response.get("output", []) + chunks: List[str] = [] + for item in output: + if not isinstance(item, dict): + continue + content = item.get("content", []) + for part in content: + if not isinstance(part, dict): + continue + if "text" in part: + chunks.append(part.get("text", "")) + elif part.get("type") in ["output_text", "text"]: + chunks.append(part.get("text", "")) + return "".join(chunks).strip() + def _chat_completion_operator( self, message: Dict[str, Any] = {}, @@ -267,6 +414,7 @@ def get_openai_client( api_version: Optional[str] = None, aad_api_scope_base: Optional[str] = None, aad_tenant_id: Optional[str] = None, + use_responses: bool = False, ) -> OpenAI: """ Create an OpenAI client based on the API type. 
@@ -299,6 +447,9 @@ def get_openai_client( api_version=api_version, azure_endpoint=api_base, api_key=api_key, + default_headers={"x-ms-enable-preview": "true"} + if use_responses + else {}, ) else: assert ( @@ -314,6 +465,9 @@ def get_openai_client( api_version=api_version, azure_endpoint=api_base, azure_ad_token_provider=token_provider, + default_headers={"x-ms-enable-preview": "true"} + if use_responses + else {}, ) return client diff --git a/ufo/module/basic.py b/ufo/module/basic.py index dc7d391b2..778e76a65 100644 --- a/ufo/module/basic.py +++ b/ufo/module/basic.py @@ -45,6 +45,16 @@ console = Console() +def _safe_console_text(text: str) -> str: + """ + Avoid UnicodeEncodeError on legacy Windows consoles by stripping emoji. + """ + encoding = (console.encoding or "").lower() + if "utf" in encoding: + return text + return text.encode("ascii", "ignore").decode("ascii") + + class FileWriter: """ Simple file writer that bypasses global logging settings. @@ -287,7 +297,9 @@ def print_cost(self) -> None: if isinstance(total_cost, float): formatted_cost = "${:.2f}".format(total_cost) console.print( - f"πŸ’° Request total cost for current round is {formatted_cost}", + _safe_console_text( + f"πŸ’° Request total cost for current round is {formatted_cost}" + ), style="yellow", ) @@ -721,7 +733,9 @@ def experience_saver(self) -> None: Save the current trajectory as agent experience. """ console.print( - "πŸ“š Summarizing and saving the execution flow as experience...", + _safe_console_text( + "πŸ“š Summarizing and saving the execution flow as experience..." 
+ ), style="yellow", ) @@ -754,12 +768,16 @@ def print_cost(self) -> None: if isinstance(self.cost, float) and self.cost > 0: formatted_cost = "${:.2f}".format(self.cost) console.print( - f"πŸ’° Total request cost of the session: {formatted_cost}", + _safe_console_text( + f"πŸ’° Total request cost of the session: {formatted_cost}" + ), style="yellow", ) else: console.print( - f"ℹ️ Cost is not available for the model {ufo_config.host_agent.api_model} or {ufo_config.app_agent.api_model}.", + _safe_console_text( + f"ℹ️ Cost is not available for the model {ufo_config.host_agent.api_model} or {ufo_config.app_agent.api_model}." + ), style="yellow", ) self.logger.warning("Cost information is not available.") @@ -809,7 +827,7 @@ def evaluation(self) -> None: """ Evaluate the session. """ - console.print("πŸ“Š Evaluating the session...", style="yellow") + console.print(_safe_console_text("πŸ“Š Evaluating the session..."), style="yellow") is_visual = ufo_config.evaluation_agent.visual_mode diff --git a/ufo/prompter/eva_prompter.py b/ufo/prompter/eva_prompter.py index 4d0c2fc17..e3b8f0414 100644 --- a/ufo/prompter/eva_prompter.py +++ b/ufo/prompter/eva_prompter.py @@ -125,8 +125,9 @@ def user_content_construction_head_tail( screenshot_text = ["Initial Screenshot:", "Final Screenshot:"] for i, image in enumerate(head_tail_screenshots): - user_content.append({"type": "text", "text": screenshot_text[i]}) - user_content.append({"type": "image_url", "image_url": {"url": image}}) + if self._is_valid_screenshot_str(image): + user_content.append({"type": "text", "text": screenshot_text[i]}) + user_content.append({"type": "image_url", "image_url": {"url": image}}) user_content.append( { @@ -140,11 +141,55 @@ def user_content_construction_head_tail( return user_content + # Maximum number of images to include in evaluation to stay within API limits. + # Most APIs cap at 50 images; we leave headroom for the final screenshot. 
+ MAX_EVAL_IMAGES = 40 + + @staticmethod + def _is_valid_screenshot(image) -> bool: + """ + Check whether a screenshot is a real capture (not a 1x1 placeholder). + :param image: PIL Image or None + :return: True if the image is usable for evaluation. + """ + if image is None: + return False + try: + w, h = image.size + if w <= 1 or h <= 1: + return False + # Also check for all-black images (common placeholder) + if image.getbbox() is None: + return False + except Exception: + return False + return True + + @staticmethod + def _is_valid_screenshot_str(screenshot_str: str) -> bool: + """ + Check whether a base64 screenshot string is a real image. + Rejects the well-known 1x1 empty placeholder. + """ + if not screenshot_str or not isinstance(screenshot_str, str): + return False + if not screenshot_str.startswith("data:image/"): + return False + # The known 1x1 placeholder base64 (both the one from utils and PhotographerFacade) + if "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk" in screenshot_str: + return False + # Very small base64 payloads are likely empty/broken + if len(screenshot_str) < 200: + return False + return True + def user_content_construction_all( self, log_path: str, request: str ) -> List[Dict[str, str]]: """ Construct the prompt for the EvaluationAgent with all screenshots. + Filters out placeholder/empty images and caps the total to avoid + hitting API image-count limits. :param log_path: The path of the log. :param request: The user request. return: The prompt for the EvaluationAgent. 
@@ -159,6 +204,7 @@ def user_content_construction_all( ) trajectory = self.load_logs(log_path) + image_count = 0 for log in trajectory.app_agent_log: @@ -167,30 +213,43 @@ def user_content_construction_all( if step is None: continue - if self.is_visual: + if self.is_visual and image_count < self.MAX_EVAL_IMAGES: - screenshot_image = log.get("ScreenshotImages").get( + screenshot_image = log.get("ScreenshotImages", {}).get( "selected_control_screenshot_path" ) - screenshot_str = ufo.utils.encode_image(screenshot_image) - user_content.append( - {"type": "image_url", "image_url": {"url": screenshot_str}} - ) + if self._is_valid_screenshot(screenshot_image): + screenshot_str = ufo.utils.encode_image(screenshot_image) + if self._is_valid_screenshot_str(screenshot_str): + user_content.append( + {"type": "image_url", "image_url": {"url": screenshot_str}} + ) + image_count += 1 step_trajectory = self.get_step_trajectory(log) user_content.append({"type": "text", "text": json.dumps(step_trajectory)}) if self.is_visual: - - user_content.append({"type": "text", "text": ""}) - screenshot_str = ufo.utils.encode_image(trajectory.final_screenshot_image) - + final_image = trajectory.final_screenshot_image + if self._is_valid_screenshot(final_image): + screenshot_str = ufo.utils.encode_image(final_image) + if self._is_valid_screenshot_str(screenshot_str): + user_content.append({"type": "text", "text": ""}) + user_content.append( + { + "type": "image_url", + "image_url": {"url": screenshot_str}, + } + ) + image_count += 1 + + if image_count == 0: user_content.append( { - "type": "image_url", - "image_url": {"url": screenshot_str}, + "type": "text", + "text": "", } ) diff --git a/ufo/trajectory/parser.py b/ufo/trajectory/parser.py index 7aa2e7d3b..477126347 100644 --- a/ufo/trajectory/parser.py +++ b/ufo/trajectory/parser.py @@ -116,11 +116,15 @@ def _load_single_screenshot( """ screenshot_log_path = step_log.get(key) - if screenshot_log_path is not None: + # Skip None and empty 
strings (empty string causes os.path.join to + # return the directory itself, leading to "is a directory" errors) + if screenshot_log_path is not None and screenshot_log_path.strip(): screenshot_file_name = os.path.basename(screenshot_log_path) + if not screenshot_file_name: + return None screenshot_file_path = os.path.join(self.file_path, screenshot_file_name) - if os.path.exists(screenshot_file_path): + if os.path.isfile(screenshot_file_path): screenshot = self.load_screenshot(screenshot_file_path) return screenshot else: diff --git a/ufo/utils/__init__.py b/ufo/utils/__init__.py index a679ef9e6..34a1da625 100644 --- a/ufo/utils/__init__.py +++ b/ufo/utils/__init__.py @@ -317,6 +317,9 @@ def load_image(image_path: str) -> Image.Image: :return: The image. """ try: + if os.path.isdir(image_path): + logger.warning(f"Image path {image_path} is a directory, not a file.") + return Image.new("RGB", (1, 1), color="white") if not os.path.exists(image_path): logger.warning(f"Image file {image_path} does not exist.") return Image.new("RGB", (1, 1), color="white") @@ -349,6 +352,8 @@ def save_image_string(image_string: str, save_path: str) -> Image.Image: :return: The saved image. """ try: + if not isinstance(image_string, str) or not image_string.startswith("data:image/"): + image_string = _empty_image_string # Ensure the directory exists save_dir = os.path.dirname(save_path) if (