From c8002d9220d4cb252f889225575831f56cdf8069 Mon Sep 17 00:00:00 2001 From: mulin_ Date: Sat, 4 Oct 2025 16:50:46 +0800 Subject: [PATCH 1/3] fix: make OSWorldACI env optional, prevent crashes in local CLI runs, safely skip code-agent when no environment/controller is present, and improve debugging transparency --- gui_agents/s3/agents/grounding.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/gui_agents/s3/agents/grounding.py b/gui_agents/s3/agents/grounding.py index 28288fab..dca4691f 100644 --- a/gui_agents/s3/agents/grounding.py +++ b/gui_agents/s3/agents/grounding.py @@ -179,7 +179,8 @@ def set_cell_values(new_cell_values: dict[str, str], app_name: str = "Untitled 1 class OSWorldACI(ACI): def __init__( self, - env, + env=None, + *, platform: str, engine_params_for_generation: Dict, engine_params_for_grounding: Dict, @@ -563,13 +564,29 @@ def call_code_agent(self, task: str = None): logger.info(f"Executing FULL TASK: {task_to_execute}") if task_to_execute: + controller = getattr(self.env, "controller", None) if self.env else None + + if controller is None: + logger.warning( + "Environment controller unavailable; skipping code agent execution." 
+ ) + self.last_code_agent_result = { + "task_instruction": task_to_execute, + "completion_reason": "NO_ENV_CONTROLLER", + "summary": "Code agent execution skipped because no environment controller was provided.", + "execution_history": [], + "steps_executed": 0, + "budget": self.code_agent.budget, + } + return "import time; time.sleep(1)" + print("obs keys: ", self.obs.keys()) screenshot = self.obs.get("screenshot", "") if self.obs else "" logger.info(f"Screenshot available: {'Yes' if screenshot else 'No'}") logger.info("Executing code agent...") result = self.code_agent.execute( - task_to_execute, screenshot, self.env.controller + task_to_execute, screenshot, controller ) # Store the result for the worker to access From 1c5004cb3d86dfd8e989f52543a577ef85e2b02e Mon Sep 17 00:00:00 2001 From: mulin_ Date: Sun, 5 Oct 2025 10:53:59 +0800 Subject: [PATCH 2/3] Add environment self-check utility (agent-s_doctor) for pre-run and post-failure diagnostics --- README.md | 9 + self_check.py | 521 ++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 + 3 files changed, 532 insertions(+) create mode 100644 self_check.py diff --git a/README.md b/README.md index d46c67db..d146a1fd 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,15 @@ We support Azure OpenAI, Anthropic, Gemini, Open Router, and vLLM inference. See ### Grounding Models (Required) For optimal performance, we recommend [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) hosted on Hugging Face Inference Endpoints or another provider. See [Hugging Face Inference Endpoints](https://huggingface.co/learn/cookbook/en/enterprise_dedicated_endpoints) for setup instructions. +### 🔍 Self-check (Recommended) +Before running the agent, you can verify that your environment has the right dependencies, permissions, and API keys by running: + +```bash +agent-s_doctor +``` + +The tool highlights anything missing and suggests fixes. 
Use `agent-s_doctor --json` if you want to parse the results programmatically, and `agent-s_doctor --skip-screenshot` when running in a headless environment. + ## 🚀 Usage diff --git a/self_check.py b/self_check.py new file mode 100644 index 00000000..a44fc793 --- /dev/null +++ b/self_check.py @@ -0,0 +1,521 @@ +"""Command-line self-check utility for Agent S dependencies and permissions.""" + +from __future__ import annotations + +import argparse +import json +import os +import platform +import shutil +import sys +from dataclasses import asdict, dataclass +from importlib import import_module, metadata +from importlib.metadata import PackageNotFoundError +from pathlib import Path +from typing import Dict, Iterable, List, Optional + +_STATUS_LABELS: Dict[str, str] = {"PASS": "PASS", "WARN": "WARN", "FAIL": "FAIL"} + + +@dataclass +class CheckResult: + """Outcome of a single self-check item.""" + + name: str + status: str + message: str + remedy: Optional[str] = None + + def to_dict(self) -> Dict[str, Optional[str]]: + """Convert the result into a serialisable dictionary.""" + + return asdict(self) + + +@dataclass +class DependencySpec: + """Specification describing how to import and label a dependency.""" + + name: str + import_name: str + package_name: Optional[str] = None + optional: bool = False + purpose: Optional[str] = None + + +CORE_DEPENDENCIES: Iterable[DependencySpec] = ( + DependencySpec( + name="pyautogui", + import_name="pyautogui", + purpose="Required for screenshot capture and input control", + ), + DependencySpec( + name="Pillow", + import_name="PIL", + package_name="Pillow", + purpose="Image processing backend for screenshots", + ), + DependencySpec( + name="numpy", + import_name="numpy", + purpose="Used across perception and planning modules", + ), + DependencySpec( + name="requests", + import_name="requests", + purpose="HTTP client for API integrations", + ), + DependencySpec( + name="openai", + import_name="openai", + purpose="OpenAI-compatible 
provider client", + ), + DependencySpec( + name="anthropic", + import_name="anthropic", + optional=True, + purpose="Required when using Anthropic models", + ), + DependencySpec( + name="tiktoken", + import_name="tiktoken", + purpose="Token counting utilities", + ), + DependencySpec( + name="paddleocr", + import_name="paddleocr", + optional=True, + purpose="OCR support for perception tasks", + ), + DependencySpec( + name="paddlepaddle", + import_name="paddle", + optional=True, + purpose="Deep learning runtime used by PaddleOCR", + ), + DependencySpec( + name="pytesseract", + import_name="pytesseract", + optional=True, + purpose="Fallback OCR engine", + ), + DependencySpec( + name="google-genai", + import_name="google.genai", + package_name="google-genai", + optional=True, + purpose="Needed for Google Gemini providers", + ), + DependencySpec( + name="selenium", + import_name="selenium", + optional=True, + purpose="Browser automation helpers", + ), +) + +_API_KEY_VARS: Dict[str, str] = { + "OPENAI_API_KEY": "Required for OpenAI-compatible providers (OpenAI, SiliconFlow, Fireworks, etc.)", + "ANTHROPIC_API_KEY": "Needed when running Anthropic models", + "TOGETHER_API_KEY": "Needed when using Together AI endpoints", + "SILICONFLOW_API_KEY": "Needed when using SiliconFlow endpoints", + "GOOGLE_API_KEY": "Needed when using Google Gemini", + "DEEPSEEK_API_KEY": "Needed when using DeepSeek", +} + + +def check_python_version() -> CheckResult: + """Validate that the interpreter version falls within the supported range.""" + + major, minor = sys.version_info[:2] + if 3 <= major: + if 9 <= minor <= 12 or major > 3: + return CheckResult( + name="Python version", + status="PASS", + message=f"Detected Python {major}.{minor}", + ) + return CheckResult( + name="Python version", + status="FAIL", + message=f"Python {major}.{minor} is unsupported. 
Agent S requires Python 3.9-3.12.", + remedy="Install Python 3.9-3.12 and recreate your virtual environment.", + ) + + +def check_dependency(spec: DependencySpec) -> CheckResult: + """Import a dependency and report whether it is available.""" + + package_key = spec.package_name or spec.name + try: + module = import_module(spec.import_name) + version: Optional[str] = None + try: + version = getattr(module, "__version__", None) + except Exception: + version = None + if version is None: + try: + version = metadata.version(package_key) + except PackageNotFoundError: + version = "unknown" + message = f"Found {package_key} {version}" if version else f"Found {package_key}" + if spec.purpose: + message += f" — {spec.purpose}" + return CheckResult(name=f"Dependency: {spec.name}", status="PASS", message=message) + except ModuleNotFoundError: + status = "WARN" if spec.optional else "FAIL" + message = f"{spec.name} not installed." + if spec.purpose: + message += f" {spec.purpose}." + remedy = f"pip install {package_key}" if package_key else "Install missing dependency." 
+ return CheckResult( + name=f"Dependency: {spec.name}", + status=status, + message=message, + remedy=remedy, + ) + except Exception as exc: # pragma: no cover - defensive guard + status = "WARN" if spec.optional else "FAIL" + return CheckResult( + name=f"Dependency: {spec.name}", + status=status, + message=f"Error importing {spec.name}: {exc}", + remedy=f"Reinstall {package_key} or inspect the stack trace.", + ) + + +def check_logs_directory(base_dir: Path) -> CheckResult: + """Verify that the logs directory exists and is writable.""" + + logs_dir = base_dir / "logs" + try: + logs_dir.mkdir(exist_ok=True) + test_file = logs_dir / ".write_test" + test_file.write_text("ok", encoding="utf-8") + test_file.unlink(missing_ok=True) + return CheckResult( + name="Filesystem permissions", + status="PASS", + message=f"logs/ directory is writable at {logs_dir.resolve()}", + ) + except Exception as exc: + return CheckResult( + name="Filesystem permissions", + status="FAIL", + message=f"Cannot write to {logs_dir}: {exc}", + remedy="Ensure the current user can create files in the project directory.", + ) + + +def check_api_keys() -> List[CheckResult]: + """Detect configured API keys for model and grounding providers.""" + + results: List[CheckResult] = [] + detected = [var for var in _API_KEY_VARS if os.environ.get(var)] + if detected: + results.append( + CheckResult( + name="Model API keys", + status="PASS", + message="Detected: " + ", ".join(detected), + ) + ) + else: + results.append( + CheckResult( + name="Model API keys", + status="WARN", + message="No LLM provider API keys detected in environment.", + remedy="Export OPENAI_API_KEY, ANTHROPIC_API_KEY, SILICONFLOW_API_KEY, or another provider key before running Agent S.", + ) + ) + hf_token = os.environ.get("HF_TOKEN") + if hf_token: + results.append( + CheckResult( + name="HF_TOKEN", + status="PASS", + message="Detected Hugging Face token (HF_TOKEN)", + ) + ) + else: + results.append( + CheckResult( + name="HF_TOKEN", + 
status="WARN", + message="HF_TOKEN not set. Required when hosting grounding models like UI-TARS on Hugging Face.", + remedy="Export HF_TOKEN with a Hugging Face access token that has Inference Endpoint permissions.", + ) + ) + return results + + +def check_macos_permissions() -> List[CheckResult]: + """Inspect macOS automation and screen recording permissions.""" + + results: List[CheckResult] = [] + try: + import Quartz # type: ignore + + accessibility_ok: Optional[bool] = None + try: + options = {Quartz.kAXTrustedCheckOptionPrompt: False} + accessibility_ok = Quartz.AXIsProcessTrustedWithOptions(options) + except AttributeError: + try: + accessibility_ok = Quartz.AXIsProcessTrusted() + except AttributeError: + accessibility_ok = None + if accessibility_ok is not None: + if accessibility_ok: + results.append( + CheckResult( + name="macOS Accessibility", + status="PASS", + message="Automation permission granted (Accessibility).", + ) + ) + else: + results.append( + CheckResult( + name="macOS Accessibility", + status="FAIL", + message="Accessibility permission not granted.", + remedy="Open System Settings → Privacy & Security → Accessibility and enable Terminal (or your Python IDE) for automation.", + ) + ) + else: + results.append( + CheckResult( + name="macOS Accessibility", + status="WARN", + message="Unable to verify Accessibility permission (AXIsProcessTrusted unavailable).", + remedy="Manually confirm in System Settings → Privacy & Security → Accessibility.", + ) + ) + + if hasattr(Quartz, "CGPreflightScreenCaptureAccess"): + screen_ok = Quartz.CGPreflightScreenCaptureAccess() + if screen_ok: + results.append( + CheckResult( + name="macOS Screen Recording", + status="PASS", + message="Screen recording permission granted.", + ) + ) + else: + results.append( + CheckResult( + name="macOS Screen Recording", + status="FAIL", + message="Screen recording permission not granted.", + remedy="Open System Settings → Privacy & Security → Screen Recording and 
allow Terminal (or your Python IDE).", + ) + ) + else: + results.append( + CheckResult( + name="macOS Screen Recording", + status="WARN", + message="Cannot verify screen recording permission on this macOS version.", + remedy="Manually check System Settings → Privacy & Security → Screen Recording.", + ) + ) + except ModuleNotFoundError: + results.append( + CheckResult( + name="macOS Permissions", + status="WARN", + message="pyobjc not installed; macOS permission checks skipped.", + remedy="pip install pyobjc to enable automated permission checks.", + ) + ) + except Exception as exc: # pragma: no cover - safety net + results.append( + CheckResult( + name="macOS Permissions", + status="WARN", + message=f"Unable to verify macOS permissions: {exc}", + remedy="Check Accessibility and Screen Recording permissions manually.", + ) + ) + return results + + +def check_linux_dependencies() -> List[CheckResult]: + """Confirm Linux-specific dependencies required by pyautogui.""" + + results: List[CheckResult] = [] + if shutil.which("scrot"): + results.append( + CheckResult( + name="Linux screenshot backend", + status="PASS", + message="Found 'scrot' command (required by pyautogui).", + ) + ) + else: + results.append( + CheckResult( + name="Linux screenshot backend", + status="WARN", + message="'scrot' not found. pyautogui screenshots may fail on Linux.", + remedy="Install scrot via your package manager (e.g., sudo apt install scrot).", + ) + ) + return results + + +def check_windows_dependencies() -> List[CheckResult]: + """Ensure Windows automation helpers are present.""" + + results: List[CheckResult] = [] + try: + import_module("pywinauto") + results.append( + CheckResult( + name="pywinauto", + status="PASS", + message="pywinauto available for Windows automation.", + ) + ) + except ModuleNotFoundError: + results.append( + CheckResult( + name="pywinauto", + status="WARN", + message="pywinauto not installed. 
Required for Windows automation features.", + remedy="pip install pywinauto", + ) + ) + try: + import_module("win32api") + results.append( + CheckResult( + name="pywin32", + status="PASS", + message="pywin32 available.", + ) + ) + except ModuleNotFoundError: + results.append( + CheckResult( + name="pywin32", + status="WARN", + message="pywin32 not installed. Required for Windows automation features.", + remedy="pip install pywin32", + ) + ) + return results + + +def check_screenshot(skip: bool = False) -> Optional[CheckResult]: + """Attempt to capture a screenshot, unless the user opted out.""" + + if skip: + return CheckResult( + name="Screenshot capture", + status="WARN", + message="Screenshot test skipped by user request.", + ) + try: + import pyautogui + + pyautogui.screenshot() + return CheckResult( + name="Screenshot capture", + status="PASS", + message="pyautogui.screenshot() succeeded.", + ) + except Exception as exc: + return CheckResult( + name="Screenshot capture", + status="WARN", + message=f"Screenshot capture failed: {exc}", + remedy="Grant screen recording permission and ensure a display is available.", + ) + + +def collect_checks(skip_screenshot: bool) -> List[CheckResult]: + """Aggregate all self-check results into a single list.""" + + results: List[CheckResult] = [] + results.append(check_python_version()) + results.extend(check_dependency(spec) for spec in CORE_DEPENDENCIES) + results.append(check_logs_directory(Path.cwd())) + results.extend(check_api_keys()) + + current_os = platform.system() + if current_os == "Darwin": + results.extend(check_macos_permissions()) + elif current_os == "Linux": + results.extend(check_linux_dependencies()) + elif current_os == "Windows": + results.extend(check_windows_dependencies()) + + screenshot_result = check_screenshot(skip_screenshot) + if screenshot_result: + results.append(screenshot_result) + + return results + + +def print_results(results: List[CheckResult]) -> None: + """Render self-check results in a 
tabular text format.""" + + name_width = max((len(result.name) for result in results), default=10) + header = f"{'Status':<8} {'Check':<{name_width}} Details" + separator = "-" * len(header) + print(separator) + print(header) + print(separator) + for result in results: + status_display = _STATUS_LABELS.get(result.status, result.status) + print(f"{status_display:<8} {result.name:<{name_width}} {result.message}") + if result.remedy: + print(f" {'':<{name_width}} ↪ {result.remedy}") + print(separator) + + summary = { + "pass": sum(result.status == "PASS" for result in results), + "warn": sum(result.status == "WARN" for result in results), + "fail": sum(result.status == "FAIL" for result in results), + } + print( + f"Summary → PASS: {summary['pass']} · WARN: {summary['warn']} · FAIL: {summary['fail']}" + ) + + +def main(argv: Optional[List[str]] = None) -> int: + """Entry point for the Agent S self-check command.""" + + parser = argparse.ArgumentParser( + description="Run Agent S permission and dependency self-checks." 
+ ) + parser.add_argument( + "--json", + action="store_true", + help="Output results as JSON instead of human-readable text.", + ) + parser.add_argument( + "--skip-screenshot", + action="store_true", + help="Skip the live screenshot capture test (for headless environments).", + ) + args = parser.parse_args(argv) + + results = collect_checks(skip_screenshot=args.skip_screenshot) + + if args.json: + payload = [result.to_dict() for result in results] + print(json.dumps(payload, indent=2, ensure_ascii=False)) + else: + print_results(results) + + return 1 if any(result.status == "FAIL" for result in results) else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/setup.py b/setup.py index c6f0b8a6..d71d5c9e 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,7 @@ author="Simular AI", author_email="eric@simular.ai", packages=find_packages(), + py_modules=["self_check"], install_requires=[ "numpy", "backoff", @@ -36,6 +37,7 @@ entry_points={ "console_scripts": [ "agent_s=gui_agents.s3.cli_app:main", + "agent-s_doctor=self_check:main", ], }, classifiers=[ From 949ddc8625d6bbec1ca74a83f3f264f63babec98 Mon Sep 17 00:00:00 2001 From: mulin_ Date: Sun, 5 Oct 2025 11:00:43 +0800 Subject: [PATCH 3/3] syncing changes from remote repository --- gui_agents/s3/agents/grounding.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/gui_agents/s3/agents/grounding.py b/gui_agents/s3/agents/grounding.py index dca4691f..4f5c0ef3 100644 --- a/gui_agents/s3/agents/grounding.py +++ b/gui_agents/s3/agents/grounding.py @@ -179,8 +179,7 @@ def set_cell_values(new_cell_values: dict[str, str], app_name: str = "Untitled 1 class OSWorldACI(ACI): def __init__( self, - env=None, - *, + env, platform: str, engine_params_for_generation: Dict, engine_params_for_grounding: Dict, @@ -564,29 +563,13 @@ def call_code_agent(self, task: str = None): logger.info(f"Executing FULL TASK: {task_to_execute}") if task_to_execute: - controller = getattr(self.env, 
"controller", None) if self.env else None - - if controller is None: - logger.warning( - "Environment controller unavailable; skipping code agent execution." - ) - self.last_code_agent_result = { - "task_instruction": task_to_execute, - "completion_reason": "NO_ENV_CONTROLLER", - "summary": "Code agent execution skipped because no environment controller was provided.", - "execution_history": [], - "steps_executed": 0, - "budget": self.code_agent.budget, - } - return "import time; time.sleep(1)" - print("obs keys: ", self.obs.keys()) screenshot = self.obs.get("screenshot", "") if self.obs else "" logger.info(f"Screenshot available: {'Yes' if screenshot else 'No'}") logger.info("Executing code agent...") result = self.code_agent.execute( - task_to_execute, screenshot, controller + task_to_execute, screenshot, self.env.controller ) # Store the result for the worker to access @@ -669,4 +652,4 @@ def done( @agent_action def fail(self): """End the current task with a failure. Use this when you believe the entire task is impossible to complete.""" - return """FAIL""" + return """FAIL""" \ No newline at end of file