diff --git a/.env.example b/.env.example index fc7f5e5..6226a80 100644 --- a/.env.example +++ b/.env.example @@ -2,4 +2,7 @@ CHROME_PATH= # OpenAI API key for OpenAI model access -OPENAI_API_KEY=your-api-key-here \ No newline at end of file +OPENAI_API_KEY=your-api-key-here + +# Set to true if you want api calls to wait for tasks to complete (default is false) +PATIENT=false \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d01ecd6..34e9945 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,7 +79,7 @@ jobs: permissions: contents: read - packages: write + # packages: write # attestations: write id-token: write @@ -87,12 +87,8 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Log in to the Container registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} + - name: Log in to registry + run: docker login ghcr.io -u "${{ secrets.COBROWSER_PACKAGE_USER }}" --password-stdin <<< "${{ secrets.COBROWSER_PACKAGE_TOKEN }}" - name: Extract metadata (tags, labels) for Docker id: meta diff --git a/README.md b/README.md index 0a68e8a..3592ddc 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ uv run server --port 8000 ``` OPENAI_API_KEY=[your api key] CHROME_PATH=[only change this if you have a custom chrome build] +PATIENT=false # Set to true if you want api calls to wait for tasks to complete (default is false) ``` - we will be adding support for other LLM providers to power browser-use diff --git a/server/server.py b/server/server.py index 301e585..8472957 100644 --- a/server/server.py +++ b/server/server.py @@ -44,7 +44,26 @@ load_dotenv() -def init_configuration() -> Dict[str, any]: +def parse_bool_env(env_var: str, default: bool = False) -> bool: + """ + Parse a boolean environment variable. + + Args: + env_var: The environment variable name + default: Default value if not set + + Returns: + Boolean value of the environment variable + """ + value = os.environ.get(env_var) + if value is None: + return default + + # Consider various representations of boolean values + return value.lower() in ("true", "yes", "1", "y", "on") + + +def init_configuration() -> Dict[str, Any]: """ Initialize configuration from environment variables with defaults. @@ -78,6 +97,8 @@ def init_configuration() -> Dict[str, any]: "--disable-dev-shm-usage", "--remote-debugging-port=0", # Use random port to avoid conflicts ], + # Patient mode - if true, functions wait for task completion before returning + "PATIENT_MODE": parse_bool_env("PATIENT", False), } return config @@ -163,6 +184,9 @@ async def run_browser_task_async( This function executes a browser automation task with the given URL and action, and updates the task store with progress and results. + When PATIENT_MODE is enabled, the calling function will wait for this function + to complete before returning to the client. + Args: task_id: Unique identifier for the task url: URL to navigate to @@ -382,7 +406,9 @@ async def call_tool( arguments: The arguments to pass to the tool Returns: - A list of content objects to return to the client + A list of content objects to return to the client. + When PATIENT_MODE is enabled, the browser_use tool will wait for the task to complete + and return the full result immediately instead of just the task ID. Raises: ValueError: If required arguments are missing @@ -408,7 +434,7 @@ async def call_tool( } # Start task in background - asyncio.create_task( + _task = asyncio.create_task( run_browser_task_async( task_id=task_id, url=arguments["url"], @@ -420,6 +446,38 @@ async def call_tool( ) ) + # If PATIENT is set, wait for the task to complete + if CONFIG["PATIENT_MODE"]: + try: + await _task + # Return the completed task result instead of just the ID + task_data = task_store[task_id] + if task_data["status"] == "failed": + logger.error( + f"Task {task_id} failed: {task_data.get('error', 'Unknown error')}" + ) + return [ + types.TextContent( + type="text", + text=json.dumps(task_data, indent=2), + ) + ] + except Exception as e: + logger.error(f"Error in patient mode execution: {str(e)}") + traceback_str = traceback.format_exc() + # Update task store with error + task_store[task_id]["status"] = "failed" + task_store[task_id]["error"] = str(e) + task_store[task_id]["traceback"] = traceback_str + task_store[task_id]["end_time"] = datetime.now().isoformat() + # Return error information + return [ + types.TextContent( + type="text", + text=json.dumps(task_store[task_id], indent=2), + ) + ] + # Return task ID immediately with explicit sleep instruction return [ types.TextContent( @@ -497,43 +555,85 @@ async def list_tools() -> list[types.Tool]: """ List the available tools for the MCP client. + Returns different tool descriptions based on the PATIENT_MODE configuration. + When PATIENT_MODE is enabled, the browser_use tool description indicates it returns + complete results directly. When disabled, it indicates async operation. + Returns: - A list of tool definitions + A list of tool definitions appropriate for the current configuration """ - return [ - types.Tool( - name="browser_use", - description="Performs a browser action and returns a task ID for async execution", - inputSchema={ - "type": "object", - "required": ["url", "action"], - "properties": { - "url": { - "type": "string", - "description": "URL to navigate to", + patient_mode = CONFIG["PATIENT_MODE"] + + if patient_mode: + return [ + types.Tool( + name="browser_use", + description="Performs a browser action and returns the complete result directly (patient mode active)", + inputSchema={ + "type": "object", + "required": ["url", "action"], + "properties": { + "url": { + "type": "string", + "description": "URL to navigate to", + }, + "action": { + "type": "string", + "description": "Action to perform in the browser", + }, }, - "action": { - "type": "string", - "description": "Action to perform in the browser", + }, + ), + types.Tool( + name="browser_get_result", + description="Gets the result of an asynchronous browser task (not needed in patient mode as browser_use returns complete results directly)", + inputSchema={ + "type": "object", + "required": ["task_id"], + "properties": { + "task_id": { + "type": "string", + "description": "ID of the task to get results for", + } + }, + }, + ), + ] + else: + return [ + types.Tool( + name="browser_use", + description="Performs a browser action and returns a task ID for async execution", + inputSchema={ + "type": "object", + "required": ["url", "action"], + "properties": { + "url": { + "type": "string", + "description": "URL to navigate to", + }, + "action": { + "type": "string", + "description": "Action to perform in the browser", + }, }, }, - }, - ), - types.Tool( - name="browser_get_result", - description="Gets the result of an asynchronous browser task", - inputSchema={ - "type": "object", - "required": ["task_id"], - "properties": { - "task_id": { - "type": "string", - "description": "ID of the task to get results for", - } + ), + types.Tool( + name="browser_get_result", + description="Gets the result of an asynchronous browser task", + inputSchema={ + "type": "object", + "required": ["task_id"], + "properties": { + "task_id": { + "type": "string", + "description": "ID of the task to get results for", + } + }, }, - }, - ), - ] + ), + ] @app.list_resources() async def list_resources() -> list[types.Resource]: