|
| 1 | +# Copyright 2025 Google LLC |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +import asyncio |
| 16 | +import os |
| 17 | + |
| 18 | +from google import genai |
| 19 | +from google.genai.types import ( |
| 20 | + ComputerUse, |
| 21 | + Content, |
| 22 | + Environment, |
| 23 | + FunctionResponse, |
| 24 | + FunctionResponseBlob, |
| 25 | + GenerateContentConfig, |
| 26 | + Part, |
| 27 | + Tool, |
| 28 | +) |
| 29 | +from playwright.async_api import Page, async_playwright |
| 30 | + |
# --- CONFIGURATION ---
# Every setting is read from the process environment so that project-specific
# values never need to be hard-coded in this file.
PROJECT_ID = os.getenv("GOOGLE_PROJECT_ID")  # None when the variable is unset
LOCATION = os.getenv("GOOGLE_LOCATION", "global")
MODEL_ID = os.getenv("MODEL_ID", "gemini-2.5-computer-use-preview-10-2025")
| 36 | + |
| 37 | + |
| 38 | +# --- HELPER FUNCTIONS --- |
| 39 | + |
| 40 | + |
def normalize_x(x: int, screen_width: int) -> int:
    """Convert a normalized x coordinate (0-1000) to an actual pixel coordinate.

    Args:
        x: Horizontal position on the 0-1000 normalized grid.
        screen_width: Width of the target viewport in pixels.

    Returns:
        The horizontal pixel offset, truncated toward zero.
    """
    # Multiply before dividing. For integer inputs the product is exact, so
    # only the final division rounds. Dividing first rounds twice and can land
    # just below a whole number (290 / 1000 * 100 == 28.999999999999996),
    # which int() would then truncate one pixel short.
    return int(x * screen_width / 1000)
| 44 | + |
| 45 | + |
def normalize_y(y: int, screen_height: int) -> int:
    """Convert a normalized y coordinate (0-1000) to an actual pixel coordinate.

    Args:
        y: Vertical position on the 0-1000 normalized grid.
        screen_height: Height of the target viewport in pixels.

    Returns:
        The vertical pixel offset, truncated toward zero.
    """
    # Multiply before dividing. For integer inputs the product is exact, so
    # only the final division rounds. Dividing first rounds twice and can land
    # just below a whole number (290 / 1000 * 100 == 28.999999999999996),
    # which int() would then truncate one pixel short.
    return int(y * screen_height / 1000)
| 49 | + |
| 50 | + |
async def execute_function_calls(
    response, page: Page, screen_width: int, screen_height: int
) -> tuple[str, list[tuple[str, str]]]:
    """Extract and execute the function calls contained in a model response.

    Only the first candidate of ``response`` is inspected. Any text parts are
    printed as the model's reasoning, then each function call is executed
    against ``page`` in order.

    Args:
        response: Model response; ``response.candidates[0]`` must exist.
        page: Playwright page that receives the browser actions.
        screen_width: Viewport width in pixels, used to scale x coordinates.
        screen_height: Viewport height in pixels, used to scale y coordinates.

    Returns:
        ``("NO_ACTION", [])`` when the response has no function calls.
        Otherwise ``("CONTINUE", results)``, where ``results`` holds one
        ``(function_name, outcome)`` pair per call and ``outcome`` is
        ``"success"``, ``"unknown_function"`` or ``"error: <message>"``.
    """
    await asyncio.sleep(0.1)

    # A candidate may carry no content, and content may carry no parts (both
    # are optional in the google-genai types); treat either case as "no parts"
    # instead of failing while iterating over None.
    content = response.candidates[0].content
    parts = (content.parts if content else None) or []

    function_calls = [
        part.function_call
        for part in parts
        if getattr(part, "function_call", None)
    ]
    thoughts = [part.text for part in parts if getattr(part, "text", None)]

    if thoughts:
        print(f"🤔 Model Reasoning: {' '.join(thoughts)}")

    if not function_calls:
        return "NO_ACTION", []

    results = []
    for function_call in function_calls:
        result = None
        # ``args`` may be None for calls without arguments; with an empty dict
        # a missing required argument surfaces as a KeyError naming the key,
        # which the handler below reports back to the caller.
        args = function_call.args or {}
        print(f"⚡ Executing Action: {function_call.name}")
        try:
            if function_call.name == "open_web_browser":
                result = "success"  # The browser is already open
            elif function_call.name == "navigate":
                await page.goto(args["url"])
                result = "success"
            elif function_call.name == "click_at":
                actual_x = normalize_x(args["x"], screen_width)
                actual_y = normalize_y(args["y"], screen_height)
                await page.mouse.click(actual_x, actual_y)
                result = "success"
            elif function_call.name == "type_text_at":
                text_to_type = args["text"]
                print(f'[DEBUG] Typing text: "{text_to_type}"')
                actual_x = normalize_x(args["x"], screen_width)
                actual_y = normalize_y(args["y"], screen_height)
                # Click first so the target element has focus before typing.
                await page.mouse.click(actual_x, actual_y)
                await asyncio.sleep(0.1)
                await page.keyboard.type(text_to_type)
                if args.get("press_enter", False):
                    await page.keyboard.press("Enter")
                result = "success"
            else:
                result = "unknown_function"
        except Exception as e:
            # Deliberately broad: a failed action is recorded as that action's
            # outcome so the remaining calls still run.
            print(f"❗️ Error executing {function_call.name}: {e}")
            result = f"error: {e!s}"
        results.append((function_call.name, result))
    return "CONTINUE", results
| 108 | + |
| 109 | + |
| 110 | +# --- THE AGENT LOOP --- |
| 111 | + |
| 112 | + |
async def agent_loop(initial_prompt: str, max_turns: int = 5) -> None:
    """Run the Computer Use agent against a locally launched Chromium browser.

    Each turn sends the conversation so far to the model, executes the browser
    actions it requests, and replies with the current URL and a screenshot.
    The loop stops when the model returns no candidates, when it answers
    without requesting an action, or after ``max_turns`` turns.

    Args:
        initial_prompt: Natural-language task for the agent.
        max_turns: Maximum number of model round trips.

    Raises:
        ValueError: If the GOOGLE_PROJECT_ID environment variable is not set.
    """
    if not PROJECT_ID:
        raise ValueError("GOOGLE_PROJECT_ID environment variable not set.")

    client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        # The try/finally lives inside the ``async with`` so the browser is
        # closed while the Playwright driver is still running.
        try:
            page = await browser.new_page()
            sw, sh = 960, 1080
            await page.set_viewport_size({"width": sw, "height": sh})

            print(f"🎬 Starting Agent Loop with prompt: '{initial_prompt}'")
            config = GenerateContentConfig(
                tools=[
                    Tool(
                        computer_use=ComputerUse(
                            environment=Environment.ENVIRONMENT_BROWSER,
                        )
                    )
                ],
            )
            screenshot = await page.screenshot()
            contents = [
                Content(
                    role="user",
                    parts=[
                        Part(text=initial_prompt),
                        Part.from_bytes(data=screenshot, mime_type="image/png"),
                    ],
                )
            ]
            for turn in range(max_turns):
                print(f"\n--- 🔁 Turn {turn + 1} ---")
                print(f"[DEBUG] Current URL: {page.url}")

                # Use the SDK's async client so the request does not block the
                # event loop that Playwright also runs on.
                response = await client.aio.models.generate_content(
                    model=MODEL_ID, contents=contents, config=config
                )

                if not response.candidates:
                    print("❗️ Model returned no candidates. Terminating loop.")
                    print("Full Response:", response)
                    break

                candidate = response.candidates[0]
                print(f"[DEBUG] Model Finish Reason: {candidate.finish_reason}")

                # Content and its parts are optional in the SDK types; treat a
                # missing value as an empty turn rather than iterating None.
                model_content = candidate.content
                parts = (model_content.parts if model_content else None) or []
                if model_content is not None:
                    contents.append(model_content)
                    print("[DEBUG] Appended model response to history.")

                # Check the attribute's value, not its existence: every Part
                # has a ``function_call`` attribute, which is simply unset on
                # text parts.
                if not any(getattr(part, "function_call", None) for part in parts):
                    final_text = "".join(
                        part.text for part in parts if getattr(part, "text", None)
                    )
                    if final_text:
                        print(f"✅ Agent Finished: {final_text}")
                    # No action was requested, so there is nothing to report
                    # back; stop instead of re-sending the same history.
                    break

                status, execution_results = await execute_function_calls(
                    response, page, sw, sh
                )
                print(
                    f"[DEBUG] Execution Results: status='{status}', results={execution_results}"
                )

                # Defensive: normally unreachable because of the check above.
                if status == "NO_ACTION":
                    continue

                # All actions of this turn have already run, so one capture of
                # the page state serves every function response.
                screenshot = await page.screenshot()
                current_url = page.url
                # Content.parts holds Part objects, so each FunctionResponse
                # is wrapped in a Part.
                function_response_parts = [
                    Part(
                        function_response=FunctionResponse(
                            name=name,
                            response={"url": current_url},
                            parts=[
                                Part(
                                    inline_data=FunctionResponseBlob(
                                        mime_type="image/png", data=screenshot
                                    )
                                )
                            ],
                        )
                    )
                    for name, _result in execution_results
                ]
                contents.append(Content(role="user", parts=function_response_parts))
                print(f"📝 State captured. History now has {len(contents)} messages.")
        finally:
            await browser.close()
            print("\n--- Browser closed. ---")
| 216 | + |
| 217 | + |
# --- SCRIPT ENTRY POINT ---
if __name__ == "__main__":
    # Demo task given to the agent when this file is executed directly.
    demo_task = "Navigate to the Google Store and find the page of 'Pixel 10'."

    # Run the agent coroutine to completion on a fresh event loop.
    asyncio.run(agent_loop(demo_task))
0 commit comments