Skip to content

Commit 7728637

Browse files
author
katiue
committed
Merge remote-tracking branch 'upstream/main' into stream_function
2 parents 92069a5 + 3de7e34 commit 7728637

File tree

4 files changed

+349
-33
lines changed

4 files changed

+349
-33
lines changed

src/agent/custom_agent.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
import pdb
1010
import traceback
1111
from typing import Optional, Type
12+
from PIL import Image, ImageDraw, ImageFont
13+
import os
14+
import base64
15+
import io
1216

1317
from browser_use.agent.prompts import SystemPrompt
1418
from browser_use.agent.service import Agent
@@ -227,6 +231,119 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
227231
)
228232
if state:
229233
self._make_history_item(model_output, state, result)
234+
def create_history_gif(
    self,
    output_path: str = 'agent_history.gif',
    duration: int = 3000,
    show_goals: bool = True,
    show_task: bool = True,
    show_logo: bool = False,
    font_size: int = 40,
    title_font_size: int = 56,
    goal_font_size: int = 44,
    margin: int = 40,
    line_spacing: float = 1.5,
) -> None:
    """Create a GIF from the agent's history with overlaid task and goal text.

    Every history item that has a screenshot becomes one frame. When
    ``show_task`` is set and the agent has a task, an extra frame built by
    ``_create_task_frame`` is placed first.

    Args:
        output_path: File the GIF is written to.
        duration: Forwarded to Pillow's ``save`` as the per-frame duration
            (milliseconds for GIF output).
        show_goals: Overlay each step's ``current_state.thought`` on its
            frame via ``_add_overlay_to_image``.
        show_task: Prepend the task frame.
        show_logo: Try to load ``./static/browser-use.png`` and hand it to
            the frame helpers; a load failure is logged and ignored.
        font_size: Point size of the regular font.
        title_font_size: Point size of the title font.
        goal_font_size: Accepted for interface compatibility. No font of
            this size is handed to the frame helpers by this method.
        margin: Forwarded to ``_add_overlay_to_image``.
        line_spacing: Forwarded to ``_create_task_frame``.
    """
    # Local import: only needed here, and done once rather than per font.
    import platform

    history_items = self.history.history
    if not history_items:
        logger.warning('No history to create GIF from')
        return

    # The task frame is derived from the first screenshot, so without it
    # no GIF is produced at all.
    first_screenshot = history_items[0].state.screenshot
    if not first_screenshot:
        logger.warning('No history or first screenshot to create GIF from')
        return

    # Try the preferred fonts in order; the first one that loads at both
    # sizes wins. Windows needs an absolute path to the .ttf file.
    on_windows = platform.system() == "Windows"
    for font_name in ('Helvetica', 'Arial', 'DejaVuSans', 'Verdana'):
        font_path = font_name
        if on_windows:
            font_path = os.path.join(os.getenv("WIN_FONT_DIR", "C:\\Windows\\Fonts"), font_name + ".ttf")
        try:
            regular_font = ImageFont.truetype(font_path, font_size)
            title_font = ImageFont.truetype(font_path, title_font_size)
            break
        except OSError:
            continue
    else:
        # None of the preferred fonts could be loaded.
        regular_font = ImageFont.load_default()
        title_font = ImageFont.load_default()

    # Load the logo if requested (best effort).
    logo = None
    if show_logo:
        try:
            # The context manager releases the file handle; resize()
            # returns an independent in-memory image.
            with Image.open('./static/browser-use.png') as raw_logo:
                logo_height = 150  # target height in pixels, width follows the aspect ratio
                aspect_ratio = raw_logo.width / raw_logo.height
                logo_width = int(logo_height * aspect_ratio)
                logo = raw_logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS)
        except Exception as e:
            logger.warning(f'Could not load logo: {e}')

    images = []

    # Optional leading frame showing the task.
    if show_task and self.task:
        task_frame = self._create_task_frame(
            self.task,
            first_screenshot,
            title_font,
            regular_font,
            logo,
            line_spacing,
        )
        images.append(task_frame)

    # One frame per history item; step numbers start at 1 and count items
    # without a screenshot as well, so numbering matches the history.
    for step_number, item in enumerate(history_items, 1):
        if not item.state.screenshot:
            continue

        # Screenshots are stored base64-encoded.
        img_data = base64.b64decode(item.state.screenshot)
        image = Image.open(io.BytesIO(img_data))

        if show_goals and item.model_output:
            image = self._add_overlay_to_image(
                image=image,
                step_number=step_number,
                goal_text=item.model_output.current_state.thought,
                regular_font=regular_font,
                title_font=title_font,
                margin=margin,
                logo=logo,
            )

        images.append(image)

    if not images:
        logger.warning('No images found in history to create GIF')
        return

    # The first frame drives the save; the remaining frames are appended.
    images[0].save(
        output_path,
        save_all=True,
        append_images=images[1:],
        duration=duration,
        loop=0,
        optimize=False,
    )
    logger.info(f'Created GIF at {output_path}')
230347

231348
async def run(self, max_steps: int = 100) -> AgentHistoryList:
232349
"""Execute the task with maximum number of steps"""
@@ -283,3 +400,6 @@ async def run(self, max_steps: int = 100) -> AgentHistoryList:
283400

284401
if not self.injected_browser and self.browser:
285402
await self.browser.close()
403+
404+
if self.generate_gif:
405+
self.create_history_gif()

src/browser/custom_browser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,4 +123,4 @@ async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser:
123123
return browser
124124
except Exception as e:
125125
logger.error(f'Failed to initialize Playwright browser: {str(e)}')
126-
raise
126+
raise

tests/test_browser_use.py

Lines changed: 133 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# @Author : wenshao
44
# @ProjectName: browser-use-webui
55
# @FileName: test_browser_use.py
6+
import pdb
67

78
from dotenv import load_dotenv
89

@@ -28,20 +29,29 @@ async def test_browser_use_org():
2829
BrowserContextWindowSize,
2930
)
3031

32+
# llm = utils.get_llm_model(
33+
# provider="azure_openai",
34+
# model_name="gpt-4o",
35+
# temperature=0.8,
36+
# base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
37+
# api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
38+
# )
39+
3140
llm = utils.get_llm_model(
32-
provider="azure_openai",
33-
model_name="gpt-4o",
34-
temperature=0.8,
35-
base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
36-
api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
41+
provider="deepseek",
42+
model_name="deepseek-chat",
43+
temperature=0.8
3744
)
3845

3946
window_w, window_h = 1920, 1080
47+
use_vision = False
48+
chrome_path = os.getenv("CHROME_PATH", None)
4049

4150
browser = Browser(
4251
config=BrowserConfig(
4352
headless=False,
4453
disable_security=True,
54+
chrome_instance_path=chrome_path,
4555
extra_chromium_args=[f"--window-size={window_w},{window_h}"],
4656
)
4757
)
@@ -59,6 +69,7 @@ async def test_browser_use_org():
5969
task="go to google.com and type 'OpenAI' click search and give me the first url",
6070
llm=llm,
6171
browser_context=browser_context,
72+
use_vision=use_vision
6273
)
6374
history: AgentHistoryList = await agent.run(max_steps=10)
6475

@@ -208,6 +219,122 @@ async def test_browser_use_custom():
208219
await browser.close()
209220

210221

222+
async def test_browser_use_custom_v2():
    """Run the custom agent against a real browser and print what it did.

    This is a manual end-to-end check rather than an assertion-based test:
    it drives ``CustomAgent`` through a Google search task and pretty-prints
    the final result, errors, actions and thoughts. Any exception is printed
    instead of propagated so that the browser teardown always runs.
    """
    from browser_use.browser.context import BrowserContextWindowSize
    from browser_use.browser.browser import BrowserConfig

    from src.agent.custom_agent import CustomAgent
    from src.agent.custom_prompts import CustomSystemPrompt
    from src.browser.custom_browser import CustomBrowser
    from src.browser.custom_context import BrowserContextConfig
    from src.controller.custom_controller import CustomController

    window_w, window_h = 1920, 1080

    # Alternative model configurations, kept for switching providers by hand:
    #
    # llm = utils.get_llm_model(
    #     provider="azure_openai",
    #     model_name="gpt-4o",
    #     temperature=0.8,
    #     base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""),
    #     api_key=os.getenv("AZURE_OPENAI_API_KEY", ""),
    # )
    #
    # llm = utils.get_llm_model(
    #     provider="gemini",
    #     model_name="gemini-2.0-flash-exp",
    #     temperature=1.0,
    #     api_key=os.getenv("GOOGLE_API_KEY", "")
    # )
    #
    # llm = utils.get_llm_model(
    #     provider="ollama", model_name="qwen2.5:7b", temperature=0.8
    # )

    llm = utils.get_llm_model(
        provider="deepseek",
        model_name="deepseek-chat",
        temperature=0.8
    )

    controller = CustomController()
    use_own_browser = True
    disable_security = True
    use_vision = False  # Set to False when using DeepSeek
    tool_call_in_content = True  # Set to True when using Ollama
    max_actions_per_step = 1
    browser = None
    browser_context = None

    try:
        # An unset or empty CHROME_PATH both mean "no existing Chrome instance".
        chrome_path = (os.getenv("CHROME_PATH") or None) if use_own_browser else None

        browser = CustomBrowser(
            config=BrowserConfig(
                headless=False,
                disable_security=disable_security,
                chrome_instance_path=chrome_path,
                extra_chromium_args=[f"--window-size={window_w},{window_h}"],
            )
        )
        browser_context = await browser.new_context(
            config=BrowserContextConfig(
                trace_path="./tmp/traces",
                save_recording_path="./tmp/record_videos",
                no_viewport=False,
                browser_window_size=BrowserContextWindowSize(
                    width=window_w, height=window_h
                ),
            )
        )
        agent = CustomAgent(
            task="go to google.com and type 'OpenAI' click search and give me the first url",
            add_infos="",  # some hints for llm to complete the task
            llm=llm,
            browser=browser,
            browser_context=browser_context,
            controller=controller,
            system_prompt_class=CustomSystemPrompt,
            use_vision=use_vision,
            tool_call_in_content=tool_call_in_content,
            max_actions_per_step=max_actions_per_step
        )
        history: AgentHistoryList = await agent.run(max_steps=10)

        print("Final Result:")
        pprint(history.final_result(), indent=4)

        print("\nErrors:")
        pprint(history.errors(), indent=4)

        # e.g. xPaths the model clicked on
        print("\nModel Outputs:")
        pprint(history.model_actions(), indent=4)

        print("\nThoughts:")
        pprint(history.model_thoughts(), indent=4)
    except Exception:
        # Deliberately best-effort: report the failure, then fall through
        # to the teardown below.
        import traceback

        traceback.print_exc()
    finally:
        # Explicitly close the persistent context first, then the browser
        # that owns it.
        if browser_context:
            await browser_context.close()
        if browser:
            await browser.close()
336+
211337
if __name__ == "__main__":
212338
# asyncio.run(test_browser_use_org())
213-
asyncio.run(test_browser_use_custom())
339+
# asyncio.run(test_browser_use_custom())
340+
asyncio.run(test_browser_use_custom_v2())

0 commit comments

Comments
 (0)