Merge pull request #11 from richard-devbot/main

warmshao · web-flow · commit 00a0facedde6 · 2025-01-06T23:49:53.000+08:00
Upgraded Gradio UI
diff --git a/requirements.txt b/requirements.txt
@@ -2,4 +2,5 @@ browser-use
 langchain-google-genai
 pyperclip
 gradio
-langchain-ollama
+langchain-ollama
+
diff --git a/webui.py b/webui.py
@@ -36,7 +36,6 @@
 
 from src.utils import utils
 
-
 async def run_browser_agent(
         agent_type,
         llm_provider,
@@ -55,10 +54,14 @@ async def run_browser_agent(
         max_steps,
         use_vision
 ):
-    """
-    Runs the browser agent based on user configurations.
-    """
+    # Ensure the recording directory exists
+    os.makedirs(save_recording_path, exist_ok=True)
 
+    # Get the list of existing videos before the agent runs
+    existing_videos = set(glob.glob(os.path.join(save_recording_path, '*.[mM][pP]4')) + 
+                          glob.glob(os.path.join(save_recording_path, '*.[wW][eE][bB][mM]')))
+
+    # Build the LLM client, then dispatch to the selected agent
     llm = utils.get_llm_model(
         provider=llm_provider,
         model_name=llm_model_name,
@@ -67,7 +70,7 @@ async def run_browser_agent(
         api_key=llm_api_key
     )
     if agent_type == "org":
-        return await run_org_agent(
+        final_result, errors, model_actions, model_thoughts = await run_org_agent(
             llm=llm,
             headless=headless,
             disable_security=disable_security,
@@ -79,7 +82,7 @@ async def run_browser_agent(
             use_vision=use_vision
         )
     elif agent_type == "custom":
-        return await run_custom_agent(
+        final_result, errors, model_actions, model_thoughts = await run_custom_agent(
             llm=llm,
             use_own_browser=use_own_browser,
             headless=headless,
@@ -95,6 +98,16 @@ async def run_browser_agent(
     else:
         raise ValueError(f"Invalid agent type: {agent_type}")
 
+    # Get the list of videos after the agent runs
+    new_videos = set(glob.glob(os.path.join(save_recording_path, '*.[mM][pP]4')) + 
+                     glob.glob(os.path.join(save_recording_path, '*.[wW][eE][bB][mM]')))
+
+    # Find a newly created video (if several appeared, the pick is arbitrary: sets are unordered)
+    latest_video = None
+    if new_videos - existing_videos:
+        latest_video = list(new_videos - existing_videos)[0]  # Get the first new video
+
+    return final_result, errors, model_actions, model_thoughts, latest_video
 
 async def run_org_agent(
         llm,
@@ -137,7 +150,6 @@ async def run_org_agent(
     await browser.close()
     return final_result, errors, model_actions, model_thoughts
 
-
 async def run_custom_agent(
         llm,
         use_own_browser,
@@ -228,88 +240,228 @@ async def run_custom_agent(
     return final_result, errors, model_actions, model_thoughts
 
 
-def main():
-    parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent")
-    parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
-    parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
-    args = parser.parse_args()
+import argparse
+import gradio as gr
+from gradio.themes import Base, Default, Soft, Monochrome, Glass, Origin, Citrus, Ocean
+import os, glob
 
-    js_func = """
-        function refresh() {
-            const url = new URL(window.location);
+# Define the theme map globally
+theme_map = {
+    "Default": Default(),
+    "Soft": Soft(),
+    "Monochrome": Monochrome(),
+    "Glass": Glass(),
+    "Origin": Origin(),
+    "Citrus": Citrus(),
+    "Ocean": Ocean()
+}
 
-            if (url.searchParams.get('__theme') !== 'dark') {
-                url.searchParams.set('__theme', 'dark');
-                window.location.href = url.href;
-            }
-        }
-        """
+def create_ui(theme_name="Ocean"):
+    css = """
+    .gradio-container {
+        max-width: 1200px !important;
+        margin: auto !important;
+        padding-top: 20px !important;
+    }
+    .header-text {
+        text-align: center;
+        margin-bottom: 30px;
+    }
+    .theme-section {
+        margin-bottom: 20px;
+        padding: 15px;
+        border-radius: 10px;
+    }
+    """
 
-    # Gradio UI setup
-    with gr.Blocks(title="Browser Use WebUI", theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")]),
-                   js=js_func) as demo:
-        gr.Markdown("<center><h1>Browser Use WebUI</h1></center>")
-        with gr.Row():
-            agent_type = gr.Radio(["org", "custom"], label="Agent Type", value="custom")
-            max_steps = gr.Number(label="max run steps", value=100)
-            use_vision = gr.Checkbox(label="use vision", value=True)
+    js = """
+    function refresh() {
+        const url = new URL(window.location);
+        if (url.searchParams.get('__theme') !== 'dark') {
+            url.searchParams.set('__theme', 'dark');
+            window.location.href = url.href;
+        }
+    }
+    """
+    
+    with gr.Blocks(title="Browser Use WebUI", theme=theme_map[theme_name], css=css, js=js) as demo:
         with gr.Row():
-            llm_provider = gr.Dropdown(
-                ["anthropic", "openai", "gemini", "azure_openai", "deepseek", "ollama"], label="LLM Provider",
-                value="gemini"
+            gr.Markdown(
+                """
+                # 🌐 Browser Use WebUI
+                ### Control your browser with AI assistance
+                """,
+                elem_classes=["header-text"]
             )
-            llm_model_name = gr.Textbox(label="LLM Model Name", value="gemini-2.0-flash-exp")
-            llm_temperature = gr.Number(label="LLM Temperature", value=1.0)
-        with gr.Row():
-            llm_base_url = gr.Textbox(label="LLM Base URL")
-            llm_api_key = gr.Textbox(label="LLM API Key", type="password")
-
-        with gr.Accordion("Browser Settings", open=False):
-            use_own_browser = gr.Checkbox(label="Use Own Browser", value=False)
-            headless = gr.Checkbox(label="Headless", value=False)
-            disable_security = gr.Checkbox(label="Disable Security", value=True)
-            with gr.Row():
-                window_w = gr.Number(label="Window Width", value=1920)
-                window_h = gr.Number(label="Window Height", value=1080)
-            save_recording_path = gr.Textbox(label="Save Recording Path", placeholder="e.g. ./tmp/record_videos",
-                                             value="./tmp/record_videos")
-        with gr.Accordion("Task Settings", open=True):
-            task = gr.Textbox(label="Task", lines=10,
-                              value="go to google.com and type 'OpenAI' click search and give me the first url")
-            add_infos = gr.Textbox(label="Additional Infos(Optional): Hints to help LLM complete Task", lines=5)
-
-        run_button = gr.Button("Run Agent", variant="primary")
-        with gr.Column():
-            final_result_output = gr.Textbox(label="Final Result", lines=5)
-            errors_output = gr.Textbox(label="Errors", lines=5, )
-            model_actions_output = gr.Textbox(label="Model Actions", lines=5)
-            model_thoughts_output = gr.Textbox(label="Model Thoughts", lines=5)
+        
+        with gr.Tabs() as tabs:
+            with gr.TabItem("🤖 Agent Settings", id=1):
+                with gr.Group():
+                    agent_type = gr.Radio(
+                        ["org", "custom"],
+                        label="Agent Type",
+                        value="custom",
+                        info="Select the type of agent to use"
+                    )
+                    max_steps = gr.Slider(
+                        minimum=1,
+                        maximum=200,
+                        value=100,
+                        step=1,
+                        label="Max Run Steps",
+                        info="Maximum number of steps the agent will take"
+                    )
+                    use_vision = gr.Checkbox(
+                        label="Use Vision",
+                        value=True,
+                        info="Enable visual processing capabilities"
+                    )
+
+            with gr.TabItem("🔧 LLM Configuration", id=2):
+                with gr.Group():
+                    llm_provider = gr.Dropdown(
+                        ["anthropic", "openai", "gemini", "azure_openai", "deepseek", "ollama"],
+                        label="LLM Provider",
+                        value="gemini",
+                        info="Select your preferred language model provider"
+                    )
+                    llm_model_name = gr.Textbox(
+                        label="Model Name",
+                        value="gemini-2.0-flash-exp",
+                        info="Specify the model to use"
+                    )
+                    llm_temperature = gr.Slider(
+                        minimum=0.0,
+                        maximum=2.0,
+                        value=1.0,
+                        step=0.1,
+                        label="Temperature",
+                        info="Controls randomness in model outputs"
+                    )
+                    with gr.Row():
+                        llm_base_url = gr.Textbox(
+                            label="Base URL",
+                            info="API endpoint URL (if required)"
+                        )
+                        llm_api_key = gr.Textbox(
+                            label="API Key",
+                            type="password",
+                            info="Your API key"
+                        )
+
+            with gr.TabItem("🌐 Browser Settings", id=3):
+                with gr.Group():
+                    with gr.Row():
+                        use_own_browser = gr.Checkbox(
+                            label="Use Own Browser",
+                            value=False,
+                            info="Use your existing browser instance"
+                        )
+                        headless = gr.Checkbox(
+                            label="Headless Mode",
+                            value=False,
+                            info="Run browser without GUI"
+                        )
+                        disable_security = gr.Checkbox(
+                            label="Disable Security",
+                            value=True,
+                            info="Disable browser security features"
+                        )
+                    
+                    with gr.Row():
+                        window_w = gr.Number(
+                            label="Window Width",
+                            value=1920,
+                            info="Browser window width"
+                        )
+                        window_h = gr.Number(
+                            label="Window Height",
+                            value=1080,
+                            info="Browser window height"
+                        )
+                    
+                    save_recording_path = gr.Textbox(
+                        label="Recording Path",
+                        placeholder="e.g. ./tmp/record_videos",
+                        value="./tmp/record_videos",
+                        info="Path to save browser recordings"
+                    )
+
+            with gr.TabItem("📝 Task Settings", id=4):
+                task = gr.Textbox(
+                    label="Task Description",
+                    lines=4,
+                    placeholder="Enter your task here...",
+                    value="go to google.com and type 'OpenAI' click search and give me the first url",
+                    info="Describe what you want the agent to do"
+                )
+                add_infos = gr.Textbox(
+                    label="Additional Information",
+                    lines=3,
+                    placeholder="Add any helpful context or instructions...",
+                    info="Optional hints to help the LLM complete the task"
+                )
+
+                with gr.Row():
+                    run_button = gr.Button("▶️ Run Agent", variant="primary", scale=2)
+                    stop_button = gr.Button("⏹️ Stop", variant="stop", scale=1)
+
+            with gr.TabItem("🎬 Recordings", id=5):
+                recording_display = gr.Video(label="Latest Recording")
+
+                with gr.Group():
+                    gr.Markdown("### Results")
+                    with gr.Row():
+                        with gr.Column():
+                            final_result_output = gr.Textbox(
+                                label="Final Result",
+                                lines=3,
+                                show_label=True
+                            )
+                        with gr.Column():
+                            errors_output = gr.Textbox(
+                                label="Errors",
+                                lines=3,
+                                show_label=True
+                            )
+                    with gr.Row():
+                        with gr.Column():
+                            model_actions_output = gr.Textbox(
+                                label="Model Actions",
+                                lines=3,
+                                show_label=True
+                            )
+                        with gr.Column():
+                            model_thoughts_output = gr.Textbox(
+                                label="Model Thoughts",
+                                lines=3,
+                                show_label=True
+                            )
 
+        # Run button click handler
         run_button.click(
             fn=run_browser_agent,
             inputs=[
-                agent_type,
-                llm_provider,
-                llm_model_name,
-                llm_temperature,
-                llm_base_url,
-                llm_api_key,
-                use_own_browser,
-                headless,
-                disable_security,
-                window_w,
-                window_h,
-                save_recording_path,
-                task,
-                add_infos,
-                max_steps,
-                use_vision
+                agent_type, llm_provider, llm_model_name, llm_temperature,
+                llm_base_url, llm_api_key, use_own_browser, headless,
+                disable_security, window_w, window_h, save_recording_path,
+                task, add_infos, max_steps, use_vision
             ],
-            outputs=[final_result_output, errors_output, model_actions_output, model_thoughts_output],
+            outputs=[final_result_output, errors_output, model_actions_output, model_thoughts_output, recording_display]
         )
 
-    demo.launch(server_name=args.ip, server_port=args.port)
+    return demo
+
+def main():
+    parser = argparse.ArgumentParser(description="Gradio UI for Browser Agent")
+    parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to bind to")
+    parser.add_argument("--port", type=int, default=7788, help="Port to listen on")
+    parser.add_argument("--theme", type=str, default="Ocean", choices=theme_map.keys(), help="Theme to use for the UI")
+    parser.add_argument("--dark-mode", action="store_true", help="Enable dark mode")
+    args = parser.parse_args()
 
+    demo = create_ui(theme_name=args.theme)
+    demo.launch(server_name=args.ip, server_port=args.port)
 
 if __name__ == '__main__':
     main()