Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 32 additions & 17 deletions src/webui/components/browser_use_agent_tab.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,18 +964,12 @@ async def handle_clear(webui_manager: WebuiManager):
interactive=True
),
}


# --- Tab Creation Function ---


def create_browser_use_agent_tab(webui_manager: WebuiManager):
def create_browser_use_agent_tab(webui_manager: WebuiManager, speech_js: str):
"""
Create the run agent tab, defining UI, state, and handlers.
"""
webui_manager.init_browser_use_agent()

# --- Define UI Components ---
# --- 2. Define UI Components ---
tab_components = {}
with gr.Column():
chatbot = gr.Chatbot(
Expand All @@ -986,13 +980,24 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager):
height=600,
show_copy_button=True,
)
user_input = gr.Textbox(
label="Your Task or Response",
placeholder="Enter your task here or provide assistance when asked.",
lines=3,
interactive=True,
elem_id="user_input",
)

# --- NEW: Place button and textbox together ---
with gr.Row():
user_input = gr.Textbox(
label="Your Task or Response",
placeholder="Enter your task, or Speak.",
lines=3,
interactive=True,
elem_id="user_input", # Crucial ID for the JS
scale=5 # Make textbox bigger
)
# --- NEW: This is the button ---
speech_to_text_button = gr.Button(
"🎙️",
elem_id="speech_btn", # Crucial ID for the JS
scale=1
)

with gr.Row():
stop_button = gr.Button(
"⏹️ Stop", interactive=False, variant="stop", scale=2
Expand Down Expand Up @@ -1021,11 +1026,12 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager):
type="filepath",
)

# --- Store Components in Manager ---
# --- 3. NEW: Store Components in Manager (add the new button) ---
tab_components.update(
dict(
chatbot=chatbot,
user_input=user_input,
speech_to_text_button=speech_to_text_button, # <-- ADDED THIS
clear_button=clear_button,
run_button=run_button,
stop_button=stop_button,
Expand All @@ -1044,8 +1050,17 @@ def create_browser_use_agent_tab(webui_manager: WebuiManager):
) # Get all components known to manager
run_tab_outputs = list(tab_components.values())

# --- 4. NEW: Connect the Speech Button to the JavaScript ---
speech_to_text_button.click(
fn=None, # We don't run any Python code
inputs=None,
outputs=None,
js=speech_js # We run this JavaScript code instead
)

# --- Your existing wrapper functions (UNCHANGED) ---
async def submit_wrapper(
components_dict: Dict[Component, Any],
components_dict: Dict[Component, Any],
) -> AsyncGenerator[Dict[Component, Any], None]:
"""Wrapper for handle_submit that yields its results."""
async for update in handle_submit(webui_manager, components_dict):
Expand Down
71 changes: 70 additions & 1 deletion src/webui/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,76 @@
"Base": gr.themes.Base()
}

# JavaScript passed to Gradio's ``Button.click(js=...)``. It runs entirely in
# the browser: captures a single utterance via the Web Speech API and writes
# the transcript into the task textbox, then fires an 'input' event so
# Gradio's backend state sees the new value.
js_speech_function = """
() => {
    // Locate the UI elements. Depending on the Gradio version, elem_id may
    // land on a wrapper element or on the control itself — try both.
    const btn = document.querySelector("#speech_btn button") ||
                document.querySelector("button#speech_btn");

    const textarea = document.querySelector("#user_input textarea") ||
                     document.querySelector("textarea#user_input");

    if (!textarea || !btn) {
        alert("Error: Could not find UI elements for speech recognition.");
        return;
    }

    // The Web Speech API is vendor-prefixed in Chromium-based browsers.
    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!SpeechRecognition) {
        alert("Your browser does not support the Web Speech API. Try Chrome or Edge.");
        return;
    }

    const recognition = new SpeechRecognition();
    recognition.interimResults = false;
    recognition.lang = 'en-US'; // Change for other locales (e.g., 'es-ES')

    // Remember the textbox's own placeholder so onend can restore it
    // exactly. (Previously a hard-coded string was restored here, which
    // differed from the placeholder defined on the Python side, so the
    // placeholder silently changed after the first use.)
    const idlePlaceholder = textarea.placeholder;

    // Reflect the "listening" state in the UI while recognition runs.
    recognition.onstart = () => {
        btn.textContent = "🎙️ Listening...";
        btn.disabled = true;
        textarea.placeholder = "Listening...";
    };

    recognition.onend = () => {
        btn.textContent = "🎙️";
        btn.disabled = false;
        textarea.placeholder = idlePlaceholder;
    };

    recognition.onerror = (event) => {
        btn.textContent = "🎙️";
        btn.disabled = false;
        textarea.placeholder = "Error: " + event.error;
        console.error("Speech recognition error:", event.error);
    };

    // Copy the transcript into the textbox.
    recognition.onresult = (event) => {
        const transcript = event.results[0][0].transcript;
        textarea.value = transcript; // Set the visual value

        // Dispatch a synthetic 'input' event so Gradio's frontend state
        // (and therefore the Python components dictionary) picks up the
        // programmatic change.
        const inputEvent = new Event('input', { bubbles: true });
        textarea.dispatchEvent(inputEvent);
    };

    recognition.start();
}
"""
def create_ui(theme_name="Ocean"):
css = """
.gradio-container {
Expand Down Expand Up @@ -76,7 +145,7 @@ def create_ui(theme_name="Ocean"):
create_browser_settings_tab(ui_manager)

with gr.TabItem("🤖 Run Agent"):
create_browser_use_agent_tab(ui_manager)
create_browser_use_agent_tab(ui_manager, js_speech_function)

with gr.TabItem("🎁 Agent Marketplace"):
gr.Markdown(
Expand Down