Better skills, better web browser

KillianLucas · KillianLucas · commit 8c5e504534e9 · 2024-09-04T23:19:41.000-07:00
diff --git a/interpreter/core/async_core.py b/interpreter/core/async_core.py
@@ -16,6 +16,8 @@
 
 from .core import OpenInterpreter
 
+last_start_time = 0
+
 try:
     import janus
     import uvicorn
@@ -763,10 +765,9 @@ async def openai_compatible_generator():
 
     @router.post("/openai/chat/completions")
     async def chat_completion(request: ChatCompletionRequest):
-        # Convert to LMC
-
-        async_interpreter.stop_event.set()
+        global last_start_time
 
+        # Convert to LMC
         last_message = request.messages[-1]
 
         if last_message.role != "user":
@@ -776,11 +777,11 @@ async def chat_completion(request: ChatCompletionRequest):
             # Handle special STOP token
             return
 
-        if last_message.content == "{CONTEXT_MODE_ON}":
+        if last_message.content in ["{CONTEXT_MODE_ON}", "{REQUIRE_START_ON}"]:
             async_interpreter.context_mode = True
             return
 
-        if last_message.content == "{CONTEXT_MODE_OFF}":
+        if last_message.content in ["{CONTEXT_MODE_OFF}", "{REQUIRE_START_OFF}"]:
             async_interpreter.context_mode = False
             return
 
@@ -826,12 +827,30 @@ async def chat_completion(request: ChatCompletionRequest):
         if async_interpreter.context_mode:
             # In context mode, we only respond if we recieved a {START} message
             # Otherwise, we're just accumulating context
-            if last_message.content != "{START}":
-                return
-            if async_interpreter.messages[-1]["content"] == "{START}":
+            if last_message.content == "{START}":
+                if async_interpreter.messages[-1]["content"] == "{START}":
+                    # Remove that {START} message that would have just been added
+                    async_interpreter.messages = async_interpreter.messages[:-1]
+                last_start_time = time.time()
+            else:
+                # Check if we're within 6 seconds of last_start_time
+                current_time = time.time()
+                if current_time - last_start_time <= 6:
+                    # Continue processing
+                    pass
+                else:
+                    # More than 6 seconds have passed, so return
+                    return
+
+        else:
+            if last_message.content == "{START}":
+                # This just sometimes happens I guess
                 # Remove that {START} message that would have just been added
                 async_interpreter.messages = async_interpreter.messages[:-1]
+                return
 
+        async_interpreter.stop_event.set()
+        time.sleep(0.1)
         async_interpreter.stop_event.clear()
 
         if request.stream:
diff --git a/interpreter/core/computer/ai/ai.py b/interpreter/core/computer/ai/ai.py
@@ -117,7 +117,7 @@ class Ai:
     def __init__(self, computer):
         self.computer = computer
 
-    def chat(self, text):
+    def chat(self, text, base64=None):
         messages = [
             {
                 "role": "system",
@@ -126,6 +126,10 @@ def chat(self, text):
             },
             {"role": "user", "type": "message", "content": text},
         ]
+        if base64:
+            messages.append(
+                {"role": "user", "type": "image", "format": "base64", "content": base64}
+            )
         response = ""
         for chunk in self.computer.interpreter.llm.run(messages):
             if "content" in chunk:
diff --git a/interpreter/core/computer/browser/browser.py b/interpreter/core/computer/browser/browser.py
@@ -1,9 +1,28 @@
+import time
+
+import html2text
 import requests
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from webdriver_manager.chrome import ChromeDriverManager
 
 
 class Browser:
     def __init__(self, computer):
         self.computer = computer
+        # No WebDriver yet; the `driver` property creates one on first access.
+        self._driver = None
+
+    @property
+    def driver(self):
+        """
+        The stored WebDriver, created on first access.
+
+        If no driver is stored yet, self.setup() is called to create one
+        before it is returned.
+        """
+        if self._driver is None:
+            self.setup()
+        return self._driver
+
+    @driver.setter
+    def driver(self, value):
+        # Replace the stored driver with `value`.
+        self._driver = value
 
     def search(self, query):
         """
@@ -14,3 +33,89 @@ def search(self, query):
             params={"query": query},
         )
         return response.json()["result"]
+
+    def setup(self):
+        """
+        Create the Chrome WebDriver and store it on this instance.
+
+        The chromedriver path comes from ChromeDriverManager().install();
+        a webdriver.Chrome instance is then created with default ChromeOptions.
+        The Service and options objects are kept on self as well.
+        """
+        self.service = Service(ChromeDriverManager().install())
+        self.options = webdriver.ChromeOptions()
+        self._driver = webdriver.Chrome(service=self.service, options=self.options)
+
+    def go_to_url(self, url):
+        """
+        Navigate to a URL.
+
+        Sleeps for a fixed 3 seconds after driver.get(url) returns.
+        """
+        self.driver.get(url)
+        # Fixed pause; presumably gives dynamic content time to load.
+        time.sleep(3)
+
+    def search_google(self, query):
+        """
+        Run a web search for `query`.
+
+        NOTE(review): despite the name, this opens https://www.perplexity.ai
+        rather than Google. It sends COMMAND+k to the page body, types the
+        query into whichever element is active afterwards, presses RETURN,
+        and then sleeps for 5 seconds.
+        """
+        self.driver.get("https://www.perplexity.ai")
+        # Disabled earlier approach: type the query into the input named 'q'.
+        # search_box = self.driver.find_element(By.NAME, 'q')
+        # search_box.send_keys(query)
+        # search_box.send_keys(Keys.RETURN)
+        body = self.driver.find_element(By.TAG_NAME, "body")
+        # presumably COMMAND+k focuses the site's search box; Keys.COMMAND is
+        # likely macOS-specific. TODO confirm behaviour on other platforms.
+        body.send_keys(Keys.COMMAND + "k")
+        time.sleep(0.5)
+        active_element = self.driver.switch_to.active_element
+        active_element.send_keys(query)
+        active_element.send_keys(Keys.RETURN)
+        # Fixed pause; presumably waits for the results to render.
+        time.sleep(5)
+
+    def analyze_page(self, intent):
+        """
+        Summarize the current page for `intent` using the LLM.
+
+        Converts the page source to text with html2text, collects every
+        <a>, <button>, <input> and <select> element, and asks the model to
+        either answer the intent from the page or list the most relevant
+        interactive elements. The reply is printed, followed by a fixed
+        instruction line; the method itself returns None.
+
+        The interpreter's LLM model is switched to "gpt-4o-mini" for this
+        one call and is always restored afterwards, even if the call raises.
+        """
+        html_content = self.driver.page_source
+        text_content = html2text.html2text(html_content)
+
+        elements = (
+            self.driver.find_elements(By.TAG_NAME, "a")
+            + self.driver.find_elements(By.TAG_NAME, "button")
+            + self.driver.find_elements(By.TAG_NAME, "input")
+            + self.driver.find_elements(By.TAG_NAME, "select")
+        )
+
+        # The list index is the element ID shown to the model in the prompt.
+        elements_info = [
+            {
+                "id": idx,
+                "text": elem.text,
+                "attributes": elem.get_attribute("outerHTML"),
+            }
+            for idx, elem in enumerate(elements)
+        ]
+
+        ai_query = f"""
+        Below is the content of the current webpage along with interactive elements. 
+        Given the intent "{intent}", please extract useful information and provide sufficient details 
+        about interactive elements, focusing especially on those pertinent to the provided intent.
+        
+        If the information requested by the intent "{intent}" is present on the page, simply return that.
+
+        If not, return the top 10 most relevant interactive elements in a concise, actionable format, listing them on separate lines
+        with their ID, a description, and their possible action.
+
+        Do not hallucinate.
+
+        Page Content:
+        {text_content}
+        
+        Interactive Elements:
+        {elements_info}
+        """
+
+        old_model = self.computer.interpreter.llm.model
+        self.computer.interpreter.llm.model = "gpt-4o-mini"
+        try:
+            response = self.computer.ai.chat(ai_query)
+        finally:
+            # Restore the caller's model even when the chat call fails, so an
+            # error here cannot leave the interpreter stuck on the small model.
+            self.computer.interpreter.llm.model = old_model
+
+        print(response)
+        print(
+            "Please now utilize this information or interact with the interactive elements provided to answer the user's query."
+        )
+
+    def quit(self):
+        """
+        Close the browser if one is running.
+
+        Does nothing when no driver has been created. The stored driver is
+        cleared afterwards, so the next access to `driver` starts a fresh
+        browser instead of reusing a closed session.
+        """
+        # Read _driver directly: going through the `driver` property would
+        # launch a browser just to close it again.
+        if self._driver is None:
+            return
+        try:
+            self._driver.quit()
+        finally:
+            self._driver = None
diff --git a/interpreter/core/computer/display/display.py b/interpreter/core/computer/display/display.py
@@ -74,12 +74,23 @@ def info(self):
         """
         return get_displays()
 
-    def view(self, show=True, quadrant=None, screen=0, combine_screens=True):
+    def view(
+        self,
+        show=True,
+        quadrant=None,
+        screen=0,
+        combine_screens=True,
+        active_app_only=True,
+    ):
         """
         Redirects to self.screenshot
+
+        Every argument is forwarded unchanged to self.screenshot, and its
+        return value is returned as-is.
         """
         return self.screenshot(
-            screen=screen, show=show, quadrant=quadrant, combine_screens=combine_screens
+            screen=screen,
+            show=show,
+            quadrant=quadrant,
+            combine_screens=combine_screens,
+            active_app_only=active_app_only,
         )
 
     # def get_active_window(self):
@@ -149,7 +160,7 @@ def screenshot(
                     screen=screen, combine_screens=combine_screens
                 )  #  this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
                 message = format_to_recipient(
-                    "Taking a screenshot of the entire screen.\n\nTo focus on the active app, use computer.view(active_app_only=True).",
+                    "Taking a screenshot of the entire screen.\n\nTo focus on the active app, use computer.display.view(active_app_only=True).",
                     "assistant",
                 )
                 print(message)
diff --git a/interpreter/core/computer/keyboard/keyboard.py b/interpreter/core/computer/keyboard/keyboard.py
@@ -68,6 +68,14 @@ def press(self, *args, presses=1, interval=0.1):
         pyautogui.press(keys, presses=presses, interval=interval)
         time.sleep(0.15)
 
+    def press_and_release(self, *args, presses=1, interval=0.1):
+        """
+        Press and release a key or a sequence of keys.
+
+        Alias for `press`: all arguments are forwarded unchanged and the
+        result of `press` is returned.
+        """
+        return self.press(*args, presses=presses, interval=interval)
+
     def hotkey(self, *args, interval=0.1):
         """
         Press a sequence of keys in the order they are provided, and then release them in reverse order.
diff --git a/interpreter/core/computer/skills/skills.py b/interpreter/core/computer/skills/skills.py
@@ -40,7 +40,14 @@ def run(self, skill):
         )
 
     def search(self, query):
-        return aifs.search(query, self.path, python_docstrings_only=True)
+        """
+        List every skill in the skills directory as a call string.
+
+        This just lists all for now: `query` is accepted for interface
+        compatibility but is not used to filter the results.
+
+        Returns a list such as ["my_skill()", ...], one entry per ".py" file
+        found directly in self.path.
+        """
+        suffix = ".py"
+        # Strip only the trailing extension; str.replace would also rewrite
+        # any ".py" that appears earlier in the file name.
+        return [
+            file[: -len(suffix)] + "()"
+            for file in os.listdir(self.path)
+            if file.endswith(suffix)
+        ]
 
     def import_skills(self):
         previous_save_skills_setting = self.computer.save_skills
diff --git a/interpreter/core/computer/terminal/languages/jupyter_language.py b/interpreter/core/computer/terminal/languages/jupyter_language.py
@@ -92,21 +92,21 @@ def run(self, code):
         ### OFFICIAL OPEN INTERPRETER GOVERNMENT ISSUE SKILL LIBRARY ###
         ################################################################
 
-        try:
-            functions = string_to_python(code)
-        except:
-            # Non blocking
-            functions = {}
+        # try:
+        #     functions = string_to_python(code)
+        # except:
+        #     # Non blocking
+        #     functions = {}
 
-        if self.computer.save_skills and functions:
-            skill_library_path = self.computer.skills.path
+        # if self.computer.save_skills and functions:
+        #     skill_library_path = self.computer.skills.path
 
-            if not os.path.exists(skill_library_path):
-                os.makedirs(skill_library_path)
+        #     if not os.path.exists(skill_library_path):
+        #         os.makedirs(skill_library_path)
 
-            for filename, function_code in functions.items():
-                with open(f"{skill_library_path}/{filename}.py", "w") as file:
-                    file.write(function_code)
+        #     for filename, function_code in functions.items():
+        #         with open(f"{skill_library_path}/{filename}.py", "w") as file:
+        #             file.write(function_code)
 
         self.finish_flag = False
         try:
diff --git a/interpreter/core/respond.py b/interpreter/core/respond.py
@@ -308,12 +308,14 @@ def respond(interpreter):
                     code = re.sub(r"import computer\.\w+\n", "pass\n", code)
                     # If it does this it sees the screenshot twice (which is expected jupyter behavior)
                     if any(
-                        code.strip().split("\n")[-1].startswith(text)
-                        for text in [
-                            "computer.display.view",
-                            "computer.display.screenshot",
-                            "computer.view",
-                            "computer.screenshot",
+                        [
+                            code.strip().split("\n")[-1].startswith(text)
+                            for text in [
+                                "computer.display.view",
+                                "computer.display.screenshot",
+                                "computer.view",
+                                "computer.screenshot",
+                            ]
                         ]
                     ):
                         code = code + "\npass"
diff --git a/interpreter/terminal_interface/profiles/defaults/the01.py b/interpreter/terminal_interface/profiles/defaults/the01.py
diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml