updated prompts to use display.info() and specify display when using display.view/screenshot

Amazingct · Amazingct · commit 459d77da4d5d · 2024-04-01T21:38:16.000+01:00
diff --git a/interpreter/core/computer/display/display.py b/interpreter/core/computer/display/display.py
@@ -61,18 +61,25 @@ def center(self):
         """
         return self.width // 2, self.height // 2
 
-    def view(self, show=True, quadrant=None, all_screens=True, combine_screens=True
+    def info(self):
+        """
+        Returns a list of all connected monitors/displays and their information
+        """
+        return get_displays()
+    
+    
+    def view(self, show=True, quadrant=None, screen=0, combine_screens=True
     ):
         """
         Redirects to self.screenshot
         """
-        return self.screenshot(all_screens=all_screens, show=show, quadrant=quadrant, combine_screens=combine_screens)
+        return self.screenshot(screen=screen, show=show, quadrant=quadrant, combine_screens=combine_screens)
 
     # def get_active_window(self):
     #     return get_active_window()
 
     def screenshot(
-        self, all_screens=True, show=True, quadrant=None, active_app_only=False, force_image=False,combine_screens=True
+        self, screen=0, show=True, quadrant=None, active_app_only=False, force_image=False,combine_screens=True
     ):
         """
         Shows you what's on the screen by taking a screenshot of the entire screen or a specified quadrant. Returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
@@ -99,7 +106,7 @@ def screenshot(
                 region = self.get_active_window()["region"]
                 screenshot = pyautogui.screenshot(region=region)
             else:
-                screenshot = take_screenshot_to_pil(all_screens=all_screens, combine_screens=combine_screens) #  this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
+                screenshot = take_screenshot_to_pil(screen=screen, combine_screens=combine_screens) #  this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
                 # message = format_to_recipient("Taking a screenshot of the entire screen. This is not recommended. You (the language model assistant) will recieve it with low resolution.\n\nTo maximize performance, use computer.display.view(active_app_only=True). This will produce an ultra high quality image of the active application.", "assistant")
                 # print(message)
 
@@ -273,10 +280,11 @@ def get_text_as_list_of_lists(self, screenshot=None):
             )
 
 
-def take_screenshot_to_pil(all_screens=False, combine_screens=True):
-    if all_screens:
-        # Get information about all screens
-        monitors = get_monitors()
+def take_screenshot_to_pil(screen=0, combine_screens=True):
+    # Get information about all screens
+    monitors = get_monitors()
+    if screen == -1: # All screens
+        
         # Take a screenshot of each screen and save them in a list
         screenshots = [pyautogui.screenshot(region=(monitor.x, monitor.y, monitor.width, monitor.height)) for monitor in monitors]
 
@@ -334,6 +342,16 @@ def take_screenshot_to_pil(all_screens=False, combine_screens=True):
             return new_img
         else:
             return screenshots
+    elif screen > 0:
+        # Take a screenshot of the selected screen
+        return pyautogui.screenshot(region=(monitors[screen].x, monitors[screen].y, monitors[screen].width, monitors[screen].height))
+        
     else:
         # Take a screenshot of the primary screen
-        return pyautogui.screenshot()
+        return pyautogui.screenshot(region=(monitors[screen].x, monitors[screen].y, monitors[screen].width, monitors[screen].height))
+
+
+def get_displays():
+    monitors = get_monitors()
+    return monitors
+    
diff --git a/interpreter/core/respond.py b/interpreter/core/respond.py
@@ -186,12 +186,12 @@ def respond(interpreter):
                     )
                     code = re.sub(r"import computer\.\w+\n", "pass\n", code)
                     # If it does this it sees the screenshot twice (which is expected jupyter behavior)
-                    if code.split("\n")[-1] in [
-                        "computer.display.view()",
-                        "computer.display.screenshot()",
-                        "computer.view()",
-                        "computer.screenshot()",
-                    ]:
+                    if any(code.split("\n")[-1].startswith(text) for text in [
+                        "computer.display.view",
+                        "computer.display.screenshot",
+                        "computer.view",
+                        "computer.screenshot",
+                    ]):
                         code = code + "\npass"
 
                 # sync up some things (is this how we want to do this?)
diff --git a/interpreter/terminal_interface/profiles/defaults/01.py b/interpreter/terminal_interface/profiles/defaults/01.py
@@ -90,7 +90,8 @@
 You may use the `computer` module to control the user's keyboard and mouse, if the task **requires** it:
 
 ```python
-computer.display.view() # Shows you what's on the screen, returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
+computer.display.info() # Returns a list of connected monitors/Displays and their info (x and y cordinates, width, height, width_mm, height_mm, name). Use this to verify the monitors connected before using computer.display.view() when neccessary
+computer.display.view() # Shows you what's on the screen (primary display by default), returns a `pil_image` `in case you need it (rarely). To get a specific display, use the parameter screen=DISPLAY_NUMBER (0 for primary monitor 1 and above for secondary monitors). **You almost always want to do this first!**
 computer.keyboard.hotkey(" ", "command") # Opens spotlight
 computer.keyboard.write("hello")
 computer.mouse.click("text onscreen") # This clicks on the UI element with that text. Use this **frequently** and get creative! To click a video, you could pass the *timestamp* (which is usually written on the thumbnail) into this.
diff --git a/interpreter/terminal_interface/profiles/defaults/os.py b/interpreter/terminal_interface/profiles/defaults/os.py
@@ -35,7 +35,8 @@
 ```python
 computer.browser.search(query) # Silently searches Google for the query, returns result. The user's browser is unaffected. (does not open a browser!)
 
-computer.display.view() # Shows you what's on the screen, returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
+computer.display.info() # Returns a list of connected monitors/Displays and their info (x and y cordinates, width, height, width_mm, height_mm, name). Use this to verify the monitors connected before using computer.display.view() when neccessary
+computer.display.view() # Shows you what's on the screen (primary display by default), returns a `pil_image` `in case you need it (rarely). To get a specific display, use the parameter screen=DISPLAY_NUMBER (0 for primary monitor 1 and above for secondary monitors). **You almost always want to do this first!**
 
 computer.keyboard.hotkey(" ", "command") # Opens spotlight (very useful)
 computer.keyboard.write("hello")
diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py
@@ -368,12 +368,12 @@ def terminal_interface(interpreter, message):
                                 # (unless we figure out how to do this AFTER taking the screenshot)
                                 # otherwise it will try to click this notification!
 
-                                if action in [
-                                    "computer.screenshot()",
-                                    "computer.display.screenshot()",
-                                    "computer.display.view()",
-                                    "computer.view()",
-                                ]:
+                                if any(action.startswith(text) for text in [
+                                    "computer.screenshot",
+                                    "computer.display.screenshot",
+                                    "computer.display.view",
+                                    "computer.view"
+                                ]):
                                     description = "Viewing screen..."
                                 elif action == "computer.mouse.click()":
                                     description = "Clicking..."
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml