Merge pull request #1161 from Amazingct/collage-screen

KillianLucas · web-flow · commit 9dc20c2bc6cf · 2024-04-09T20:07:14.000-07:00
Multiple display support
diff --git a/interpreter/core/computer/display/display.py b/interpreter/core/computer/display/display.py
@@ -6,11 +6,15 @@
 import warnings
 from contextlib import redirect_stdout
 from io import BytesIO
-
+import io
+import subprocess
+from PIL import Image
 import requests
-
 from ...utils.lazy_import import lazy_import
 from ..utils.recipient_utils import format_to_recipient
+import cv2
+from screeninfo import get_monitors # for getting info about connected monitors
+
 
 # Still experimenting with this
 # from utils.get_active_window import get_active_window
@@ -20,6 +24,7 @@
 np = lazy_import("numpy")
 plt = lazy_import("matplotlib.pyplot")
 
+
 from ..utils.computer_vision import find_text_in_image, pytesseract_get_text
 
 
@@ -56,20 +61,30 @@ def center(self):
         """
         return self.width // 2, self.height // 2
 
-    def view(self, show=True, quadrant=None):
+    def info(self):
+        """
+        Returns a list of all connected monitors/displays and their information
+        """
+        return get_displays()
+    
+    
+    def view(self, show=True, quadrant=None, screen=0, combine_screens=True
+    ):
         """
         Redirects to self.screenshot
         """
-        return self.screenshot(show, quadrant)
+        return self.screenshot(screen=screen, show=show, quadrant=quadrant, combine_screens=combine_screens)
 
     # def get_active_window(self):
     #     return get_active_window()
 
     def screenshot(
-        self, show=True, quadrant=None, active_app_only=False, force_image=False
+        self, screen=0, show=True, quadrant=None, active_app_only=False, force_image=False,combine_screens=True
     ):
         """
         Shows you what's on the screen by taking a screenshot of the entire screen or a specified quadrant. Returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
+        :param screen: specify which display; 0 for primary, 1 and above for secondary, -1 for all displays.
+        :param combine_screens: Only used when screen is -1. If True, a collage of all display screens will be returned. Otherwise, a list of display screens will be returned.
         """
         if not self.computer.emit_images and force_image == False:
             text = self.get_text_as_list_of_lists()
@@ -91,10 +106,7 @@ def screenshot(
                 region = self.get_active_window()["region"]
                 screenshot = pyautogui.screenshot(region=region)
             else:
-                if platform.system() == "Darwin":
-                    screenshot = take_screenshot_to_pil()
-                else:
-                    screenshot = pyautogui.screenshot()
+                screenshot = take_screenshot_to_pil(screen=screen, combine_screens=combine_screens) #  this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
                 # message = format_to_recipient("Taking a screenshot of the entire screen. This is not recommended. You (the language model assistant) will recieve it with low resolution.\n\nTo maximize performance, use computer.display.view(active_app_only=True). This will produce an ultra high quality image of the active application.", "assistant")
                 # print(message)
 
@@ -121,18 +133,26 @@ def screenshot(
 
         # Open the image file with PIL
         # IPython interactive mode auto-displays plots, causing RGBA handling issues, possibly MacOS-specific.
-        screenshot = screenshot.convert("RGB")
+        if isinstance(screenshot, list):
+            screenshot = [img.convert("RGB") for img in screenshot] # if screenshot is a list (i.e combine_screens=False).
+        else:
+            screenshot = screenshot.convert("RGB")
 
         if show:
             # Show the image using matplotlib
-            plt.imshow(np.array(screenshot))
+            if isinstance(screenshot, list):
+                for img in screenshot:
+                    plt.imshow(np.array(img))
+                    plt.show()
+            else:
+                plt.imshow(np.array(screenshot))
 
             with warnings.catch_warnings():
                 # It displays an annoying message about Agg not being able to display something or WHATEVER
                 warnings.simplefilter("ignore")
                 plt.show()
 
-        return screenshot
+        return screenshot # this will be a list if combine_screens == False
 
     def find(self, description, screenshot=None):
         if description.startswith('"') and description.endswith('"'):
@@ -260,22 +280,78 @@ def get_text_as_list_of_lists(self, screenshot=None):
             )
 
 
-import io
-import subprocess
-
-from PIL import Image
-
-
-def take_screenshot_to_pil(filename="temp_screenshot.png"):
-    # Capture the screenshot and save it to a temporary file
-    subprocess.run(["screencapture", "-x", filename], check=True)
-
-    # Open the image file with PIL
-    with open(filename, "rb") as f:
-        image_data = f.read()
-    image = Image.open(io.BytesIO(image_data))
-
-    # Optionally, delete the temporary file if you don't need it after loading
-    os.remove(filename)
-
-    return image
+def take_screenshot_to_pil(screen=0, combine_screens=True):
+    # Get information about all screens
+    monitors = get_monitors()
+    if screen == -1: # All screens
+        
+        # Take a screenshot of each screen and save them in a list
+        screenshots = [pyautogui.screenshot(region=(monitor.x, monitor.y, monitor.width, monitor.height)) for monitor in monitors]
+
+        if combine_screens:
+            # Combine all screenshots horizontally
+            total_width = sum([img.width for img in screenshots])
+            max_height = max([img.height for img in screenshots])
+
+            # Create a new image with a size that can contain all screenshots
+            new_img = Image.new('RGB', (total_width, max_height))
+
+            # Paste each screenshot into the new image
+            x_offset = 0
+            for i, img in enumerate(screenshots):
+                # Convert PIL Image to OpenCV Image (numpy array)
+                img_cv = np.array(img)
+                img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
+
+                # Convert new_img PIL Image to OpenCV Image (numpy array)
+                new_img_cv = np.array(new_img)
+                new_img_cv = cv2.cvtColor(new_img_cv, cv2.COLOR_RGB2BGR)
+
+                # Paste each screenshot into the new image using OpenCV
+                new_img_cv[0:img_cv.shape[0], x_offset:x_offset+img_cv.shape[1]] = img_cv
+                x_offset += img.width
+
+                # Add monitor labels using OpenCV
+                font = cv2.FONT_HERSHEY_SIMPLEX
+                font_scale = 4
+                font_color = (255, 255, 255)
+                line_type = 2
+
+                if i == 0:
+                    text = "Primary Monitor"
+                else:
+                    text = f"Monitor {i}"
+                
+                # Calculate the font scale that will fit the text perfectly in the center of the monitor
+                text_size = cv2.getTextSize(text, font, font_scale, line_type)[0]
+                font_scale = min(img.width / text_size[0], img.height / text_size[1])
+                
+                # Recalculate the text size with the new font scale
+                text_size = cv2.getTextSize(text, font, font_scale, line_type)[0]
+                
+                # Calculate the position to center the text
+                text_x = x_offset - img.width // 2 - text_size[0] // 2
+                text_y = max_height // 2 - text_size[1] // 2
+                
+                cv2.putText(new_img_cv, text, (text_x, text_y), font, font_scale, font_color, line_type)
+
+                # Convert new_img from OpenCV Image back to PIL Image
+                new_img_cv = cv2.cvtColor(new_img_cv, cv2.COLOR_BGR2RGB)
+                new_img = Image.fromarray(new_img_cv)
+
+            return new_img
+        else:
+            return screenshots
+    elif screen > 0:
+        # Take a screenshot of the selected screen
+        return pyautogui.screenshot(region=(monitors[screen].x, monitors[screen].y, monitors[screen].width, monitors[screen].height))
+        
+    else:
+        # Take a screenshot of the primary screen
+        return pyautogui.screenshot(region=(monitors[screen].x, monitors[screen].y, monitors[screen].width, monitors[screen].height))
+
+
+def get_displays():
+    monitors = get_monitors()
+    return monitors
+    
diff --git a/interpreter/core/respond.py b/interpreter/core/respond.py
@@ -186,12 +186,12 @@ def respond(interpreter):
                     )
                     code = re.sub(r"import computer\.\w+\n", "pass\n", code)
                     # If it does this it sees the screenshot twice (which is expected jupyter behavior)
-                    if code.split("\n")[-1] in [
-                        "computer.display.view()",
-                        "computer.display.screenshot()",
-                        "computer.view()",
-                        "computer.screenshot()",
-                    ]:
+                    if any(code.split("\n")[-1].startswith(text) for text in [
+                        "computer.display.view",
+                        "computer.display.screenshot",
+                        "computer.view",
+                        "computer.screenshot",
+                    ]):
                         code = code + "\npass"
 
                 # sync up some things (is this how we want to do this?)
diff --git a/interpreter/terminal_interface/profiles/defaults/01.py b/interpreter/terminal_interface/profiles/defaults/01.py
@@ -90,7 +90,8 @@
 You may use the `computer` module to control the user's keyboard and mouse, if the task **requires** it:
 
 ```python
-computer.display.view() # Shows you what's on the screen, returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
+computer.display.info() # Returns a list of connected monitors/Displays and their info (x and y cordinates, width, height, width_mm, height_mm, name). Use this to verify the monitors connected before using computer.display.view() when neccessary
+computer.display.view() # Shows you what's on the screen (primary display by default), returns a `pil_image` `in case you need it (rarely). To get a specific display, use the parameter screen=DISPLAY_NUMBER (0 for primary monitor 1 and above for secondary monitors). **You almost always want to do this first!**
 computer.keyboard.hotkey(" ", "command") # Opens spotlight
 computer.keyboard.write("hello")
 computer.mouse.click("text onscreen") # This clicks on the UI element with that text. Use this **frequently** and get creative! To click a video, you could pass the *timestamp* (which is usually written on the thumbnail) into this.
diff --git a/interpreter/terminal_interface/profiles/defaults/os.py b/interpreter/terminal_interface/profiles/defaults/os.py
@@ -36,7 +36,8 @@
 ```python
 computer.browser.search(query) # Silently searches Google for the query, returns result. The user's browser is unaffected. (does not open a browser!)
 
-computer.display.view() # Shows you what's on the screen, returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first!**
+computer.display.info() # Returns a list of connected monitors/Displays and their info (x and y cordinates, width, height, width_mm, height_mm, name). Use this to verify the monitors connected before using computer.display.view() when neccessary
+computer.display.view() # Shows you what's on the screen (primary display by default), returns a `pil_image` `in case you need it (rarely). To get a specific display, use the parameter screen=DISPLAY_NUMBER (0 for primary monitor 1 and above for secondary monitors). **You almost always want to do this first!**
 
 computer.keyboard.hotkey(" ", "command") # Opens spotlight (very useful)
 computer.keyboard.write("hello")
diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py
@@ -361,12 +361,12 @@ def terminal_interface(interpreter, message):
                                 # (unless we figure out how to do this AFTER taking the screenshot)
                                 # otherwise it will try to click this notification!
 
-                                if action in [
-                                    "computer.screenshot()",
-                                    "computer.display.screenshot()",
-                                    "computer.display.view()",
-                                    "computer.view()",
-                                ]:
+                                if any(action.startswith(text) for text in [
+                                    "computer.screenshot",
+                                    "computer.display.screenshot",
+                                    "computer.display.view",
+                                    "computer.view"
+                                ]):
                                     description = "Viewing screen..."
                                 elif action == "computer.mouse.click()":
                                     description = "Clicking..."
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml