Skip to content

Commit c857a0f

Browse files
committed
Lazy import modules
1 parent 749bf75 commit c857a0f

File tree

2 files changed

+85
-37
lines changed

2 files changed

+85
-37
lines changed

interpreter/core/computer/display/display.py

Lines changed: 67 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
import base64
2+
import io
23
import os
34
import platform
45
import pprint
6+
import subprocess
57
import time
68
import warnings
79
from contextlib import redirect_stdout
810
from io import BytesIO
9-
import io
10-
import subprocess
11-
from PIL import Image
11+
1212
import requests
13+
from PIL import Image
14+
1315
from ...utils.lazy_import import lazy_import
1416
from ..utils.recipient_utils import format_to_recipient
15-
from screeninfo import get_monitors # for getting info about connected monitors
16-
1717

1818
# Still experimenting with this
1919
# from utils.get_active_window import get_active_window
@@ -23,6 +23,7 @@
2323
pyautogui = lazy_import("pyautogui")
2424
np = lazy_import("numpy")
2525
plt = lazy_import("matplotlib.pyplot")
26+
screeninfo = lazy_import("screeninfo")
2627

2728

2829
from ..utils.computer_vision import find_text_in_image, pytesseract_get_text
@@ -66,20 +67,26 @@ def info(self):
6667
Returns a list of all connected montitor/displays and thir information
6768
"""
6869
return get_displays()
69-
70-
71-
def view(self, show=True, quadrant=None, screen=0, combine_screens=True
72-
):
70+
71+
def view(self, show=True, quadrant=None, screen=0, combine_screens=True):
7372
"""
7473
Redirects to self.screenshot
7574
"""
76-
return self.screenshot(screen=screen, show=show, quadrant=quadrant, combine_screens=combine_screens)
75+
return self.screenshot(
76+
screen=screen, show=show, quadrant=quadrant, combine_screens=combine_screens
77+
)
7778

7879
# def get_active_window(self):
7980
# return get_active_window()
8081

8182
def screenshot(
82-
self, screen=0, show=True, quadrant=None, active_app_only=False, force_image=False,combine_screens=True
83+
self,
84+
screen=0,
85+
show=True,
86+
quadrant=None,
87+
active_app_only=False,
88+
force_image=False,
89+
combine_screens=True,
8390
):
8491
"""
8592
Shows you what's on the screen by taking a screenshot of the entire screen or a specified quadrant. Returns a `pil_image` in case you need it (rarely). **You almost always want to do this first!**
@@ -106,7 +113,9 @@ def screenshot(
106113
region = self.get_active_window()["region"]
107114
screenshot = pyautogui.screenshot(region=region)
108115
else:
109-
screenshot = take_screenshot_to_pil(screen=screen, combine_screens=combine_screens) # this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
116+
screenshot = take_screenshot_to_pil(
117+
screen=screen, combine_screens=combine_screens
118+
) # this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
110119
# message = format_to_recipient("Taking a screenshot of the entire screen. This is not recommended. You (the language model assistant) will receive it with low resolution.\n\nTo maximize performance, use computer.display.view(active_app_only=True). This will produce an ultra high quality image of the active application.", "assistant")
111120
# print(message)
112121

@@ -134,7 +143,9 @@ def screenshot(
134143
# Open the image file with PIL
135144
# IPython interactive mode auto-displays plots, causing RGBA handling issues, possibly MacOS-specific.
136145
if isinstance(screenshot, list):
137-
screenshot = [img.convert("RGB") for img in screenshot] # if screenshot is a list (i.e combine_screens=False).
146+
screenshot = [
147+
img.convert("RGB") for img in screenshot
148+
] # if screenshot is a list (i.e combine_screens=False).
138149
else:
139150
screenshot = screenshot.convert("RGB")
140151

@@ -152,7 +163,7 @@ def screenshot(
152163
warnings.simplefilter("ignore")
153164
plt.show()
154165

155-
return screenshot # this will be a list of combine_screens == False
166+
return screenshot  # this will be a list if combine_screens == False
156167

157168
def find(self, description, screenshot=None):
158169
if description.startswith('"') and description.endswith('"'):
@@ -282,19 +293,23 @@ def get_text_as_list_of_lists(self, screenshot=None):
282293

283294
def take_screenshot_to_pil(screen=0, combine_screens=True):
284295
# Get information about all screens
285-
monitors = get_monitors()
286-
if screen == -1: # All screens
287-
296+
monitors = screeninfo.get_monitors()
297+
if screen == -1: # All screens
288298
# Take a screenshot of each screen and save them in a list
289-
screenshots = [pyautogui.screenshot(region=(monitor.x, monitor.y, monitor.width, monitor.height)) for monitor in monitors]
299+
screenshots = [
300+
pyautogui.screenshot(
301+
region=(monitor.x, monitor.y, monitor.width, monitor.height)
302+
)
303+
for monitor in monitors
304+
]
290305

291306
if combine_screens:
292307
# Combine all screenshots horizontally
293308
total_width = sum([img.width for img in screenshots])
294309
max_height = max([img.height for img in screenshots])
295310

296311
# Create a new image with a size that can contain all screenshots
297-
new_img = Image.new('RGB', (total_width, max_height))
312+
new_img = Image.new("RGB", (total_width, max_height))
298313

299314
# Paste each screenshot into the new image
300315
x_offset = 0
@@ -308,7 +323,9 @@ def take_screenshot_to_pil(screen=0, combine_screens=True):
308323
new_img_cv = cv2.cvtColor(new_img_cv, cv2.COLOR_RGB2BGR)
309324

310325
# Paste each screenshot into the new image using OpenCV
311-
new_img_cv[0:img_cv.shape[0], x_offset:x_offset+img_cv.shape[1]] = img_cv
326+
new_img_cv[
327+
0 : img_cv.shape[0], x_offset : x_offset + img_cv.shape[1]
328+
] = img_cv
312329
x_offset += img.width
313330

314331
# Add monitor labels using OpenCV
@@ -321,19 +338,27 @@ def take_screenshot_to_pil(screen=0, combine_screens=True):
321338
text = "Primary Monitor"
322339
else:
323340
text = f"Monitor {i}"
324-
341+
325342
# Calculate the font scale that will fit the text perfectly in the center of the monitor
326343
text_size = cv2.getTextSize(text, font, font_scale, line_type)[0]
327344
font_scale = min(img.width / text_size[0], img.height / text_size[1])
328-
345+
329346
# Recalculate the text size with the new font scale
330347
text_size = cv2.getTextSize(text, font, font_scale, line_type)[0]
331-
348+
332349
# Calculate the position to center the text
333350
text_x = x_offset - img.width // 2 - text_size[0] // 2
334351
text_y = max_height // 2 - text_size[1] // 2
335-
336-
cv2.putText(new_img_cv, text, (text_x, text_y), font, font_scale, font_color, line_type)
352+
353+
cv2.putText(
354+
new_img_cv,
355+
text,
356+
(text_x, text_y),
357+
font,
358+
font_scale,
359+
font_color,
360+
line_type,
361+
)
337362

338363
# Convert new_img from OpenCV Image back to PIL Image
339364
new_img_cv = cv2.cvtColor(new_img_cv, cv2.COLOR_BGR2RGB)
@@ -344,14 +369,27 @@ def take_screenshot_to_pil(screen=0, combine_screens=True):
344369
return screenshots
345370
elif screen > 0:
346371
# Take a screenshot of the selected screen
347-
return pyautogui.screenshot(region=(monitors[screen].x, monitors[screen].y, monitors[screen].width, monitors[screen].height))
348-
372+
return pyautogui.screenshot(
373+
region=(
374+
monitors[screen].x,
375+
monitors[screen].y,
376+
monitors[screen].width,
377+
monitors[screen].height,
378+
)
379+
)
380+
349381
else:
350382
# Take a screenshot of the primary screen
351-
return pyautogui.screenshot(region=(monitors[screen].x, monitors[screen].y, monitors[screen].width, monitors[screen].height))
383+
return pyautogui.screenshot(
384+
region=(
385+
monitors[screen].x,
386+
monitors[screen].y,
387+
monitors[screen].width,
388+
monitors[screen].height,
389+
)
390+
)
352391

353392

354393
def get_displays():
355394
monitors = get_monitors()
356395
return monitors
357-

interpreter/core/computer/vision/vision.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,34 @@
1-
from transformers import AutoModelForCausalLM, AutoTokenizer
2-
from PIL import Image
31
import base64
42
import io
53

4+
from PIL import Image
5+
6+
from ...utils.lazy_import import lazy_import
7+
8+
transformers = lazy_import("transformers")
9+
610

711
class Vision:
812
def __init__(self, computer):
913
self.computer = computer
10-
self.model = None # Will load upon first use
11-
self.tokenizer = None # Will load upon first use
14+
self.model = None # Will load upon first use
15+
self.tokenizer = None # Will load upon first use
1216

1317
def load(self):
14-
print("Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior.")
15-
print("Alternativley, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`.")
18+
print(
19+
"Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
20+
)
21+
print(
22+
"Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`."
23+
)
1624
model_id = "vikhyatk/moondream2"
1725
revision = "2024-04-02"
18-
self.model = AutoModelForCausalLM.from_pretrained(
26+
self.model = transformers.AutoModelForCausalLM.from_pretrained(
1927
model_id, trust_remote_code=True, revision=revision
2028
)
21-
self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
29+
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
30+
model_id, revision=revision
31+
)
2232

2333
def query(self, query="Describe this image.", base_64=None, path=None, lmc=None):
2434
"""

0 commit comments

Comments
 (0)