Skip to content

Commit c857a0f

Browse files
committed
Lazy import modules
1 parent 749bf75 commit c857a0f

File tree

2 files changed

+85
-37
lines changed

2 files changed

+85
-37
lines changed

interpreter/core/computer/display/display.py

Lines changed: 67 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
import base64
2+
import io
23
import os
34
import platform
45
import pprint
6+
import subprocess
57
import time
68
import warnings
79
from contextlib import redirect_stdout
810
from io import BytesIO
9-
import io
10-
import subprocess
11-
from PIL import Image
11+
1212
import requests
13+
from PIL import Image
14+
1315
from ...utils.lazy_import import lazy_import
1416
from ..utils.recipient_utils import format_to_recipient
15-
from screeninfo import get_monitors # for getting info about connected monitors
16-
1717

1818
# Still experimenting with this
1919
# from utils.get_active_window import get_active_window
@@ -23,6 +23,7 @@
2323
pyautogui = lazy_import("pyautogui")
2424
np = lazy_import("numpy")
2525
plt = lazy_import("matplotlib.pyplot")
26+
screeninfo = lazy_import("screeninfo")
2627

2728

2829
from ..utils.computer_vision import find_text_in_image, pytesseract_get_text
@@ -66,20 +67,26 @@ def info(self):
6667
Returns a list of all connected montitor/displays and thir information
6768
"""
6869
return get_displays()
69-
70-
71-
def view(self, show=True, quadrant=None, screen=0, combine_screens=True
72-
):
70+
71+
def view(self, show=True, quadrant=None, screen=0, combine_screens=True):
7372
"""
7473
Redirects to self.screenshot
7574
"""
76-
return self.screenshot(screen=screen, show=show, quadrant=quadrant, combine_screens=combine_screens)
75+
return self.screenshot(
76+
screen=screen, show=show, quadrant=quadrant, combine_screens=combine_screens
77+
)
7778

7879
# def get_active_window(self):
7980
# return get_active_window()
8081

8182
def screenshot(
82-
self, screen=0, show=True, quadrant=None, active_app_only=False, force_image=False,combine_screens=True
83+
self,
84+
screen=0,
85+
show=True,
86+
quadrant=None,
87+
active_app_only=False,
88+
force_image=False,
89+
combine_screens=True,
8390
):
8491
"""
8592
Shows you what's on the screen by taking a screenshot of the entire screen or a specified quadrant. Returns a `pil_image` in case you need it (rarely). **You almost always want to do this first!**
@@ -106,7 +113,9 @@ def screenshot(
106113
region = self.get_active_window()["region"]
107114
screenshot = pyautogui.screenshot(region=region)
108115
else:
109-
screenshot = take_screenshot_to_pil(screen=screen, combine_screens=combine_screens) # this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
116+
screenshot = take_screenshot_to_pil(
117+
screen=screen, combine_screens=combine_screens
118+
) # this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
110119
# message = format_to_recipient("Taking a screenshot of the entire screen. This is not recommended. You (the language model assistant) will receive it with low resolution.\n\nTo maximize performance, use computer.display.view(active_app_only=True). This will produce an ultra high quality image of the active application.", "assistant")
111120
# print(message)
112121

@@ -134,7 +143,9 @@ def screenshot(
134143
# Open the image file with PIL
135144
# IPython interactive mode auto-displays plots, causing RGBA handling issues, possibly MacOS-specific.
136145
if isinstance(screenshot, list):
137-
screenshot = [img.convert("RGB") for img in screenshot] # if screenshot is a list (i.e combine_screens=False).
146+
screenshot = [
147+
img.convert("RGB") for img in screenshot
148+
] # if screenshot is a list (i.e combine_screens=False).
138149
else:
139150
screenshot = screenshot.convert("RGB")
140151

@@ -152,7 +163,7 @@ def screenshot(
152163
warnings.simplefilter("ignore")
153164
plt.show()
154165

155-
return screenshot # this will be a list of combine_screens == False
166+
return screenshot  # this will be a list if combine_screens == False
156167

157168
def find(self, description, screenshot=None):
158169
if description.startswith('"') and description.endswith('"'):
@@ -282,19 +293,23 @@ def get_text_as_list_of_lists(self, screenshot=None):
282293

283294
def take_screenshot_to_pil(screen=0, combine_screens=True):
284295
# Get information about all screens
285-
monitors = get_monitors()
286-
if screen == -1: # All screens
287-
296+
monitors = screeninfo.get_monitors()
297+
if screen == -1: # All screens
288298
# Take a screenshot of each screen and save them in a list
289-
screenshots = [pyautogui.screenshot(region=(monitor.x, monitor.y, monitor.width, monitor.height)) for monitor in monitors]
299+
screenshots = [
300+
pyautogui.screenshot(
301+
region=(monitor.x, monitor.y, monitor.width, monitor.height)
302+
)
303+
for monitor in monitors
304+
]
290305

291306
if combine_screens:
292307
# Combine all screenshots horizontally
293308
total_width = sum([img.width for img in screenshots])
294309
max_height = max([img.height for img in screenshots])
295310

296311
# Create a new image with a size that can contain all screenshots
297-
new_img = Image.new('RGB', (total_width, max_height))
312+
new_img = Image.new("RGB", (total_width, max_height))
298313

299314
# Paste each screenshot into the new image
300315
x_offset = 0
@@ -308,7 +323,9 @@ def take_screenshot_to_pil(screen=0, combine_screens=True):
308323
new_img_cv = cv2.cvtColor(new_img_cv, cv2.COLOR_RGB2BGR)
309324

310325
# Paste each screenshot into the new image using OpenCV
311-
new_img_cv[0:img_cv.shape[0], x_offset:x_offset+img_cv.shape[1]] = img_cv
326+
new_img_cv[
327+
0 : img_cv.shape[0], x_offset : x_offset + img_cv.shape[1]
328+
] = img_cv
312329
x_offset += img.width
313330

314331
# Add monitor labels using OpenCV
@@ -321,19 +338,27 @@ def take_screenshot_to_pil(screen=0, combine_screens=True):
321338
text = "Primary Monitor"
322339
else:
323340
text = f"Monitor {i}"
324-
341+
325342
# Calculate the font scale that will fit the text perfectly in the center of the monitor
326343
text_size = cv2.getTextSize(text, font, font_scale, line_type)[0]
327344
font_scale = min(img.width / text_size[0], img.height / text_size[1])
328-
345+
329346
# Recalculate the text size with the new font scale
330347
text_size = cv2.getTextSize(text, font, font_scale, line_type)[0]
331-
348+
332349
# Calculate the position to center the text
333350
text_x = x_offset - img.width // 2 - text_size[0] // 2
334351
text_y = max_height // 2 - text_size[1] // 2
335-
336-
cv2.putText(new_img_cv, text, (text_x, text_y), font, font_scale, font_color, line_type)
352+
353+
cv2.putText(
354+
new_img_cv,
355+
text,
356+
(text_x, text_y),
357+
font,
358+
font_scale,
359+
font_color,
360+
line_type,
361+
)
337362

338363
# Convert new_img from OpenCV Image back to PIL Image
339364
new_img_cv = cv2.cvtColor(new_img_cv, cv2.COLOR_BGR2RGB)
@@ -344,14 +369,27 @@ def take_screenshot_to_pil(screen=0, combine_screens=True):
344369
return screenshots
345370
elif screen > 0:
346371
# Take a screenshot of the selected screen
347-
return pyautogui.screenshot(region=(monitors[screen].x, monitors[screen].y, monitors[screen].width, monitors[screen].height))
348-
372+
return pyautogui.screenshot(
373+
region=(
374+
monitors[screen].x,
375+
monitors[screen].y,
376+
monitors[screen].width,
377+
monitors[screen].height,
378+
)
379+
)
380+
349381
else:
350382
# Take a screenshot of the primary screen
351-
return pyautogui.screenshot(region=(monitors[screen].x, monitors[screen].y, monitors[screen].width, monitors[screen].height))
383+
return pyautogui.screenshot(
384+
region=(
385+
monitors[screen].x,
386+
monitors[screen].y,
387+
monitors[screen].width,
388+
monitors[screen].height,
389+
)
390+
)
352391

353392

354393
def get_displays():
355394
monitors = get_monitors()
356395
return monitors
357-

interpreter/core/computer/vision/vision.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,34 @@
1-
from transformers import AutoModelForCausalLM, AutoTokenizer
2-
from PIL import Image
31
import base64
42
import io
53

4+
from PIL import Image
5+
6+
from ...utils.lazy_import import lazy_import
7+
8+
transformers = lazy_import("transformers")
9+
610

711
class Vision:
812
def __init__(self, computer):
913
self.computer = computer
10-
self.model = None # Will load upon first use
11-
self.tokenizer = None # Will load upon first use
14+
self.model = None # Will load upon first use
15+
self.tokenizer = None # Will load upon first use
1216

1317
def load(self):
14-
print("Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior.")
15-
print("Alternativley, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`.")
18+
print(
19+
"Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
20+
)
21+
print(
22+
"Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`."
23+
)
1624
model_id = "vikhyatk/moondream2"
1725
revision = "2024-04-02"
18-
self.model = AutoModelForCausalLM.from_pretrained(
26+
self.model = transformers.AutoModelForCausalLM.from_pretrained(
1927
model_id, trust_remote_code=True, revision=revision
2028
)
21-
self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
29+
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
30+
model_id, revision=revision
31+
)
2232

2333
def query(self, query="Describe this image.", base_64=None, path=None, lmc=None):
2434
"""

0 commit comments

Comments
 (0)