Skip to content

Commit c01a59d

Browse files
committed
Llama 3 Vision
1 parent 2b57bf8 commit c01a59d

File tree

4 files changed

+37
-3
lines changed

4 files changed

+37
-3
lines changed

interpreter/core/computer/computer.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from .terminal.terminal import Terminal
1818
from .vision.vision import Vision
1919

20+
2021
class Computer:
2122
def __init__(self, interpreter):
2223
self.interpreter = interpreter
@@ -93,6 +94,12 @@ def screenshot(self, *args, **kwargs):
9394
"""
9495
return self.display.screenshot(*args, **kwargs)
9596

97+
def view(self, *args, **kwargs):
    """
    Capture what is currently on the display.

    Alias for ``computer.display.screenshot``: every positional and
    keyword argument is forwarded unchanged, and whatever that call
    returns is handed straight back to the caller.
    """
    take_screenshot = self.display.screenshot
    return take_screenshot(*args, **kwargs)
102+
96103
def to_dict(self):
97104
def json_serializable(obj):
98105
try:

interpreter/core/computer/display/display.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,13 @@ def screenshot(
9494
:param combine_screens: If True, a collage of all display screens will be returned. Otherwise, a list of display screens will be returned.
9595
"""
9696
if not self.computer.emit_images and force_image == False:
97-
text = self.get_text_as_list_of_lists()
97+
screenshot = self.screenshot(show=False, force_image=True)
98+
99+
description = self.computer.vision.query(pil_image=screenshot)
100+
print("A DESCRIPTION OF WHAT'S ON THE SCREEN: " + description)
101+
102+
print("ALL OF THE TEXT ON THE SCREEN: ")
103+
text = self.get_text_as_list_of_lists(screenshot=screenshot)
98104
pp = pprint.PrettyPrinter(indent=4)
99105
pretty_text = pp.pformat(text) # language models like it pretty!
100106
pretty_text = format_to_recipient(pretty_text, "assistant")

interpreter/core/computer/vision/vision.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from ...utils.lazy_import import lazy_import
77

8-
transformers = lazy_import("transformers")
8+
# transformers = lazy_import("transformers") # Doesn't work for some reason! We import it later.
99

1010

1111
class Vision:
@@ -15,6 +15,8 @@ def __init__(self, computer):
1515
self.tokenizer = None # Will load upon first use
1616

1717
def load(self):
18+
import transformers # Wait until we use it. Transformers can't be lazy loaded for some reason!
19+
1820
print(
1921
"Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
2022
)
@@ -30,7 +32,14 @@ def load(self):
3032
model_id, revision=revision
3133
)
3234

33-
def query(self, query="Describe this image.", base_64=None, path=None, lmc=None):
35+
def query(
36+
self,
37+
query="Describe this image.",
38+
base_64=None,
39+
path=None,
40+
lmc=None,
41+
pil_image=None,
42+
):
3443
"""
3544
Uses Moondream to ask query of the image (which can be a base64 string, a file path, an lmc message, or a PIL image passed as pil_image)
3645
"""
@@ -59,6 +68,8 @@ def query(self, query="Describe this image.", base_64=None, path=None, lmc=None)
5968
img = Image.open(io.BytesIO(img_data))
6069
elif path:
6170
img = Image.open(path)
71+
elif pil_image:
72+
img = pil_image
6273

6374
enc_image = self.model.encode_image(img)
6475
return self.model.answer_question(enc_image, query, self.tokenizer)

tests/test_interpreter.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@
2121
import pytest
2222
from websocket import create_connection
2323

24+
25+
def test_localos():
    """
    Smoke-test ``computer.view()`` with image emission turned off.

    With ``emit_images`` set to False, the display screenshot takes its
    text branch (a vision-model description plus the on-screen text)
    instead of returning an image. The test passes when that call
    completes without raising.
    """
    # NOTE(review): this path loads the local vision model, so it likely
    # needs the same optional dependencies as test_m_vision, which is
    # skipped with "Requires open-interpreter[local]" -- confirm whether
    # this test should carry the same skip marker.
    interpreter.computer.emit_images = False
    try:
        interpreter.computer.view()
    finally:
        # Always switch image emission back on, even when view() raises,
        # so a failure here cannot change the behavior of later tests.
        interpreter.computer.emit_images = True
30+
31+
2432
@pytest.mark.skip(reason="Requires open-interpreter[local]")
2533
def test_m_vision():
2634
base64png = "iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAIAAADTED8xAAADMElEQVR4nOzVwQnAIBQFQYXff81RUkQCOyDj1YOPnbXWPmeTRef+/3O/OyBjzh3CD95BfqICMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMK0CMO0TAAD//2Anhf4QtqobAAAAAElFTkSuQmCC"
@@ -45,8 +53,10 @@ def test_m_vision():
4553

4654
interpreter.force_task_completion = False
4755
import time
56+
4857
time.sleep(10)
4958

59+
5060
@pytest.mark.skip(reason="Computer with display only + no way to fail test")
5161
def test_point():
5262
# interpreter.computer.debug = True

0 commit comments

Comments
 (0)