
Commit 0ab4c50

Moondream for local vision + fixes, custom execution instructions
1 parent: 59956e0

14 files changed: +400 −180


interpreter/core/computer/computer.py

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,7 @@
 from .skills.skills import Skills
 from .sms.sms import SMS
 from .terminal.terminal import Terminal
-
+from .vision.vision import Vision
 
 class Computer:
     def __init__(self, interpreter):
@@ -37,6 +37,7 @@ def __init__(self, interpreter):
         self.contacts = Contacts(self)
         self.browser = Browser(self)
         self.os = Os(self)
+        self.vision = Vision(self)
         self.skills = Skills(self)
         self.docs = Docs(self)
         self.ai = Ai(self)
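With this wiring in place, vision sits alongside the other computer modules and is reachable from an interpreter instance. A minimal sketch, assuming the package's usual `from interpreter import interpreter` entry point; the image path is a placeholder:

    from interpreter import interpreter

    vision = interpreter.computer.vision      # constructed in Computer.__init__ above
    answer = vision.query(path="photo.jpg")   # lazy-loads Moondream on first use
    print(answer)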

interpreter/core/computer/vision/__init__.py

Whitespace-only changes.
interpreter/core/computer/vision/vision.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from PIL import Image
+import base64
+import io
+
+
+class Vision:
+    def __init__(self, computer):
+        self.computer = computer
+        self.model = None  # Will load upon first use
+        self.tokenizer = None  # Will load upon first use
+
+    def load(self):
+        print("Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior.")
+        print("Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`.")
+        model_id = "vikhyatk/moondream2"
+        revision = "2024-04-02"
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id, trust_remote_code=True, revision=revision
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+
+    def query(self, query="Describe this image.", base_64=None, path=None, lmc=None):
+        """
+        Uses Moondream to answer `query` about an image, given as a base64 string, a file path, or an LMC message.
+        """
+
+        if self.model is None and self.tokenizer is None:
+            self.load()
+
+        if lmc:
+            if "base64" in lmc["format"]:
+                # # Extract the extension from the format, default to 'png' if not specified
+                # if "." in lmc["format"]:
+                #     extension = lmc["format"].split(".")[-1]
+                # else:
+                #     extension = "png"
+
+                # Decode the base64 image
+                img_data = base64.b64decode(lmc["content"])
+                img = Image.open(io.BytesIO(img_data))
+
+            elif lmc["format"] == "path":
+                # Open the image from the given file path
+                image_path = lmc["content"]
+                img = Image.open(image_path)
+        elif base_64:
+            img_data = base64.b64decode(base_64)
+            img = Image.open(io.BytesIO(img_data))
+        elif path:
+            img = Image.open(path)
+
+        enc_image = self.model.encode_image(img)
+        return self.model.answer_question(enc_image, query, self.tokenizer)
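The `query` method accepts the image in three forms. A usage sketch, assuming an image file `chart.png` exists; the `base64.png` format tag mirrors the branch checked above, and the LMC dictionary keys are the ones `query` actually reads:

    import base64

    vision = Vision(computer=None)  # standalone for illustration; normally reached as computer.vision

    # 1) File path
    print(vision.query(query="Describe this image.", path="chart.png"))

    # 2) Raw base64 string
    with open("chart.png", "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    print(vision.query(query="What does the chart show?", base_64=b64))

    # 3) LMC-style image message, the form llm.py passes in below
    print(vision.query(lmc={"type": "image", "format": "base64.png", "content": b64}))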

interpreter/core/core.py

Lines changed: 11 additions & 11 deletions
@@ -52,10 +52,10 @@ def __init__(
         force_task_completion=False,
         force_task_completion_message="""Proceed. You CAN run code on my machine. If you want to run code, start your message with "```"! If the entire task I asked for is done, say exactly 'The task is done.' If you need some specific information (like username or password) say EXACTLY 'Please provide more information.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going.""",
         force_task_completion_breakers=[
-            "the task is done.",
-            "the task is impossible.",
-            "let me know what you'd like to do next.",
-            "please provide more information.",
+            "The task is done.",
+            "The task is impossible.",
+            "Let me know what you'd like to do next.",
+            "Please provide more information.",
         ],
         disable_telemetry=os.getenv("DISABLE_TELEMETRY", "false").lower() == "true",
         in_terminal_interface=False,
@@ -105,13 +105,6 @@ def __init__(
         self.os = os
         self.speak_messages = speak_messages
 
-        # LLM
-        self.llm = Llm(self) if llm is None else llm
-
-        # These are LLM related
-        self.system_message = system_message
-        self.custom_instructions = custom_instructions
-
         # Computer
         self.computer = Computer(self) if computer is None else computer
         self.sync_computer = sync_computer
@@ -123,6 +116,13 @@ def __init__(
 
         self.computer.import_skills = import_skills
 
+        # LLM
+        self.llm = Llm(self) if llm is None else llm
+
+        # These are LLM related
+        self.system_message = system_message
+        self.custom_instructions = custom_instructions
+
     def server(self, *args, **kwargs):
         server(self, *args, **kwargs)
 
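The LLM block is moved below the Computer block rather than edited in place: with this commit, `Llm.__init__` takes `interpreter.computer.vision.query` as its default `vision_renderer` (see llm.py below), so `self.computer` must exist before `Llm(self)` is constructed. A simplified illustration of the ordering dependency (not the real constructors):

    class Llm:
        def __init__(self, interpreter):
            # Raises AttributeError if interpreter.computer doesn't exist yet
            self.vision_renderer = interpreter.computer.vision.query

    class OpenInterpreter:
        def __init__(self):
            self.computer = Computer(self)  # must come first
            self.llm = Llm(self)            # safe: computer.vision is already there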

interpreter/core/llm/llm.py

Lines changed: 38 additions & 15 deletions
@@ -21,14 +21,18 @@ def __init__(self, interpreter):
         # Store a reference to parent interpreter
         self.interpreter = interpreter
 
-        # Chat completions "endpoint"
+        # OpenAI-compatible chat completions "endpoint"
         self.completions = fixed_litellm_completions
 
         # Settings
         self.model = "gpt-4-turbo"
         self.temperature = 0
-        self.supports_vision = False
-        self.supports_functions = None  # Will try to auto-detect
+
+        self.supports_vision = None  # Will try to auto-detect
+        self.vision_renderer = self.interpreter.computer.vision.query  # Will only use if supports_vision is False
+
+        self.supports_functions = None  # Will try to auto-detect
+        self.execution_instructions = "To execute code on the user's machine, write a markdown code block. Specify the language after the ```. You will receive the output. Use any programming language."  # If supports_functions is False, this will be added to the system message
 
         # Optional settings
         self.context_window = None
@@ -67,11 +71,20 @@ def run(self, messages):
                     self.supports_functions = False
             except:
                 self.supports_functions = False
+
+        # Detect vision support
+        if self.supports_vision == None:
+            try:
+                if litellm.supports_vision(self.model):
+                    self.supports_vision = True
+                else:
+                    self.supports_vision = False
+            except:
+                self.supports_vision = False
 
         # Trim image messages if they're there
+        image_messages = [msg for msg in messages if msg["type"] == "image"]
         if self.supports_vision:
-            image_messages = [msg for msg in messages if msg["type"] == "image"]
-
             if self.interpreter.os:
                 # Keep only the last two images if the interpreter is running in OS mode
                 if len(image_messages) > 1:
@@ -87,6 +100,11 @@
                         if self.interpreter.verbose:
                             print("Removing image message!")
                 # Idea: we could set detail: low for the middle messages, instead of deleting them
+        elif self.supports_vision == False and self.vision_renderer:
+            for img_msg in image_messages:
+                if img_msg["format"] != "description":
+                    img_msg["content"] = "Imagine I have just shown you an image with this description: " + self.vision_renderer(lmc=img_msg)
+                    img_msg["format"] = "description"
 
         # Convert to OpenAI messages format
         messages = convert_to_openai_messages(
@@ -96,16 +114,6 @@
             shrink_images=self.interpreter.shrink_images,
         )
 
-        if self.interpreter.debug:
-            print("\n\n\nOPENAI COMPATIBLE MESSAGES\n\n\n")
-            for message in messages:
-                if len(str(message)) > 5000:
-                    print(str(message)[:200] + "...")
-                else:
-                    print(message)
-                print("\n")
-            print("\n\n\n")
-
         system_message = messages[0]["content"]
         messages = messages[1:]
 
@@ -195,6 +203,17 @@
         if self.interpreter.verbose:
             litellm.set_verbose = True
 
+        if self.interpreter.debug:
+            print("\n\n\nOPENAI COMPATIBLE MESSAGES\n\n\n")
+            for message in messages:
+                if len(str(message)) > 5000:
+                    print(str(message)[:200] + "...")
+                else:
+                    print(message)
+                print("\n")
+            print("\n\n\n")
+            time.sleep(5)
+
         if self.supports_functions:
             yield from run_function_calling_llm(self, params)
         else:
@@ -207,6 +226,10 @@ def fixed_litellm_completions(**params):
     Hopefully they will fix this!
     """
 
+    if "local" in params.get("model"):
+        # Kinda hacky, but this helps
+        params["stop"] = ["<|assistant|>", "<|end|>"]
+
     # Run completion
     first_error = None
     try:
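Taken together, the new `Llm` attributes give three ways to handle image messages. A hedged configuration sketch, assuming the usual `from interpreter import interpreter` entry point; the attribute names come from this diff, the values are examples:

    from interpreter import interpreter

    # Default: supports_vision is None, so litellm.supports_vision() decides at run time.

    # Force a vision-capable model and send images straight through:
    interpreter.llm.supports_vision = True

    # Keep a text-only model and let Moondream turn images into text descriptions
    # (this is what the vision_renderer default enables):
    interpreter.llm.supports_vision = False

    # Or switch the local description step off entirely, as the load() message suggests:
    interpreter.llm.vision_renderer = None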

interpreter/core/llm/run_text_llm.py

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
 def run_text_llm(llm, params):
     ## Setup
 
-    if llm.interpreter.computer.terminal.languages != []:
+    if llm.execution_instructions:
         try:
             # Add the system message
             params["messages"][0][
                 "content"
-            ] += "\nTo execute code on the user's machine, write a markdown code block. Specify the language after the ```. You will receive the output. Use any programming language."
+            ] += "\n" + llm.execution_instructions
         except:
             print('params["messages"][0]', params["messages"][0])
             raise
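Because the execution prompt now lives on `llm.execution_instructions` instead of being hard-coded here, it can be customized or emptied (the guard above is a plain truthiness check). A hedged example of both, assuming the usual `from interpreter import interpreter` entry point:

    from interpreter import interpreter

    # Swap in stricter instructions for text-only models
    interpreter.llm.execution_instructions = (
        "To execute code on the user's machine, write a markdown code block "
        "and specify the language after the ```. Only use Python."
    )

    # Or set it to an empty string so run_text_llm skips the append entirely
    interpreter.llm.execution_instructions = ""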
