
Commit 0ab4c50

Moondream for local vision + fixes, custom execution instructions
1 parent: 59956e0

14 files changed: +400 −180


interpreter/core/computer/computer.py

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,7 @@
 from .skills.skills import Skills
 from .sms.sms import SMS
 from .terminal.terminal import Terminal
-
+from .vision.vision import Vision
 
 class Computer:
     def __init__(self, interpreter):
@@ -37,6 +37,7 @@ def __init__(self, interpreter):
         self.contacts = Contacts(self)
         self.browser = Browser(self)
         self.os = Os(self)
+        self.vision = Vision(self)
         self.skills = Skills(self)
         self.docs = Docs(self)
         self.ai = Ai(self)
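With this wiring in place, vision sits alongside the other computer modules and is reachable from an interpreter instance. A minimal sketch, assuming the package's usual `from interpreter import interpreter` entry point; the image path is a placeholder:

    from interpreter import interpreter

    vision = interpreter.computer.vision      # constructed in Computer.__init__ above
    answer = vision.query(path="photo.jpg")   # lazy-loads Moondream on first use
    print(answer)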

interpreter/core/computer/vision/__init__.py

Whitespace-only changes.
interpreter/core/computer/vision/vision.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from PIL import Image
+import base64
+import io
+
+
+class Vision:
+    def __init__(self, computer):
+        self.computer = computer
+        self.model = None  # Will load upon first use
+        self.tokenizer = None  # Will load upon first use
+
+    def load(self):
+        print("Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior.")
+        print("Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`.")
+        model_id = "vikhyatk/moondream2"
+        revision = "2024-04-02"
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_id, trust_remote_code=True, revision=revision
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
+
+    def query(self, query="Describe this image.", base_64=None, path=None, lmc=None):
+        """
+        Uses Moondream to answer `query` about an image, given as a base64 string, a file path, or an LMC message.
+        """
+
+        if self.model is None and self.tokenizer is None:
+            self.load()
+
+        if lmc:
+            if "base64" in lmc["format"]:
+                # # Extract the extension from the format, default to 'png' if not specified
+                # if "." in lmc["format"]:
+                #     extension = lmc["format"].split(".")[-1]
+                # else:
+                #     extension = "png"
+
+                # Decode the base64 image
+                img_data = base64.b64decode(lmc["content"])
+                img = Image.open(io.BytesIO(img_data))
+
+            elif lmc["format"] == "path":
+                # Open the image from the given file path
+                image_path = lmc["content"]
+                img = Image.open(image_path)
+        elif base_64:
+            img_data = base64.b64decode(base_64)
+            img = Image.open(io.BytesIO(img_data))
+        elif path:
+            img = Image.open(path)
+
+        enc_image = self.model.encode_image(img)
+        return self.model.answer_question(enc_image, query, self.tokenizer)
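The `query` method accepts the image in three forms. A usage sketch, assuming an image file `chart.png` exists; the `base64.png` format tag mirrors the branch checked above, and the LMC dictionary keys are the ones `query` actually reads:

    import base64

    vision = Vision(computer=None)  # standalone for illustration; normally reached as computer.vision

    # 1) File path
    print(vision.query(query="Describe this image.", path="chart.png"))

    # 2) Raw base64 string
    with open("chart.png", "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    print(vision.query(query="What does the chart show?", base_64=b64))

    # 3) LMC-style image message, the form llm.py passes in below
    print(vision.query(lmc={"type": "image", "format": "base64.png", "content": b64}))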

interpreter/core/core.py

Lines changed: 11 additions & 11 deletions
@@ -52,10 +52,10 @@ def __init__(
         force_task_completion=False,
         force_task_completion_message="""Proceed. You CAN run code on my machine. If you want to run code, start your message with "```"! If the entire task I asked for is done, say exactly 'The task is done.' If you need some specific information (like username or password) say EXACTLY 'Please provide more information.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going.""",
         force_task_completion_breakers=[
-            "the task is done.",
-            "the task is impossible.",
-            "let me know what you'd like to do next.",
-            "please provide more information.",
+            "The task is done.",
+            "The task is impossible.",
+            "Let me know what you'd like to do next.",
+            "Please provide more information.",
         ],
         disable_telemetry=os.getenv("DISABLE_TELEMETRY", "false").lower() == "true",
         in_terminal_interface=False,
@@ -105,13 +105,6 @@ def __init__(
         self.os = os
         self.speak_messages = speak_messages
 
-        # LLM
-        self.llm = Llm(self) if llm is None else llm
-
-        # These are LLM related
-        self.system_message = system_message
-        self.custom_instructions = custom_instructions
-
         # Computer
         self.computer = Computer(self) if computer is None else computer
         self.sync_computer = sync_computer
@@ -123,6 +116,13 @@ def __init__(
 
         self.computer.import_skills = import_skills
 
+        # LLM
+        self.llm = Llm(self) if llm is None else llm
+
+        # These are LLM related
+        self.system_message = system_message
+        self.custom_instructions = custom_instructions
+
     def server(self, *args, **kwargs):
         server(self, *args, **kwargs)
 
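The LLM block is moved below the Computer block rather than edited in place: with this commit, `Llm.__init__` takes `interpreter.computer.vision.query` as its default `vision_renderer` (see llm.py below), so `self.computer` must exist before `Llm(self)` is constructed. A simplified illustration of the ordering dependency (not the real constructors):

    class Llm:
        def __init__(self, interpreter):
            # Raises AttributeError if interpreter.computer doesn't exist yet
            self.vision_renderer = interpreter.computer.vision.query

    class OpenInterpreter:
        def __init__(self):
            self.computer = Computer(self)  # must come first
            self.llm = Llm(self)            # safe: computer.vision is already there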

interpreter/core/llm/llm.py

Lines changed: 38 additions & 15 deletions
@@ -21,14 +21,18 @@ def __init__(self, interpreter):
         # Store a reference to parent interpreter
         self.interpreter = interpreter
 
-        # Chat completions "endpoint"
+        # OpenAI-compatible chat completions "endpoint"
         self.completions = fixed_litellm_completions
 
         # Settings
         self.model = "gpt-4-turbo"
         self.temperature = 0
-        self.supports_vision = False
-        self.supports_functions = None  # Will try to auto-detect
+
+        self.supports_vision = None  # Will try to auto-detect
+        self.vision_renderer = self.interpreter.computer.vision.query  # Will only use if supports_vision is False
+
+        self.supports_functions = None  # Will try to auto-detect
+        self.execution_instructions = "To execute code on the user's machine, write a markdown code block. Specify the language after the ```. You will receive the output. Use any programming language."  # If supports_functions is False, this will be added to the system message
 
         # Optional settings
         self.context_window = None
@@ -67,11 +71,20 @@ def run(self, messages):
                     self.supports_functions = False
             except:
                 self.supports_functions = False
+
+        # Detect vision support
+        if self.supports_vision == None:
+            try:
+                if litellm.supports_vision(self.model):
+                    self.supports_vision = True
+                else:
+                    self.supports_vision = False
+            except:
+                self.supports_vision = False
 
         # Trim image messages if they're there
+        image_messages = [msg for msg in messages if msg["type"] == "image"]
         if self.supports_vision:
-            image_messages = [msg for msg in messages if msg["type"] == "image"]
-
             if self.interpreter.os:
                 # Keep only the last two images if the interpreter is running in OS mode
                 if len(image_messages) > 1:
@@ -87,6 +100,11 @@
                         if self.interpreter.verbose:
                             print("Removing image message!")
                 # Idea: we could set detail: low for the middle messages, instead of deleting them
+        elif self.supports_vision == False and self.vision_renderer:
+            for img_msg in image_messages:
+                if img_msg["format"] != "description":
+                    img_msg["content"] = "Imagine I have just shown you an image with this description: " + self.vision_renderer(lmc=img_msg)
+                    img_msg["format"] = "description"
 
         # Convert to OpenAI messages format
         messages = convert_to_openai_messages(
@@ -96,16 +114,6 @@
             shrink_images=self.interpreter.shrink_images,
         )
 
-        if self.interpreter.debug:
-            print("\n\n\nOPENAI COMPATIBLE MESSAGES\n\n\n")
-            for message in messages:
-                if len(str(message)) > 5000:
-                    print(str(message)[:200] + "...")
-                else:
-                    print(message)
-                print("\n")
-            print("\n\n\n")
-
         system_message = messages[0]["content"]
         messages = messages[1:]
 
@@ -195,6 +203,17 @@
         if self.interpreter.verbose:
             litellm.set_verbose = True
 
+        if self.interpreter.debug:
+            print("\n\n\nOPENAI COMPATIBLE MESSAGES\n\n\n")
+            for message in messages:
+                if len(str(message)) > 5000:
+                    print(str(message)[:200] + "...")
+                else:
+                    print(message)
+                print("\n")
+            print("\n\n\n")
+            time.sleep(5)
+
         if self.supports_functions:
             yield from run_function_calling_llm(self, params)
         else:
@@ -207,6 +226,10 @@ def fixed_litellm_completions(**params):
     Hopefully they will fix this!
     """
 
+    if "local" in params.get("model"):
+        # Kinda hacky, but this helps
+        params["stop"] = ["<|assistant|>", "<|end|>"]
+
     # Run completion
     first_error = None
     try:
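Taken together, the new `Llm` attributes give three ways to handle image messages. A hedged configuration sketch, assuming the usual `from interpreter import interpreter` entry point; the attribute names come from this diff, the values are examples:

    from interpreter import interpreter

    # Default: supports_vision is None, so litellm.supports_vision() decides at run time.

    # Force a vision-capable model and send images straight through:
    interpreter.llm.supports_vision = True

    # Keep a text-only model and let Moondream turn images into text descriptions
    # (this is what the vision_renderer default enables):
    interpreter.llm.supports_vision = False

    # Or switch the local description step off entirely, as the load() message suggests:
    interpreter.llm.vision_renderer = None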

interpreter/core/llm/run_text_llm.py

Lines changed: 2 additions & 2 deletions
@@ -1,12 +1,12 @@
 def run_text_llm(llm, params):
     ## Setup
 
-    if llm.interpreter.computer.terminal.languages != []:
+    if llm.execution_instructions:
         try:
             # Add the system message
             params["messages"][0][
                 "content"
-            ] += "\nTo execute code on the user's machine, write a markdown code block. Specify the language after the ```. You will receive the output. Use any programming language."
+            ] += "\n" + llm.execution_instructions
         except:
             print('params["messages"][0]', params["messages"][0])
             raise
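Because the execution prompt now lives on `llm.execution_instructions` instead of being hard-coded here, it can be customized or emptied (the guard above is a plain truthiness check). A hedged example of both, assuming the usual `from interpreter import interpreter` entry point:

    from interpreter import interpreter

    # Swap in stricter instructions for text-only models
    interpreter.llm.execution_instructions = (
        "To execute code on the user's machine, write a markdown code block "
        "and specify the language after the ```. Only use Python."
    )

    # Or set it to an empty string so run_text_llm skips the append entirely
    interpreter.llm.execution_instructions = ""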
