openinterpreter
diff --git a/interpreter/core/computer/ai/ai.py b/interpreter/core/computer/ai/ai.py
Lines changed: 27 additions & 0 deletions
diff --git a/interpreter/core/computer/utils/html_to_png_base64.py b/interpreter/core/computer/utils/html_to_png_base64.py
Lines changed: 5 additions & 1 deletion
diff --git a/interpreter/core/llm/llm.py b/interpreter/core/llm/llm.py
Lines changed: 19 additions & 2 deletions
diff --git a/interpreter/core/respond.py b/interpreter/core/respond.py
Lines changed: 2 additions & 0 deletions
diff --git a/interpreter/terminal_interface/local_setup.py b/interpreter/terminal_interface/local_setup.py
Lines changed: 35 additions & 3 deletions
diff --git a/interpreter/terminal_interface/profiles/defaults/01.py b/interpreter/terminal_interface/profiles/defaults/01.py
Lines changed: 2 additions & 2 deletions
@@ -118,19 +118,46 @@ def __init__(self, computer):
         self.computer = computer
 
     def chat(self, text):
+        messages = [  # built from scratch: a fixed system prompt plus the caller's text
+            {
+                "role": "system",
+                "type": "message",
+                "content": "You are a helpful AI assistant.",
+            },
+            {"role": "user", "type": "message", "content": text},
+        ]
+        response = ""  # accumulates the text of the reply
+        for chunk in self.computer.interpreter.llm.run(messages):  # call the LLM directly and iterate over the chunks it produces
+            if "content" in chunk:  # ignore chunks that have no "content" key
+                response += chunk.get("content")
+        return response  # every "content" piece concatenated, in arrival order
+
+        # Old way -- NOTE(review): unreachable, the method has already returned above
         old_messages = self.computer.interpreter.llm.interpreter.messages
         old_system_message = self.computer.interpreter.llm.interpreter.system_message
+        old_import_computer_api = self.computer.import_computer_api  # saved so the finally block can restore it
+        old_execution_instructions = (
+            self.computer.interpreter.llm.execution_instructions
+        )
         try:
             self.computer.interpreter.llm.interpreter.system_message = (
                 "You are an AI assistant."
             )
             self.computer.interpreter.llm.interpreter.messages = []
+            self.computer.import_computer_api = False  # switched off only for the duration of this chat
+            self.computer.interpreter.llm.execution_instructions = ""
+
             response = self.computer.interpreter.llm.interpreter.chat(text)
         finally:
             self.computer.interpreter.llm.interpreter.messages = old_messages
             self.computer.interpreter.llm.interpreter.system_message = (
                 old_system_message
             )
+            self.computer.import_computer_api = old_import_computer_api
+            self.computer.interpreter.llm.execution_instructions = (
+                old_execution_instructions
+            )  # NOTE(review): the return that follows sits inside finally, so it would suppress any exception raised in the try block
+
             return response[-1].get("content")
 
     def query(self, text, query, custom_reduce_query=None):
 
@@ -5,12 +5,16 @@
 
 from html2image import Html2Image
 
+from ....core.utils.lazy_import import lazy_import
+
+html2image = lazy_import("html2image")  # NOTE(review): the eager `from html2image import Html2Image` just above is still a context line, so the package is still imported at module load; if lazy_import is meant to defer the import (confirm), that line should go
+
 from ....terminal_interface.utils.local_storage_path import get_storage_path
 
 
 def html_to_png_base64(code):
     # Convert the HTML into an image using html2image
-    hti = Html2Image()
+    hti = html2image.Html2Image()
 
     # Generate a random filename for the temporary image
     temp_filename = "".join(random.choices(string.digits, k=10)) + ".png"
 
@@ -1,3 +1,6 @@
+import os
+
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"  # deliberately set before `import litellm` below; presumably makes litellm use its bundled model-cost map instead of fetching one -- confirm against the litellm docs
 import litellm
 
 litellm.suppress_debug_info = True
@@ -72,6 +75,7 @@ def run(self, messages):
             model = "openai/i"
             if not hasattr(self.interpreter, "conversation_id"):  # Only do this once
                 self.context_window = 7000
+                self.api_key = "x"
                 self.max_tokens = 1000
                 self.api_base = "https://api.openinterpreter.com/v0"
                 self.interpreter.conversation_id = str(uuid.uuid4())
@@ -117,12 +121,25 @@ def run(self, messages):
         elif self.supports_vision == False and self.vision_renderer:
             for img_msg in image_messages:
                 if img_msg["format"] != "description":
+                    self.interpreter.display_message("*Viewing image...*")
+
+                    if img_msg["format"] == "path":
+                        precursor = f"The image I'm referring to ({img_msg['content']}) contains the following: "
+                        if self.interpreter.computer.import_computer_api:
+                            postcursor = f"\nIf you want to ask questions about the image, run `computer.vision.query(path='{img_msg['content']}', query='(ask any question here)')` and a vision AI will answer it."
+                        else:
+                            postcursor = ""
+                    else:
+                        precursor = "Imagine I have just shown you an image with this description: "
+                        postcursor = ""
+
                     img_msg["content"] = (
-                        "Imagine I have just shown you an image with this description: "
+                        precursor
                         + self.vision_renderer(lmc=img_msg)
-                        + "\n---\nThe image contains the following text exactly, extracted via OCR: '''\n"
+                        + "\n---\nThe image contains the following text exactly: '''\n"
                         + self.interpreter.computer.vision.ocr(lmc=img_msg)
                         + "\n'''"
+                        + postcursor
                     )
                     img_msg["format"] = "description"
 
 
@@ -1,7 +1,9 @@
 import json
+import os
 import re
 import traceback
 
+os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"  # deliberately set before the `import litellm` on the next line; presumably makes litellm use its bundled model-cost map instead of fetching one -- confirm against the litellm docs
 import litellm
 
 from ..terminal_interface.utils.display_markdown_message import display_markdown_message
 
@@ -234,9 +234,19 @@ def download_model(models_dir, models, interpreter):
                 if line.strip()
             ]  # Extract names, trim out ":latest", skip header
 
-            for model in ["llama3", "phi3", "wizardlm2"]:
+            if "llama3" in names:
+                names.remove("llama3")
+                names = ["llama3"] + names
+
+            if "codestral" in names:
+                names.remove("codestral")
+                names = ["codestral"] + names
+
+            for model in ["llama3", "phi3", "wizardlm2", "codestral"]:
                 if model not in names:
-                    names.append("→ Download " + model)
+                    names.append("↓ Download " + model)
+
+            names.append("Browse Models ↗")
 
             # Create a new inquirer selection from the names
             name_question = [
@@ -253,15 +263,37 @@ def download_model(models_dir, models, interpreter):
 
             selected_name = name_answer["name"]
 
-            if "download" in selected_name.lower():
+            if "↓ Download " in selected_name:
                 model = selected_name.split(" ")[-1]
                 interpreter.display_message(f"\nDownloading {model}...\n")
                 subprocess.run(["ollama", "pull", model], check=True)
+            elif "Browse Models ↗" in selected_name:
+                interpreter.display_message(
+                    "Opening [ollama.com/library](ollama.com/library)."
+                )
+                import webbrowser
+
+                webbrowser.open("https://ollama.com/library")
+                exit()
             else:
                 model = selected_name.strip()
 
             # Set the model to the selected model
             interpreter.llm.model = f"ollama/{model}"
+
+            # Send a ping, which will actually load the model
+            interpreter.display_message("Loading model...")
+
+            old_max_tokens = interpreter.llm.max_tokens
+            old_context_window = interpreter.llm.context_window
+            interpreter.llm.max_tokens = 1
+            interpreter.llm.context_window = 100
+
+            interpreter.computer.ai.chat("ping")
+
+            interpreter.llm.max_tokens = old_max_tokens
+            interpreter.llm.context_window = old_context_window
+
             interpreter.display_message(f"> Model set to `{model}`")
 
         # If Ollama is not installed or not recognized as a command, prompt the user to download Ollama and try again
 
@@ -5,13 +5,13 @@
 interpreter.llm.supports_vision = True
 interpreter.shrink_images = True  # Faster but less accurate
 
-interpreter.llm.model = "gpt-4-vision-preview"
+interpreter.llm.model = "gpt-4o"  # NOTE(review): context_window / max_tokens below are unchanged from the previous model -- confirm they still fit this one
 
 interpreter.llm.supports_functions = False
 interpreter.llm.context_window = 110000
 interpreter.llm.max_tokens = 4096
 interpreter.auto_run = True
-
+interpreter.computer.import_computer_api = True  # NOTE(review): presumably makes the `computer` API available to code the model runs -- confirm
 interpreter.force_task_completion = True
 interpreter.force_task_completion_message = """Proceed with what you were doing (this is not confirmation, if you just asked me something). You CAN run code on my machine. If you want to run code, start your message with "```"! If the entire task is done, say exactly 'The task is done.' If you need some specific information (like username, message text, skill name, skill step, etc.) say EXACTLY 'Please provide more information.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going. CRITICAL: REMEMBER TO FOLLOW ALL PREVIOUS INSTRUCTIONS. If I'm teaching you something, remember to run the related `computer.skills.new_skill` function."""
 interpreter.force_task_completion_breakers = [