Merge pull request #1318 from MikeBirdTech/update-vision

KillianLucas · web-flow · commit 743170919194 · 2024-06-22T20:32:43.000-07:00
Update vision model to gpt-4o
diff --git a/docs/guides/profiles.mdx b/docs/guides/profiles.mdx
@@ -18,9 +18,9 @@ from interpreter import interpreter
 interpreter.os = True
 interpreter.llm.supports_vision = True
 
-interpreter.llm.model = "gpt-4-vision-preview"
+interpreter.llm.model = "gpt-4o"
 
-interpreter.llm.supports_functions = False
+interpreter.llm.supports_functions = True
 interpreter.llm.context_window = 110000
 interpreter.llm.max_tokens = 4096
 interpreter.auto_run = True
diff --git a/docs/settings/all-settings.mdx b/docs/settings/all-settings.mdx
@@ -280,17 +280,17 @@ llm:
 
 ### Vision Mode
 
-Enables vision mode, which adds some special instructions to the prompt and switches to `gpt-4-vision-preview`.
+Enables vision mode, which adds some special instructions to the prompt and switches to `gpt-4o`.
 
 <CodeGroup>
 ```bash Terminal
 interpreter --vision
 ```
 
 ```python Python
-interpreter.llm.model = "gpt-4-vision-preview" # Any vision supporting model
+interpreter.llm.model = "gpt-4o" # Any vision-supporting model
 interpreter.llm.supports_vision = True
-interpreter.llm.supports_functions = False # If model doesn't support functions, which is the case with gpt-4-vision.
+interpreter.llm.supports_functions = True
 
 interpreter.custom_instructions = """The user will show you an image of the code you write. You can view images directly.
 For HTML: This will be run STATELESSLY. You may NEVER write '<!-- previous code here... --!>' or `<!-- header will go here -->` or anything like that. It is CRITICAL TO NEVER WRITE PLACEHOLDERS. Placeholders will BREAK it. You must write the FULL HTML CODE EVERY TIME. Therefore you cannot write HTML piecemeal—write all the HTML, CSS, and possibly Javascript **in one step, in one code block**. The user will help you review it visually.
@@ -302,10 +302,10 @@ If you use `plt.show()`, the resulting image will be sent to you. However, if yo
 loop: True
 
 llm:
-  model: "gpt-4-vision-preview"
+  model: "gpt-4o"
   temperature: 0
   supports_vision: True
-  supports_functions: False
+  supports_functions: True
   context_window: 110000
   max_tokens: 4096
   custom_instructions: >
diff --git a/docs/usage/terminal/vision.mdx b/docs/usage/terminal/vision.mdx
@@ -8,4 +8,4 @@ To use vision (highly experimental), run the following command:
 interpreter --vision
 ```
 
-If a file path to an image is found in your input, it will be loaded into the vision model (`gpt-4-vision-preview` for now).
+If a file path to an image is found in your input, it will be loaded into the vision model (`gpt-4o` for now).
diff --git a/interpreter/terminal_interface/profiles/defaults/os.py b/interpreter/terminal_interface/profiles/defaults/os.py
@@ -6,11 +6,11 @@
 interpreter.llm.supports_vision = True
 # interpreter.shrink_images = True # Faster but less accurate
 
-interpreter.llm.model = "gpt-4-vision-preview"
+interpreter.llm.model = "gpt-4o"
 
 interpreter.computer.import_computer_api = True
 
-interpreter.llm.supports_functions = False
+interpreter.llm.supports_functions = True
 interpreter.llm.context_window = 110000
 interpreter.llm.max_tokens = 4096
 interpreter.auto_run = True
diff --git a/interpreter/terminal_interface/profiles/defaults/vision.yaml b/interpreter/terminal_interface/profiles/defaults/vision.yaml
@@ -3,10 +3,10 @@
 loop: True
 
 llm:
-  model: "gpt-4-vision-preview"
+  model: "gpt-4o"
   temperature: 0
   supports_vision: True
-  supports_functions: False
+  supports_functions: True
   context_window: 110000
   max_tokens: 4096
   custom_instructions: >
diff --git a/tests/test_interpreter.py b/tests/test_interpreter.py
@@ -662,9 +662,9 @@ def test_vision():
     ]
 
     interpreter.llm.supports_vision = True
-    interpreter.llm.model = "gpt-4-vision-preview"
+    interpreter.llm.model = "gpt-4o"
     interpreter.system_message += "\nThe user will show you an image of the code you write. You can view images directly.\n\nFor HTML: This will be run STATELESSLY. You may NEVER write '<!-- previous code here... --!>' or `<!-- header will go here -->` or anything like that. It is CRITICAL TO NEVER WRITE PLACEHOLDERS. Placeholders will BREAK it. You must write the FULL HTML CODE EVERY TIME. Therefore you cannot write HTML piecemeal—write all the HTML, CSS, and possibly Javascript **in one step, in one code block**. The user will help you review it visually.\nIf the user submits a filepath, you will also see the image. The filepath and user image will both be in the user's message.\n\nIf you use `plt.show()`, the resulting image will be sent to you. However, if you use `PIL.Image.show()`, the resulting image will NOT be sent to you."
-    interpreter.llm.supports_functions = False
+    interpreter.llm.supports_functions = True
     interpreter.llm.context_window = 110000
     interpreter.llm.max_tokens = 4096
     interpreter.loop = True