Skip to content

Commit a4a3faf

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents ea93bef + 396fea6 commit a4a3faf

22 files changed

+3607
-2805
lines changed

interpreter/core/archived_server.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
import asyncio
2+
import json
3+
from typing import Generator
4+
5+
from .utils.lazy_import import lazy_import
6+
7+
uvicorn = lazy_import("uvicorn")
8+
fastapi = lazy_import("fastapi")
9+
10+
11+
def server(interpreter, host="0.0.0.0", port=8000):
    """Serve `interpreter` over HTTP and WebSocket.

    Endpoints:
      * POST /chat — streams `interpreter.chat()` output as server-sent events.
      * GET  /test — a minimal HTML page for exercising the WebSocket endpoint.
      * WS   /     — an `i.protocol`-compatible streaming chat endpoint.

    Blocks until the uvicorn server exits. `host`/`port` control where
    uvicorn binds.
    """
    FastAPI, Request, Response, WebSocket = (
        fastapi.FastAPI,
        fastapi.Request,
        fastapi.Response,
        fastapi.WebSocket,
    )
    PlainTextResponse = fastapi.responses.PlainTextResponse
    # Bug fix: a plain Response cannot stream a generator body;
    # StreamingResponse consumes it incrementally.
    StreamingResponse = fastapi.responses.StreamingResponse

    app = FastAPI()

    @app.post("/chat")
    async def stream_endpoint(request: Request) -> Response:
        async def event_stream() -> Generator[str, None, None]:
            data = await request.json()
            for response in interpreter.chat(message=data["message"], stream=True):
                yield response

        # Previously returned Response(event_stream(), ...), which does not
        # stream generator output to the client.
        return StreamingResponse(event_stream(), media_type="text/event-stream")

    @app.get("/test")
    async def test_ui():
        # The static page assumed port 8000; rewrite the WebSocket URL to use
        # whichever port this server was actually started on.
        html = """
        <!DOCTYPE html>
        <html>
        <head>
            <title>Chat</title>
        </head>
        <body>
            <form action="" onsubmit="sendMessage(event)">
                <textarea id="messageInput" rows="10" cols="50" autocomplete="off"></textarea>
                <button>Send</button>
            </form>
            <div id="messages"></div>
            <script>
                var ws = new WebSocket("ws://localhost:8000/");
                var lastMessageElement = null;
                ws.onmessage = function(event) {
                    if (lastMessageElement == null) {
                        lastMessageElement = document.createElement('p');
                        document.getElementById('messages').appendChild(lastMessageElement);
                    }
                    lastMessageElement.innerHTML += event.data;
                };
                function sendMessage(event) {
                    event.preventDefault();
                    var input = document.getElementById("messageInput");
                    var message = input.value;
                    if (message.startsWith('{') && message.endsWith('}')) {
                        message = JSON.stringify(JSON.parse(message));
                    }
                    ws.send(message);
                    var userMessageElement = document.createElement('p');
                    userMessageElement.innerHTML = '<b>' + input.value + '</b><br>';
                    document.getElementById('messages').appendChild(userMessageElement);
                    lastMessageElement = document.createElement('p');
                    document.getElementById('messages').appendChild(lastMessageElement);
                    input.value = '';
                }
            </script>
        </body>
        </html>
        """.replace("localhost:8000", f"localhost:{port}")
        return PlainTextResponse(html, media_type="text/html")

    @app.websocket("/")
    async def i_test(websocket: WebSocket):
        await websocket.accept()
        while True:
            data = await websocket.receive_text()
            while data.strip().lower() != "stop":  # Stop command
                # Listen for the *next* message concurrently so a new message
                # can interrupt an in-progress response.
                task = asyncio.create_task(websocket.receive_text())

                # This would be terrible for production. Just for testing.
                try:
                    # If the text is a well-formed LMC message dict, pass the
                    # parsed dict to the interpreter instead of the raw string.
                    data_dict = json.loads(data)
                    if set(data_dict.keys()) == {"role", "content", "type"} or set(
                        data_dict.keys()
                    ) == {"role", "content", "type", "format"}:
                        data = data_dict
                except json.JSONDecodeError:
                    pass

                for response in interpreter.chat(
                    message=data, stream=True, display=False
                ):
                    if task.done():
                        data = task.result()  # Get the new message
                        break  # Break the loop and start processing the new message
                    # Send out assistant message chunks
                    if (
                        response.get("type") == "message"
                        and response["role"] == "assistant"
                        and "content" in response
                    ):
                        await websocket.send_text(response["content"])
                        await asyncio.sleep(0.01)  # Add a small delay
                    if (
                        response.get("type") == "message"
                        and response["role"] == "assistant"
                        and response.get("end") == True
                    ):
                        await websocket.send_text("\n")
                        await asyncio.sleep(0.01)  # Add a small delay
                if not task.done():
                    data = (
                        await task
                    )  # Wait for the next message if it hasn't arrived yet

    # Bug fix: these announcements previously hard-coded port 8000 even when a
    # different `port` was passed in.
    print(
        f"\nOpening a simple `interpreter.chat(data)` POST endpoint at http://localhost:{port}/chat."
    )
    print(
        f"Opening an `i.protocol` compatible WebSocket endpoint at http://localhost:{port}/."
    )
    print(f"\nVisit http://localhost:{port}/test to test the WebSocket endpoint.\n")

    import socket

    hostname = socket.gethostname()
    local_ip = socket.gethostbyname(hostname)
    local_url = f"http://{local_ip}:{port}"
    print(f"Local URL: {local_url}\n")

    uvicorn.run(app, host=host, port=port)

interpreter/core/computer/ai/ai.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,22 @@ class Ai:
117117
def __init__(self, computer):
118118
self.computer = computer
119119

120+
def chat(self, text):
    """Ask the underlying LLM a one-off question and return its reply text.

    Temporarily clears the shared interpreter's conversation history and
    swaps in a generic system message so the query runs in isolation; the
    original state is restored afterward even if the chat call raises.
    """
    core = self.computer.interpreter.llm.interpreter
    saved_messages = core.messages
    saved_system_message = core.system_message
    try:
        core.system_message = "You are an AI assistant."
        core.messages = []
        response = core.chat(text)
    finally:
        # Restore the caller's conversation state unconditionally.
        core.messages = saved_messages
        core.system_message = saved_system_message
    return response[-1].get("content")
135+
120136
def query(self, text, query, custom_reduce_query=None):
121137
if custom_reduce_query == None:
122138
custom_reduce_query = query

interpreter/core/computer/display/display.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -94,26 +94,31 @@ def screenshot(
9494
:param screen: specify which display; 0 for primary and 1 and above for secondary.
9595
:param combine_screens: If True, a collage of all display screens will be returned. Otherwise, a list of display screens will be returned.
9696
"""
97-
if not self.computer.emit_images and force_image == False:
98-
screenshot = self.screenshot(show=False, force_image=True)
9997

100-
description = self.computer.vision.query(pil_image=screenshot)
101-
print("A DESCRIPTION OF WHAT'S ON THE SCREEN: " + description)
102-
103-
if self.computer.max_output > 600:
104-
print("ALL OF THE TEXT ON THE SCREEN: ")
105-
text = self.get_text_as_list_of_lists(screenshot=screenshot)
106-
pp = pprint.PrettyPrinter(indent=4)
107-
pretty_text = pp.pformat(text) # language models like it pretty!
108-
pretty_text = format_to_recipient(pretty_text, "assistant")
109-
print(pretty_text)
110-
print(
111-
format_to_recipient(
112-
"To receive the text above as a Python object, run computer.display.get_text_as_list_of_lists()",
113-
"assistant",
114-
)
115-
)
116-
return screenshot # Still return a PIL image
98+
# Since Local II, all images sent to local models will be rendered to text with moondream and pytesseract.
99+
# So we don't need to do this here— we can just emit images.
100+
# We should probably remove self.computer.emit_images for this reason.
101+
102+
# if not self.computer.emit_images and force_image == False:
103+
# screenshot = self.screenshot(show=False, force_image=True)
104+
105+
# description = self.computer.vision.query(pil_image=screenshot)
106+
# print("A DESCRIPTION OF WHAT'S ON THE SCREEN: " + description)
107+
108+
# if self.computer.max_output > 600:
109+
# print("ALL OF THE TEXT ON THE SCREEN: ")
110+
# text = self.get_text_as_list_of_lists(screenshot=screenshot)
111+
# pp = pprint.PrettyPrinter(indent=4)
112+
# pretty_text = pp.pformat(text) # language models like it pretty!
113+
# pretty_text = format_to_recipient(pretty_text, "assistant")
114+
# print(pretty_text)
115+
# print(
116+
# format_to_recipient(
117+
# "To recieve the text above as a Python object, run computer.display.get_text_as_list_of_lists()",
118+
# "assistant",
119+
# )
120+
# )
121+
# return screenshot # Still return a PIL image
117122

118123
if quadrant == None:
119124
if active_app_only:

interpreter/core/computer/utils/computer_vision.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,7 @@
1111

1212

1313
def pytesseract_get_text(img):
    """Run Tesseract OCR on *img* and return the extracted text."""
    extracted_text = pytesseract.image_to_string(img)
    return extracted_text
2415

2516

2617
def pytesseract_get_text_bounding_boxes(img):

interpreter/core/computer/vision/vision.py

Lines changed: 83 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
import base64
2+
import contextlib
23
import io
4+
import os
5+
import tempfile
36

47
from PIL import Image
58

69
from ...utils.lazy_import import lazy_import
10+
from ..utils.computer_vision import pytesseract_get_text
711

812
# transformers = lazy_import("transformers") # Doesn't work for some reason! We import it later.
913

@@ -17,21 +21,79 @@ def __init__(self, computer):
1721
def load(self):
    """Load the Moondream vision model and its tokenizer.

    One-time setup; populates `self.model` and `self.tokenizer`. Raises
    ImportError when `transformers` is not installed (callers catch this).
    """
    import transformers  # Wait until we use it. Transformers can't be lazy loaded for some reason!

    # Silence the tokenizers fork-parallelism warning noise.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    if self.computer.debug:
        print(
            "Open Interpreter will use Moondream (tiny vision model) to describe images to the language model. Set `interpreter.llm.vision_renderer = None` to disable this behavior."
        )
        print(
            # Fix: typo "Alternativley" in the user-facing message.
            "Alternatively, you can use a vision-supporting LLM and set `interpreter.llm.supports_vision = True`."
        )
        # Fix: this progress print was unconditional (leftover debug output);
        # it now only appears in debug mode.
        print("loading model")

    model_id = "vikhyatk/moondream2"
    revision = "2024-04-02"  # pinned so behavior is reproducible

    self.model = transformers.AutoModelForCausalLM.from_pretrained(
        model_id, trust_remote_code=True, revision=revision
    )
    self.tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_id, revision=revision
    )
3443

44+
def ocr(
    self,
    base_64=None,
    path=None,
    lmc=None,
    pil_image=None,
):
    """Get OCR text from an image.

    Exactly one image source should be supplied:
      base_64   — base64-encoded PNG data.
      path      — path to an image file on disk.
      lmc       — an LMC message dict with "format" and "content" keys
                  (format containing "base64", or equal to "path").
      pil_image — a PIL Image object.

    Returns the text extracted by pytesseract.
    """
    temp_file_path = None  # track any temp file we create so it gets cleaned up

    def _write_temp_png(payload):
        # Persist raw PNG bytes to a temp file; cleanup happens in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            temp_file.write(payload)
            return temp_file.name

    try:
        if lmc:
            if "base64" in lmc["format"]:
                # Decode the base64 content into a temporary PNG on disk.
                temp_file_path = _write_temp_png(base64.b64decode(lmc["content"]))
                path = temp_file_path
            elif lmc["format"] == "path":
                path = lmc["content"]
        elif base_64:
            temp_file_path = _write_temp_png(base64.b64decode(base_64))
            path = temp_file_path
        elif path:
            pass
        elif pil_image:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
                pil_image.save(temp_file, format="PNG")
                temp_file_path = temp_file.name
            path = temp_file_path

        return pytesseract_get_text(path)
    finally:
        # Bug fix: temp files were created with delete=False and never
        # removed, leaking one file per OCR call.
        if temp_file_path is not None:
            try:
                os.remove(temp_file_path)
            except OSError:
                pass
96+
3597
def query(
3698
self,
3799
query="Describe this image.",
@@ -45,7 +107,16 @@ def query(
45107
"""
46108

47109
if self.model == None and self.tokenizer == None:
48-
self.load()
110+
try:
111+
with contextlib.redirect_stdout(
112+
open(os.devnull, "w")
113+
), contextlib.redirect_stderr(open(os.devnull, "w")):
114+
self.load()
115+
except ImportError:
116+
self.computer.interpreter.display_message(
117+
"\nTo use local vision, run `pip install 'open-interpreter[local]'`.\n"
118+
)
119+
return ""
49120

50121
if lmc:
51122
if "base64" in lmc["format"]:
@@ -71,5 +142,8 @@ def query(
71142
elif pil_image:
72143
img = pil_image
73144

74-
enc_image = self.model.encode_image(img)
75-
return self.model.answer_question(enc_image, query, self.tokenizer)
145+
with contextlib.redirect_stdout(open(os.devnull, "w")):
146+
enc_image = self.model.encode_image(img)
147+
answer = self.model.answer_question(enc_image, query, self.tokenizer)
148+
149+
return answer

0 commit comments

Comments
 (0)