Skip to content

Commit 8c5e504

Browse files
committed
Better skills, better web browser
1 parent 12a09d3 commit 8c5e504

File tree

12 files changed

+797
-32
lines changed

12 files changed

+797
-32
lines changed

interpreter/core/async_core.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
from .core import OpenInterpreter
1818

19+
last_start_time = 0
20+
1921
try:
2022
import janus
2123
import uvicorn
@@ -763,10 +765,9 @@ async def openai_compatible_generator():
763765

764766
@router.post("/openai/chat/completions")
765767
async def chat_completion(request: ChatCompletionRequest):
766-
# Convert to LMC
767-
768-
async_interpreter.stop_event.set()
768+
global last_start_time
769769

770+
# Convert to LMC
770771
last_message = request.messages[-1]
771772

772773
if last_message.role != "user":
@@ -776,11 +777,11 @@ async def chat_completion(request: ChatCompletionRequest):
776777
# Handle special STOP token
777778
return
778779

779-
if last_message.content == "{CONTEXT_MODE_ON}":
780+
if last_message.content in ["{CONTEXT_MODE_ON}", "{REQUIRE_START_ON}"]:
780781
async_interpreter.context_mode = True
781782
return
782783

783-
if last_message.content == "{CONTEXT_MODE_OFF}":
784+
if last_message.content in ["{CONTEXT_MODE_OFF}", "{REQUIRE_START_OFF}"]:
784785
async_interpreter.context_mode = False
785786
return
786787

@@ -826,12 +827,30 @@ async def chat_completion(request: ChatCompletionRequest):
826827
if async_interpreter.context_mode:
827828
# In context mode, we only respond if we received a {START} message
828829
# Otherwise, we're just accumulating context
829-
if last_message.content != "{START}":
830-
return
831-
if async_interpreter.messages[-1]["content"] == "{START}":
830+
if last_message.content == "{START}":
831+
if async_interpreter.messages[-1]["content"] == "{START}":
832+
# Remove that {START} message that would have just been added
833+
async_interpreter.messages = async_interpreter.messages[:-1]
834+
last_start_time = time.time()
835+
else:
836+
# Check if we're within 6 seconds of last_start_time
837+
current_time = time.time()
838+
if current_time - last_start_time <= 6:
839+
# Continue processing
840+
pass
841+
else:
842+
# More than 6 seconds have passed, so return
843+
return
844+
845+
else:
846+
if last_message.content == "{START}":
847+
# This just sometimes happens I guess
832848
# Remove that {START} message that would have just been added
833849
async_interpreter.messages = async_interpreter.messages[:-1]
850+
return
834851

852+
async_interpreter.stop_event.set()
853+
time.sleep(0.1)
835854
async_interpreter.stop_event.clear()
836855

837856
if request.stream:

interpreter/core/computer/ai/ai.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ class Ai:
117117
def __init__(self, computer):
118118
self.computer = computer
119119

120-
def chat(self, text):
120+
def chat(self, text, base64=None):
121121
messages = [
122122
{
123123
"role": "system",
@@ -126,6 +126,10 @@ def chat(self, text):
126126
},
127127
{"role": "user", "type": "message", "content": text},
128128
]
129+
if base64:
130+
messages.append(
131+
{"role": "user", "type": "image", "format": "base64", "content": base64}
132+
)
129133
response = ""
130134
for chunk in self.computer.interpreter.llm.run(messages):
131135
if "content" in chunk:

interpreter/core/computer/browser/browser.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,28 @@
1+
import time
2+
3+
import html2text
14
import requests
5+
from selenium import webdriver
6+
from selenium.webdriver.chrome.service import Service
7+
from selenium.webdriver.common.by import By
8+
from selenium.webdriver.common.keys import Keys
9+
from webdriver_manager.chrome import ChromeDriverManager
210

311

412
class Browser:
513
def __init__(self, computer):
614
self.computer = computer
15+
self._driver = None
16+
17+
@property
18+
def driver(self):
19+
if self._driver is None:
20+
self.setup()
21+
return self._driver
22+
23+
@driver.setter
24+
def driver(self, value):
25+
self._driver = value
726

827
def search(self, query):
928
"""
@@ -14,3 +33,89 @@ def search(self, query):
1433
params={"query": query},
1534
)
1635
return response.json()["result"]
36+
37+
def setup(self):
38+
self.service = Service(ChromeDriverManager().install())
39+
self.options = webdriver.ChromeOptions()
40+
self._driver = webdriver.Chrome(service=self.service, options=self.options)
41+
42+
def go_to_url(self, url):
43+
"""Navigate to a URL"""
44+
self.driver.get(url)
45+
time.sleep(3)
46+
47+
def search_google(self, query):
    """
    Run a web search for `query` in the controlled browser.

    Despite its name (kept so existing callers keep working), this opens
    https://www.perplexity.ai, triggers the modifier+K shortcut on the page,
    types the query into whichever element ends up focused, and submits it
    with RETURN. Nothing is returned; the results are left loaded in the
    browser.
    """
    import sys  # local import: only needed to choose the shortcut modifier

    self.driver.get("https://www.perplexity.ai")

    # Keys.COMMAND is the macOS modifier. On other platforms fall back to
    # Control. NOTE(review): presumably the site binds Ctrl+K there; confirm.
    modifier = Keys.COMMAND if sys.platform == "darwin" else Keys.CONTROL

    body = self.driver.find_element(By.TAG_NAME, "body")
    body.send_keys(modifier + "k")
    time.sleep(0.5)  # let the page move focus before typing

    active_element = self.driver.switch_to.active_element
    active_element.send_keys(query)
    active_element.send_keys(Keys.RETURN)
    time.sleep(5)  # fixed wait for the results page to render
60+
61+
def analyze_page(self, intent):
    """
    Describe the current page, with respect to `intent`, using the LLM.

    The page source is converted to text with html2text, every <a>, <button>,
    <input> and <select> element is listed with an index, its text and its
    outer HTML, and both are sent to the model together with `intent`.
    The model's answer and a follow-up instruction are printed; nothing is
    returned.

    The interpreter's LLM model is switched to "gpt-4o-mini" for this one
    call and restored afterwards, even if the call raises.
    """
    html_content = self.driver.page_source
    text_content = html2text.html2text(html_content)

    elements = (
        self.driver.find_elements(By.TAG_NAME, "a")
        + self.driver.find_elements(By.TAG_NAME, "button")
        + self.driver.find_elements(By.TAG_NAME, "input")
        + self.driver.find_elements(By.TAG_NAME, "select")
    )

    elements_info = [
        {
            "id": idx,
            "text": elem.text,
            "attributes": elem.get_attribute("outerHTML"),
        }
        for idx, elem in enumerate(elements)
    ]

    ai_query = f"""
Below is the content of the current webpage along with interactive elements.
Given the intent "{intent}", please extract useful information and provide sufficient details
about interactive elements, focusing especially on those pertinent to the provided intent.

If the information requested by the intent "{intent}" is present on the page, simply return that.

If not, return the top 10 most relevant interactive elements in a concise, actionable format, listing them on separate lines
with their ID, a description, and their possible action.

Do not hallucinate.

Page Content:
{text_content}

Interactive Elements:
{elements_info}
"""

    llm = self.computer.interpreter.llm
    old_model = llm.model
    llm.model = "gpt-4o-mini"
    try:
        response = self.computer.ai.chat(ai_query)
    finally:
        # Restore the caller's model even when the chat call fails, so one
        # failed page analysis does not leave the interpreter on the wrong model.
        llm.model = old_model

    print(response)
    print(
        "Please now utilize this information or interact with the interactive elements provided to answer the user's query."
    )
118+
119+
def quit(self):
    """
    Close the browser if one was started.

    Reads `self._driver` directly rather than the lazy `driver` property, so
    calling this before any browser exists does not launch one just to close
    it. The stored driver is cleared afterwards, so the next access through
    `driver` starts a fresh session instead of reusing a dead one.
    """
    driver = self._driver
    if driver is None:
        return
    try:
        driver.quit()
    finally:
        # Drop the reference even if quitting fails; the session is unusable either way.
        self._driver = None

interpreter/core/computer/display/display.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,23 @@ def info(self):
7474
"""
7575
return get_displays()
7676

77-
def view(self, show=True, quadrant=None, screen=0, combine_screens=True):
77+
def view(
78+
self,
79+
show=True,
80+
quadrant=None,
81+
screen=0,
82+
combine_screens=True,
83+
active_app_only=True,
84+
):
7885
"""
7986
Redirects to self.screenshot
8087
"""
8188
return self.screenshot(
82-
screen=screen, show=show, quadrant=quadrant, combine_screens=combine_screens
89+
screen=screen,
90+
show=show,
91+
quadrant=quadrant,
92+
combine_screens=combine_screens,
93+
active_app_only=active_app_only,
8394
)
8495

8596
# def get_active_window(self):
@@ -149,7 +160,7 @@ def screenshot(
149160
screen=screen, combine_screens=combine_screens
150161
) # this function uses pyautogui.screenshot which works fine for all OS (mac, linux and windows)
151162
message = format_to_recipient(
152-
"Taking a screenshot of the entire screen.\n\nTo focus on the active app, use computer.view(active_app_only=True).",
163+
"Taking a screenshot of the entire screen.\n\nTo focus on the active app, use computer.display.view(active_app_only=True).",
153164
"assistant",
154165
)
155166
print(message)

interpreter/core/computer/keyboard/keyboard.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,14 @@ def press(self, *args, presses=1, interval=0.1):
6868
pyautogui.press(keys, presses=presses, interval=interval)
6969
time.sleep(0.15)
7070

71+
def press_and_release(self, *args, presses=1, interval=0.1):
72+
"""
73+
Press and release a key or a sequence of keys.
74+
75+
This method is a perfect proxy for the press method.
76+
"""
77+
return self.press(*args, presses=presses, interval=interval)
78+
7179
def hotkey(self, *args, interval=0.1):
7280
"""
7381
Press a sequence of keys in the order they are provided, and then release them in reverse order.

interpreter/core/computer/skills/skills.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,14 @@ def run(self, skill):
4040
)
4141

4242
def search(self, query):
    """
    List every skill in the skills directory as a call expression.

    `query` is currently ignored: each `*.py` file in `self.path` is returned
    as `"<name>()"`, in the order `os.listdir` yields them.
    """
    suffix = ".py"
    return [
        # Strip only the trailing extension; str.replace would also rewrite
        # any ".py" that appears earlier in the file name.
        file[: -len(suffix)] + "()"
        for file in os.listdir(self.path)
        if file.endswith(suffix)
    ]
4451

4552
def import_skills(self):
4653
previous_save_skills_setting = self.computer.save_skills

interpreter/core/computer/terminal/languages/jupyter_language.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -92,21 +92,21 @@ def run(self, code):
9292
### OFFICIAL OPEN INTERPRETER GOVERNMENT ISSUE SKILL LIBRARY ###
9393
################################################################
9494

95-
try:
96-
functions = string_to_python(code)
97-
except:
98-
# Non blocking
99-
functions = {}
95+
# try:
96+
# functions = string_to_python(code)
97+
# except:
98+
# # Non blocking
99+
# functions = {}
100100

101-
if self.computer.save_skills and functions:
102-
skill_library_path = self.computer.skills.path
101+
# if self.computer.save_skills and functions:
102+
# skill_library_path = self.computer.skills.path
103103

104-
if not os.path.exists(skill_library_path):
105-
os.makedirs(skill_library_path)
104+
# if not os.path.exists(skill_library_path):
105+
# os.makedirs(skill_library_path)
106106

107-
for filename, function_code in functions.items():
108-
with open(f"{skill_library_path}/{filename}.py", "w") as file:
109-
file.write(function_code)
107+
# for filename, function_code in functions.items():
108+
# with open(f"{skill_library_path}/{filename}.py", "w") as file:
109+
# file.write(function_code)
110110

111111
self.finish_flag = False
112112
try:

interpreter/core/respond.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -308,12 +308,14 @@ def respond(interpreter):
308308
code = re.sub(r"import computer\.\w+\n", "pass\n", code)
309309
# If it does this it sees the screenshot twice (which is expected jupyter behavior)
310310
if any(
311-
code.strip().split("\n")[-1].startswith(text)
312-
for text in [
313-
"computer.display.view",
314-
"computer.display.screenshot",
315-
"computer.view",
316-
"computer.screenshot",
311+
[
312+
code.strip().split("\n")[-1].startswith(text)
313+
for text in [
314+
"computer.display.view",
315+
"computer.display.screenshot",
316+
"computer.view",
317+
"computer.screenshot",
318+
]
317319
]
318320
):
319321
code = code + "\npass"

0 commit comments

Comments
 (0)