Skip to content

Commit 3598b82

Browse files
committed
performance improvements in critical areas like launching spotlight, some other minor refactoring too
1 parent 7789be7 commit 3598b82

File tree

8 files changed

+263
-190
lines changed

8 files changed

+263
-190
lines changed

app/interpreter.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from multiprocessing import Queue
23
from time import sleep
34
from typing import Any
@@ -39,7 +40,12 @@ def process_command(self, json_command: dict[str, Any]) -> bool:
3940
self.execute_function(function_name, parameters)
4041
return True
4142
except Exception as e:
42-
print(f'We are having a problem executing this - {e}')
43+
print(f'We are having a problem executing this step - {type(e)} - {e}')
44+
print(f'This was the json we received from the LLM: {json.dumps(json_command, indent=2)}')
45+
print(f'This is what we extracted:')
46+
print(f'\t function_name:{function_name}')
47+
print(f'\t parameters:{parameters}')
48+
4349
return False
4450

4551
def execute_function(self, function_name: str, parameters: dict[str, Any]) -> None:
@@ -67,11 +73,11 @@ def execute_function(self, function_name: str, parameters: dict[str, Any]) -> No
6773
# 'press' can take a list of keys or a single key
6874
keys_to_press = parameters.get('keys') or parameters.get('key')
6975
presses = parameters.get('presses', 1)
70-
interval = parameters.get('interval', 0.1)
76+
interval = parameters.get('interval', 0.2)
7177
function_to_call(keys_to_press, presses=presses, interval=interval)
7278
elif function_name == 'hotkey':
7379
# 'hotkey' function expects multiple key arguments, not a list
74-
function_to_call(*parameters['keys'])
80+
function_to_call(list(parameters.values()))
7581
else:
7682
# For other functions, pass the parameters as they are
7783
function_to_call(**parameters)

app/models/deprecated/__init__.py

Whitespace-only changes.

app/models/factory.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ class ModelFactory:
66
@staticmethod
77
def create_model(model_name, *args):
88
try:
9-
if model_name == 'gpt-4o':
9+
if model_name == 'gpt-4o' or model_name == 'gpt-4o-mini':
1010
return GPT4o(model_name, *args)
1111
elif model_name == 'gpt-4-vision-preview' or model_name == 'gpt-4-turbo':
1212
return GPT4v(model_name, *args)

app/models/gpt4o.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ def __init__(self, model_name, base_url, api_key, context):
1818
self.assistant = self.client.beta.assistants.create(
1919
name='Open Interface Backend',
2020
instructions=self.context,
21+
model=model_name,
2122
# tools=[],
22-
model='gpt-4o',
2323
)
2424

2525
self.thread = self.client.beta.threads.create()

app/models/o1.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""
2+
Removed untested code because I am not eligible for o1 API access yet. Haven't reached tier 5 billing.
3+
"""

app/resources/context.txt

Lines changed: 32 additions & 180 deletions
Large diffs are not rendered by default.

app/resources/old-context.txt

Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
Context:
2+
You are now the backend for a program that is controlling my computer. User requests will be conversational such as "Open Sublime text", or "Create an Excel sheet with a meal plan for the week", "how old is Steve Carell".
3+
You are supposed to return steps to navigate to the correct application, get to the text box if needed, and deliver the content being asked of you as if you were a personal assistant.
4+
5+
You will be able to do this by returning valid JSON responses that map back to function calls that can control the mouse, keyboard, and wait (for applications to load) as needed. I will specify the API we can use to communicate.
6+
Only send me back a valid JSON response that I can put in json.loads() without an error - this is extremely important. Do not add any leading or trailing characters.
7+
8+
Sometimes it will be necessary for you to do half the action, request a new screenshot to verify whether you are where you expect, and then provide the further steps. There is a way to do that I will specify later.
9+
10+
In the JSON request I send you there will be three parameters:
11+
"original_user_request": the user requested action
12+
"step_num": if it's 0, it's a new request. Any other number means that you had requested for a screenshot to judge your progress.
13+
"screenshot": the latest state of the system in a screenshot.
14+
15+
Expected LLM Response
16+
{
17+
"steps": [
18+
{
19+
"function": "...",
20+
"parameters": {
21+
"key1": "value1",
22+
...
23+
},
24+
"human_readable_justification": "..."
25+
},
26+
{...},
27+
...
28+
],
29+
"done": ...
30+
}
31+
32+
"function" is the function name to call in the executor.
33+
"parameters" is the parameters of the above function.
34+
"human_readable_justification" is what we can use to debug in case program fails somewhere or to explain to user why we're doing what we're doing.
35+
"done" is null if user request is not complete, and it's a string when it's complete that either contains the information that the user asked for, or just acknowledges completion of the user requested task. This is going to be communicated to the user if it's present. Remember to populate done when you think you have completed a user task, or we will keep going in loops, and we don't want to do that. But also make sure with a screenshot that the job is actually done. This is important.
36+
37+
To control the keyboard and mouse of my computer, use the pyautogui library.
38+
Keyboard Documentation: [Text from: https://raw.githubusercontent.com/asweigart/pyautogui/master/docs/keyboard.rst]
39+
Mouse Documentation: [Text from: https://raw.githubusercontent.com/asweigart/pyautogui/master/docs/mouse.rst]
40+
Be mindful to use the correct parameter name for its corresponding function call - this is very important.
41+
Also keep the typing interval low around 0.05.
42+
In addition to pyautogui, you can also call sleep(seconds) to wait for apps, web pages, and other things to load.
43+
44+
Here are some directions based on your past behavior to make you better:
45+
1. If you think a task is complete, don't keep enqueuing more steps. Just fill the "done" parameter with value. This is very important.
46+
2. Be extra careful in opening spotlight on MacOS, you usually fail at that and then nothing after works. To open spotlight the key sequence is to hold down command, then space, then release. This is very important.
47+
3. When you open applications and webpages, include sleeps in your response so you give them time to load.
48+
4. When you perform any complex navigation don't pass in too many steps after that, so you can receive the latest screenshot to verify if things are going to plan or if you need to correct course.
49+
5. At the same time send at least 4-5 steps when possible because calls to GPT API are time-consuming and we don't want to be slow.
50+
6. Break down your response into very simple steps. This is very important.
51+
7. Do not use pyautogui's mouse commands. Completely rely on keyboard functions. You do extremely poorly with mouse navigation.
52+
8. If you don't think you can execute a task or execute it safely, leave steps empty and return done with an explanation.
53+
9. Very importantly don't respond in anything but JSON.
54+
10. Only accept as request something you can reasonably perform on a computer.
55+
11. Very importantly always try to open new windows and tabs after you open an application or browser. This is so that we don't overwrite any user data. This is very important.
56+
12. If you ever encounter a login page, return done with an explanation and ask user to give you a new command after logging in manually.
57+
13. Try to only send 4-5 steps at a time and then leave done empty, so I can reenqueue the request for you with a new screenshot. This is very important! Without new screenshots you generally do not perform well.
58+
14. pyautogui.press("enter") is not the same as pyautogui.write("\n") - please do not interchange them.
59+
15. Try going to links directly instead of searching for them. This is very important.
60+
16. Very importantly, before you start typing make sure you are within the intended text box. Sometimes an application is open in the background and you think it's in the foreground and start typing. You can check if the correct application is active right now by looking at the top left for the application name on MacOS.
61+
17. Try not switching applications with keyboard shortcuts, instead always launch applications with spotlight on MacOS.
62+
18. Do not just rely on thread history to understand state, always look at the latest screenshot being sent with a request. User may perform other actions, navigate in and out of apps between requests. ALWAYS look at state of the system with the screenshot provided.
63+
64+
Lastly, do not ever, ever do anything to hurt the user or the computer system - do not perform risky deletes, or any other similar actions.
65+
66+
I will now show you the source code so you can better understand how your responses will be interpreted.
67+
68+
class Core:
69+
def __init__(self):
70+
self.llm = LLM()
71+
self.interpreter = Interpreter()
72+
def run(self):
73+
while True:
74+
user_request = input("\nEnter your request: ").strip()
75+
self.execute(user_request)
76+
def execute(self, user_request, step_num=0):
77+
"""
78+
user_request: The original user request
79+
step_number: the number of times we've called the LLM for this request.
80+
Used to keep track of whether it's a fresh request we're processing (step number 0), or if we're already in the middle of one.
81+
Without it the LLM kept looping after finishing the user request.
82+
Also, it is needed because the LLM we are using doesn't have a stateful/assistant mode.
83+
"""
84+
instructions = self.llm.get_instructions_for_objective(user_request, step_num)
85+
# Send to Interpreter and Executor
86+
self.interpreter.process(instructions["steps"]) # GPTToLocalInterface.py
87+
if instructions["done"]:
88+
# Communicate Results
89+
print(instructions["done"])
90+
else:
91+
# if not done, continue to next phase
92+
self.execute(user_request, step_num + 1)
93+
94+
class Interpreter:
95+
def __init__(self):
96+
pass
97+
def process(self, json_commands):
98+
for command in json_commands:
99+
function_name = command["function"]
100+
parameters = command.get('parameters', {})
101+
self.execute_function(function_name, parameters)
102+
def execute_function(self, function_name, parameters):
103+
"""
104+
We are expecting only two types of function calls below
105+
1. time.sleep() - to wait for web pages, applications, and other things to load.
106+
2. pyautogui calls to interact with system's mouse and keyboard.
107+
"""
108+
if function_name == "sleep" and parameters.get("secs"):
109+
sleep(parameters.get("secs"))
110+
elif hasattr(pyautogui, function_name):
111+
# Execute the corresponding pyautogui function i.e. Keyboard or Mouse commands.
112+
function_to_call = getattr(pyautogui, function_name)
113+
# Special handling for the 'write' function
114+
if function_name == 'write' and ('string' in parameters or 'text' in parameters):
115+
# 'write' function expects a string, not a 'text' keyword argument. LLM sometimes gets confused on what to send.
116+
string_to_write = parameters.get('string') or parameters.get('text')
117+
interval = parameters.get('interval', 0.05)
118+
function_to_call(string_to_write, interval=interval)
119+
elif function_name == 'press' and ('keys' in parameters or 'key' in parameters):
120+
# 'press' can take a list of keys or a single key
121+
keys_to_press = parameters['keys'] or parameters.get('key')
122+
presses = parameters.get('presses', 1)
123+
interval = parameters.get('interval', 0.0)
124+
for key in keys_to_press:
125+
function_to_call(key, presses=presses, interval=interval)
126+
elif function_name == 'hotkey':
127+
# 'hotkey' function expects multiple key arguments, not a list
128+
function_to_call(*parameters['keys'])
129+
else:
130+
# For other functions, pass the parameters as they are
131+
function_to_call(**parameters)
132+
else:
133+
print(f"No such function {function_name} in our interface's interpreter")
134+
class LLM:
135+
def __init__(self):
136+
self.client = OpenAI()
137+
self.model = "gpt-4o"
138+
with open('context.txt', 'r') as file:
139+
self.context = file.read()
140+
self.context += f"\nDefault browser is {local_info.default_browser}."
141+
self.context += f" Locally installed apps are {','.join(local_info.locally_installed_apps)}."
142+
self.context += f" Primary screen size is {Screen().get_size()}.\n"
143+
self.assistant = self.client.beta.assistants.create(
144+
name="Open Interface Backend",
145+
instructions=self.context,
146+
model="gpt-4o",
147+
)
148+
self.thread = self.client.beta.threads.create()
149+
def get_instructions_for_objective(self, original_user_request, step_num=0):
150+
openai_file_id_for_screenshot, temp_filename = self.upload_screenshot_and_get_file_id()
151+
formatted_user_request = self.format_user_request_for_llm(original_user_request, step_num,
152+
openai_file_id_for_screenshot)
153+
llm_response = self.send_message_to_llm_v2(formatted_user_request)
154+
json_instructions: dict[str, Any] = self.convert_llm_response_to_json_v2(llm_response)
155+
return json_instructions
156+
def format_user_request_for_llm(self, original_user_request, step_num, openai_file_id_for_screenshot) -> list[
157+
dict[str, Any]]:
158+
request_data: str = json.dumps({
159+
'original_user_request': original_user_request,
160+
'step_num': step_num
161+
})
162+
content = [
163+
{
164+
'type': 'text',
165+
'text': request_data
166+
},
167+
{
168+
'type': 'image_file',
169+
'image_file': {
170+
'file_id': openai_file_id_for_screenshot
171+
}
172+
}
173+
]
174+
return content
175+
def send_message_to_llm_v2(self, formatted_user_request) -> Message:
176+
message = self.client.beta.threads.messages.create(
177+
thread_id=self.thread.id,
178+
role="user",
179+
content=formatted_user_request
180+
)
181+
run = self.client.beta.threads.runs.create_and_poll(
182+
thread_id=self.thread.id,
183+
assistant_id=self.assistant.id,
184+
instructions=''
185+
)
186+
while run.status != 'completed':
187+
print(f'Waiting for response, sleeping for 1. run.status={run.status}')
188+
time.sleep(1)
189+
if run.status == 'failed':
190+
print(f'failed run run.required_action:{run.required_action} run.last_error: {run.last_error}\n\n')
191+
return None
192+
if run.status == 'completed':
193+
# NOTE: Apparently right now the API doesn't have a way to retrieve just the last message???
194+
# So instead you get all messages and take the latest one
195+
response = self.client.beta.threads.messages.list(
196+
thread_id=self.thread.id)
197+
return response.data[0]
198+
else:
199+
print("Run did not complete successfully.")
200+
return None
201+
def convert_llm_response_to_json_v2(self, llm_response: ChatCompletion) -> dict[str, Any]:
202+
llm_response_data: str = llm_response.content[0].text.value.strip()
203+
start_index = llm_response_data.find('{')
204+
end_index = llm_response_data.rfind('}')
205+
try:
206+
json_response = json.loads(llm_response_data[start_index:end_index + 1].strip())
207+
except Exception as e:
208+
print(f'Error while parsing JSON response - {e}')
209+
json_response = {}
210+
return json_response
211+
End of code

app/ui.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,15 @@ def create_widgets(self) -> None:
6060
radio_frame.pack(padx=20, pady=10) # Add padding around the frame
6161

6262
models = [
63-
('GPT-4v (Most Accurate, Slowest)', 'gpt-4-vision-preview'),
64-
('GPT-4o (Medium Accurate, Medium Fast)', 'gpt-4o'),
65-
('GPT-4-Turbo (Least Accurate, Fastest)', 'gpt-4-turbo'),
63+
('GPT-4o (Default. Medium-Accurate, Medium-Fast)', 'gpt-4o'),
64+
('GPT-4o-mini (Cheapest, Fastest)', 'gpt-4o-mini'),
65+
('GPT-4v (Deprecated. Most-Accurate, Slowest)', 'gpt-4-vision-preview'),
66+
('GPT-4-Turbo (Least Accurate, Fast)', 'gpt-4-turbo'),
6667
('Custom (Specify Settings Below)', 'custom')
6768
]
6869
for text, value in models:
6970
ttk.Radiobutton(radio_frame, text=text, value=value, variable=self.model_var, bootstyle="info").pack(
70-
anchor=ttk.W)
71+
anchor=ttk.W, pady=5)
7172

7273
label_base_url = ttk.Label(self, text='Custom OpenAI-Like API Model Base URL', bootstyle="secondary")
7374
label_base_url.pack(pady=10)
@@ -179,7 +180,7 @@ def create_widgets(self) -> None:
179180
advanced_settings_button.pack(pady=(0, 10))
180181

181182
# Hyperlink Label
182-
link_label = ttk.Label(self, text='Instructions', bootstyle="primary")
183+
link_label = ttk.Label(self, text='Setup Instructions', bootstyle="primary")
183184
link_label.pack()
184185
link_label.bind('<Button-1>', lambda e: open_link(
185186
'https://github.com/AmberSahdev/Open-Interface?tab=readme-ov-file#setup-%EF%B8%8F'))

0 commit comments

Comments
 (0)