@@ -15,32 +15,32 @@ class LLM:
1515 """
1616 LLM Request
1717 {
18- "original_user_request": ...,
19- "step_num": ...,
20- "screenshot": ...
18+ "original_user_request": ...,
19+ "step_num": ...,
20+ "screenshot": ...
2121 }
2222
2323 step_num is the count of times we've interacted with the LLM for this user request.
2424 If it's 0, we know it's a fresh user request.
25- If it's greater than 0, then we know we are already in the middle of a request.
26- Therefore, if the number is positive and from the screenshot it looks like request is complete, then return an
27- empty list in steps and a string in done. Don't keep looping the same request.
25+ If it's greater than 0, then we know we are already in the middle of a request.
26+ Therefore, if the number is positive and from the screenshot it looks like request is complete, then return an
27+ empty list in steps and a string in done. Don't keep looping the same request.
2828
2929 Expected LLM Response
3030 {
31- "steps": [
32- {
33- "function": "...",
34- "parameters": {
35- "key1": "value1",
36- ...
37- },
38- "human_readable_justification": "..."
39- },
40- {...},
41- ...
42- ],
43- "done": ...
31+ "steps": [
32+ {
33+ "function": "...",
34+ "parameters": {
35+ "key1": "value1",
36+ ...
37+ },
38+ "human_readable_justification": "..."
39+ },
40+ {...},
41+ ...
42+ ],
43+ "done": ...
4444 }
4545
4646 function is the function name to call in the executor.
@@ -63,71 +63,57 @@ class LLM:
6363
def __init__(self):
    """Load settings, assemble the static prompt context and create the OpenAI client.

    Reads ``base_url``, ``api_key``, ``default_browser``, ``custom_llm_instructions`` and
    ``model`` from ``Settings().get_dict()``. Sets ``self.context`` (contents of
    ``resources/context.txt`` plus machine/user specific details), ``self.model`` and
    ``self.client``.

    Side effects:
        Exports a configured ``api_key`` as the ``OPENAI_API_KEY`` environment variable.
    """
    settings_dict: dict[str, str] = Settings().get_dict()

    # `or` instead of a .get() default: a key that is present but empty/None must also fall
    # back to the public endpoint, otherwise .rstrip() fails on None or yields a bare '/'.
    base_url = (settings_dict.get('base_url') or 'https://api.openai.com/v1/').rstrip('/') + '/'

    api_key = settings_dict.get('api_key')
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key

    path_to_context_file = Path(__file__).resolve().parent.joinpath('resources', 'context.txt')
    # Explicit encoding so the prompt reads the same regardless of the platform's locale default.
    with open(path_to_context_file, 'r', encoding='utf-8') as file:
        self.context = file.read()

    self.context += f' Locally installed apps are {",".join(local_info.locally_installed_apps)}.'
    self.context += f' OS is {local_info.operating_system}.'
    self.context += f' Primary screen size is {Screen().get_size()}.\n'

    default_browser = settings_dict.get('default_browser')
    if default_browser:
        self.context += f'\nDefault browser is {default_browser}.'

    # Same truthiness rule as default_browser: skip empty/None instead of appending a blank note.
    custom_llm_instructions = settings_dict.get('custom_llm_instructions')
    if custom_llm_instructions:
        self.context += f'\nCustom user-added info: {custom_llm_instructions}.'

    self.model = settings_dict.get('model') or 'gpt-4-vision-preview'

    # Build the client exactly once, with the resolved endpoint. The key is read with .get()
    # so a missing key is not a bare KeyError here.
    # NOTE(review): with no key configured, None is handed to the client, which is then
    # expected to report the missing key itself - confirm callers handle that error.
    self.client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url=base_url)
10092
def get_instructions_for_objective(self, original_user_request: str, step_num: int = 0) -> dict[str, Any]:
    """Run one request/response round with the LLM and return its parsed instructions.

    Args:
        original_user_request: The user's request text, forwarded unchanged to the LLM.
        step_num: How many times the LLM has already been consulted for this request.

    Returns:
        Whatever ``convert_llm_response_to_json`` produces for the LLM's reply.
    """
    # Three stages: build the prompt, send it, parse the reply.
    llm_input = self.create_message_for_llm(original_user_request, step_num)
    raw_reply = self.send_message_to_llm(llm_input)
    return self.convert_llm_response_to_json(raw_reply)
11399
def create_message_for_llm(self, original_user_request, step_num) -> list[dict[str, Any]]:
    """Build the content parts (text + screenshot) sent to the LLM for one step.

    Args:
        original_user_request: The user's request text.
        step_num: Count of prior LLM interactions for this request.

    Returns:
        A two-element list: a ``text`` part holding ``self.context`` followed by the
        JSON-encoded request, and an ``image_url`` part carrying the current screenshot
        as a base64 data URL.
    """
    screenshot_b64: str = Screen().get_screenshot_in_base64()

    payload = {'original_user_request': original_user_request, 'step_num': step_num}
    serialized_request: str = json.dumps(payload)

    # The context is re-sent on every request, which is expensive, because the chosen model
    # has no stateful/Assistant mode yet.
    text_part = {'type': 'text', 'text': self.context + serialized_request}
    image_part = {
        'type': 'image_url',
        'image_url': {'url': f'data:image/jpeg;base64,{screenshot_b64}'},
    }

    return [text_part, image_part]
@@ -137,30 +123,26 @@ def send_message_to_llm(self, message) -> ChatCompletion:
137123 model = self .model ,
138124 messages = [
139125 {
140- " role" : " user" ,
141- " content" : message ,
126+ ' role' : ' user' ,
127+ ' content' : message ,
142128 }
143129 ],
144130 max_tokens = 800 ,
145131 )
146132 return response
147133
def convert_llm_response_to_json(self, llm_response: ChatCompletion) -> dict[str, Any]:
    """Extract the JSON object embedded in the LLM's reply.

    The model does not guarantee a pure-JSON answer, so the text between the first ``{`` and
    the last ``}`` of the first choice's message is parsed manually.
    Check for updates here - https://platform.openai.com/docs/guides/text-generation/json-mode

    Args:
        llm_response: Completion object; only ``choices[0].message.content`` is read.

    Returns:
        The parsed JSON object, or an empty dict when the reply has no choices, no text
        content, no brace-delimited span, or the span is not valid JSON.
    """
    choices = llm_response.choices
    content = choices[0].message.content if choices else None
    if not content:
        # content can be None/empty (e.g. a reply without text); .strip() would crash on None.
        print('Error while parsing JSON response - LLM response has no text content')
        return {}

    llm_response_data: str = content.strip()
    start_index = llm_response_data.find('{')
    end_index = llm_response_data.rfind('}')
    if start_index == -1 or end_index < start_index:
        print('Error while parsing JSON response - no JSON object found in LLM response')
        return {}

    try:
        json_response = json.loads(llm_response_data[start_index:end_index + 1])
    except (ValueError, RecursionError) as e:
        # ValueError covers json.JSONDecodeError; RecursionError guards absurdly nested input.
        # Parsing stays best-effort, but unrelated bugs are no longer swallowed.
        print(f'Error while parsing JSON response - {e}')
        return {}

    return json_response
0 commit comments