88
99from PIL import Image
1010from ultralytics import YOLO
11- import google . generativeai as genai
11+
1212from operate .config import Config
1313from operate .exceptions import ModelNotRecognizedException
1414from operate .utils .screenshot import (
3535
3636# Load configuration
3737VERBOSE = Config ().verbose
38+ config = Config ()
39+ client = config .initialize_openai ()
3840
3941
4042async def get_next_action (model , messages , objective , session_id ):
43+ if VERBOSE :
44+ print ("[Self-Operating Computer][get_next_action]" )
45+ print ("[Self-Operating Computer][get_next_action] model" , model )
4146 if model == "gpt-4" :
4247 return call_gpt_4_vision_preview (messages ), None
4348 if model == "gpt-4-with-som" :
@@ -52,8 +57,6 @@ async def get_next_action(model, messages, objective, session_id):
5257
5358
5459def call_gpt_4_vision_preview (messages ):
55- config = Config ()
56- client = config .initialize_openai ()
5760 if VERBOSE :
5861 print ("[Self Operating Computer][get_next_action][call_gpt_4_v]" )
5962 time .sleep (1 )
@@ -137,7 +140,10 @@ def call_gemini_pro_vision(messages, objective):
137140 """
138141 Get the next action for Self-Operating Computer using Gemini Pro Vision
139142 """
140- config = Config ()
143+ if VERBOSE :
144+ print (
145+ "[Self Operating Computer][call_gemini_pro_vision]" ,
146+ )
141147 # sleep for a second
142148 time .sleep (1 )
143149 try :
@@ -152,11 +158,18 @@ def call_gemini_pro_vision(messages, objective):
152158 time .sleep (1 )
153159 prompt = get_system_prompt (objective )
154160
155- model = genai .GenerativeModel ("gemini-pro-vision" )
161+ model = config .initialize_google ()
162+ if VERBOSE :
163+ print ("[Self Operating Computer][call_gemini_pro_vision] model" , model )
156164
157165 response = model .generate_content ([prompt , Image .open (screenshot_filename )])
158166
159167 content = response .text [1 :]
168+ if VERBOSE :
169+ print (
170+ "[Self Operating Computer][call_gemini_pro_vision] response" , response
171+ )
172+ print ("[Self Operating Computer][call_gemini_pro_vision] content" , content )
160173
161174 content = json .loads (content )
162175 if VERBOSE :
@@ -176,8 +189,6 @@ def call_gemini_pro_vision(messages, objective):
176189
177190
178191async def call_gpt_4_vision_preview_labeled (messages , objective ):
179- config = Config ()
180- client = config .initialize_openai ()
181192 time .sleep (1 )
182193 try :
183194 yolo_model = YOLO ("./operate/models/weights/best.pt" ) # Load your trained model
0 commit comments