Skip to content

Commit b1a3924

Browse files
Merge pull request #220 from 1liuren/qwen3-vl
优化prompt、同时将反思阶段替换为vl模型、增加只使用截图的功能开关
2 parents 4ae8478 + f4c71ea commit b1a3924

File tree

2 files changed

+113
-57
lines changed

2 files changed

+113
-57
lines changed

PC-Agent/PCAgent/prompt_qwen.py

Lines changed: 72 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -139,18 +139,20 @@ def get_action_prompt(instruction, clickable_infos, width, height, thought_histo
139139
prompt += add_info
140140
prompt += "\n\n"
141141

142-
prompt += "### Screenshot information ###\n"
143-
prompt += "In order to help you better perceive the content in this screenshot, we extract some information of the current screenshot. "
144-
prompt += "This information consists of two parts: coordinates; content. "
145-
prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; "
146-
147-
148-
prompt += "the content is a text or 'icon' respectively. "
149-
prompt += "The information is as follow:\n"
150-
151-
for clickable_info in clickable_infos:
152-
if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
153-
prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
142+
# 只有当有perception info时才添加这部分
143+
if clickable_infos and len(clickable_infos) > 0:
144+
prompt += "### Screenshot information ###\n"
145+
prompt += "In order to help you better perceive the content in this screenshot, we extract some information of the current screenshot. "
146+
prompt += "This information consists of two parts: coordinates; content. "
147+
prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; "
148+
149+
prompt += "the content is a text or 'icon' respectively. "
150+
prompt += "The information is as follow:\n"
151+
152+
for clickable_info in clickable_infos:
153+
if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
154+
prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
155+
prompt += "\n"
154156

155157

156158
if len(action_history) > 0:
@@ -183,10 +185,17 @@ def get_action_prompt(instruction, clickable_infos, width, height, thought_histo
183185

184186
prompt += "### Task requirements ###\n"
185187
prompt += "In order to meet the user\'s requirements, you need to select one of the following operations to operate on the current screen:\n"
186-
prompt += "Note that to open an app, use the Open App action, rather than tapping the app's icon. "
187-
prompt += "For certain items that require selection, such as font and font size, direct input is more efficient than scrolling through choices."
188+
189+
# 根据是否有perception info,提供不同的坐标说明
190+
if clickable_infos and len(clickable_infos) > 0:
191+
prompt += "Note: The coordinates in the ### Screenshot information ### section are in pixel format [x, y]. When you output Tap actions, use the same pixel coordinates.\n"
192+
else:
193+
prompt += "Note: Since no extracted information is provided, you need to directly analyze the screenshot and output normalized coordinates.\n"
194+
prompt += "For Tap actions, use normalized coordinates where x and y are in the range [0, 999], with (0, 0) at the top-left corner and (999, 999) at the bottom-right corner.\n"
195+
196+
prompt += "For certain items that require selection, such as font and font size, direct input is more efficient than scrolling through choices.\n"
188197
prompt += "You must choose one of the actions below:\n"
189-
prompt += "Open App (app name): If you want to open an app, you should use this action to open the app named 'app name'."
198+
# prompt += "Open App (app name): If you want to open an app, you should use this action to open the app named 'app name'."
190199
prompt += "Right Tap (x, y): Right tap the position (x, y) in current page. This can be used to create a new file.\n"
191200
prompt += "Tap (x, y): Tap the position (x, y) in current page. This can be used to select an item.\n"
192201
prompt += "Double Tap (x, y): Double tap the position (x, y) in the current page. This can be used to open a file. If Tap (x, y) in the last step doesn't work, you can try double tap the position (x, y) in the current page.\n"
@@ -215,15 +224,19 @@ def get_action_prompt(instruction, clickable_infos, width, height, thought_histo
215224
prompt += "Append (x, y), (text): Append the \"text\" content after the content at (x, y) location. This action is useful when you want to append new content into a word document.\n"
216225

217226
prompt += "Tell (answer): Tell me the answer of the input query.\n"
218-
prompt += "Stop: If all the operations to meet the user\'s requirements have been completed in ### History operation ###, use this operation to stop the whole process."
227+
prompt += "Stop: ONLY use this action when you can VERIFY from the CURRENT SCREENSHOT that ALL requirements in the user's instruction have been ACTUALLY COMPLETED. Do NOT stop just because you performed some operations - you must verify the final result is achieved on the screen."
219228
prompt += "\n\n"
220229

221230
prompt += "### Output format ###\n"
222231
# modified 2.10
223232
prompt += "You should output in the following json format:"
233+
# prompt += '''
234+
# {"Thought": "This is your thinking about how to proceed the next operation, please output the thoughts about the history operations explicitly.", "Action": "Open App () or Tap () or Double Tap () or Triple Tap () or Shortcut () or Press() or Type () or Tell () or Stop. Only one action can be output at one time.", "Summary": "This is a one sentence summary of this operation."}
235+
# '''
224236
prompt += '''
225-
{"Thought": "This is your thinking about how to proceed the next operation, please output the thoughts about the history operations explicitly.", "Action": "Open App () or Tap () or Double Tap () or Triple Tap () or Shortcut () or Press() or Type () or Tell () or Stop. Only one action can be output at one time.", "Summary": "This is a one sentence summary of this operation."}
226-
'''
237+
{"Thought": "This is your thinking about how to proceed the next operation, please output the thoughts about the history operations explicitly.", "Action": "Tap () or Double Tap () or Triple Tap () or Shortcut () or Press() or Type () or Tell () or Stop. Only one action can be output at one time.", "Summary": "This is a one sentence summary of this operation."}
238+
'''
239+
prompt += "The output must contain the following fields: Thought (your reasoning about the next operation), Action (the specific action to take), and Summary (a one-sentence summary of the operation)."
227240
prompt += "\n\n"
228241

229242

@@ -243,19 +256,22 @@ def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, h
243256
prompt += "The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively "
244257
prompt += "\n\n"
245258

246-
prompt += "### Before the current operation ###\n"
247-
prompt += "Screenshot information:\n"
248-
for clickable_info in clickable_infos1:
249-
if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
250-
prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
251-
prompt += "\n\n"
252-
253-
prompt += "### After the current operation ###\n"
254-
prompt += "Screenshot information:\n"
255-
for clickable_info in clickable_infos2:
256-
if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
257-
prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
258-
prompt += "\n\n"
259+
# 只有当有perception info时才添加屏幕信息
260+
if clickable_infos1 and len(clickable_infos1) > 0:
261+
prompt += "### Before the current operation ###\n"
262+
prompt += "Screenshot information:\n"
263+
for clickable_info in clickable_infos1:
264+
if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
265+
prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
266+
prompt += "\n\n"
267+
268+
if clickable_infos2 and len(clickable_infos2) > 0:
269+
prompt += "### After the current operation ###\n"
270+
prompt += "Screenshot information:\n"
271+
for clickable_info in clickable_infos2:
272+
if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
273+
prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
274+
prompt += "\n\n"
259275

260276
prompt += "### Current operation ###\n"
261277
prompt += f"The user\'s instruction is: {instruction}."
@@ -271,16 +287,19 @@ def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, h
271287
prompt += "Now you need to output the following content based on the screenshots information before and after the current operation:\n"
272288
else:
273289
prompt += "Now you need to output the following content based on the screenshots before and after the current operation:\n"
274-
prompt += "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n"
275-
prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n"
290+
prompt += "1. Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n"
291+
prompt += "2. IMPORTANT: By carefully examining the screenshot after the operation, verify if the actual goal described in the user's instruction is achieved.\n"
292+
prompt += "Choose one of the following:\n"
293+
prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\" AND the actual goal in the instruction is achieved based on the current screenshot.\n"
276294
prompt += "B: The \"Operation action\" results in a wrong page and I need to do something to correct this.\n"
277-
prompt += "C: The \"Operation action\" produces no changes."
295+
prompt += "C: The \"Operation action\" produces no changes.\n"
296+
prompt += "D: The \"Operation action\" seems to complete, but the actual goal in the instruction is NOT achieved based on the current screenshot (e.g., clicked wrong position, wrong item selected)."
278297
prompt += "\n\n"
279298

280299
prompt += "### Output format ###\n"
281300
prompt += "Your output format is:\n"
282-
prompt += "### Thought ###\nYour thought about the question\n"
283-
prompt += "### Answer ###\nA or B or C"
301+
prompt += "### Thought ###\nYour thought about the question. Please explicitly verify if the goal in the instruction is achieved by checking the screenshot.\n"
302+
prompt += "### Answer ###\nA or B or C or D"
284303

285304
return prompt
286305

@@ -304,10 +323,20 @@ def get_memory_prompt(insight):
304323

305324
return prompt
306325

307-
def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info, reflection_history=[]):
326+
def get_process_prompt(instruction, thought_history, summary_history, action_history, completed_content, add_info, reflection_history=[], clickable_infos=None, width=None, height=None):
308327
prompt = "### Background ###\n"
309328
prompt += f"There is an user\'s instruction which is: {instruction}. You are a computer operating assistant and are operating the user\'s computer.\n\n"
310329

330+
# 添加当前屏幕信息用于验证
331+
if clickable_infos is not None and width is not None and height is not None:
332+
prompt += "### Current screenshot information ###\n"
333+
prompt += f"The current screen width is {width} pixels and height is {height} pixels.\n"
334+
prompt += "The following is the information extracted from the current screenshot (format: coordinates; content):\n"
335+
for clickable_info in clickable_infos:
336+
if clickable_info['text'] != "" and clickable_info['text'] != "icon: None" and clickable_info['coordinates'] != (0, 0):
337+
prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n"
338+
prompt += "\n"
339+
311340
if add_info != "":
312341
prompt += "### Hint ###\n"
313342
prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n"
@@ -330,11 +359,13 @@ def get_process_prompt(instruction, thought_history, summary_history, action_his
330359
prompt += "Completed contents:\n" + completed_content + "\n\n"
331360

332361
prompt += "### Response requirements ###\n"
333-
prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n"
362+
prompt += "Now you need to update the \"Completed contents\" by comparing the user's instruction with the current screenshot.\n"
363+
prompt += "IMPORTANT: You must verify if the actual goal is achieved by checking the current screenshot information, not just assuming based on operation history.\n"
364+
prompt += "For example, if the instruction is to 'play 稻香', you need to verify if 稻香 is actually playing on the screen, not just because you clicked something.\n\n"
334365

335366
prompt += "### Output format ###\n"
336367
prompt += "Your output format is:\n"
337-
prompt += "### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###."
368+
prompt += "### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed AND VERIFIED on the current screenshot."
338369

339370
else:
340371
prompt += "### Current operation ###\n"
@@ -345,19 +376,15 @@ def get_process_prompt(instruction, thought_history, summary_history, action_his
345376
prompt += f"Operation action: {operation}\n" + "Operation reflection: " + reflection_history[-1] + "\n\n"
346377
else:
347378
prompt += f"Operation action: {operation}\n\n"
348-
349-
# if reflection_thought is not None:
350-
# prompt += "A reflection model was adopted to analyze whether the last step's operation meets the expectation, you should combine its reflection thought to produce the \"Completed contents\"."
351-
# prompt += "Below is its reflection thought:\n"
352-
# prompt += reflection_thought + "\n"
353379

354380
prompt += "### Response requirements ###\n"
355381
prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n"
382+
prompt += "IMPORTANT: You must verify if the actual goal is achieved by checking the current screenshot information, not just assuming based on operation.\n"
356383
prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of user\'s instruction, and then summarize the contents that have been completed.\n\n"
357384

358385
prompt += "### Output format ###\n"
359386
prompt += "Your output format is:\n"
360-
prompt += "### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n"
387+
prompt += "### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed AND VERIFIED on the current screenshot.\n"
361388
prompt += "(Please use English to output)"
362389

363390
return prompt

0 commit comments

Comments
 (0)