Skip to content

Commit a749e13

Browse files
Merge pull request #208 from 1liuren/qwen3-vl
增加qwen3-vl的反归一化处理
2 parents 6ea68d4 + f8d1348 commit a749e13

File tree

1 file changed

+17
-1
lines changed

1 file changed

+17
-1
lines changed

PC-Agent/run.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ def draw_coordinates_boxes_on_image(image_path, coordinates, output_image_path,
8080
parser.add_argument('--use_som', type=int, default=1) # for action
8181
parser.add_argument('--draw_text_box', type=int, default=0, help="whether to draw text boxes in som.")
8282
parser.add_argument('--font_path', type=str, default="/System/Library/Fonts/Supplemental/Times New Roman.ttf")
83+
# parser.add_argument('--font_path', type=str, default=r"C:\Windows\Fonts\arial.ttf")
8384
parser.add_argument('--add_info', type=str, default="Click the search bar in the middle of the page to search")
84-
8585
parser.add_argument('--disable_reflection', type=int, default=1)
8686
parser.add_argument('--clear_history_each_subtask', type=int, default=1)
8787
parser.add_argument('--ratio', type=float, default=1.0) # 1.0 for windows and 2.0 for mac
@@ -654,9 +654,25 @@ def check_subtask_dict(subtask_dict):
654654
# action = output_action.split("### Action ###")[-1].split("### Operation ###")[0].replace("\n", " ").replace(" ", " ").strip()
655655

656656
action_json = json.loads(output_action.split('```json')[-1].split('```')[0])
657+
658+
# Qwen3-VL returns coordinates normalized to a 0-1000 range; they are converted to pixel coordinates below, once the action has been parsed.
659+
657660
thought = action_json['Thought']
658661
summary = action_json['Summary']
659662
action = action_json['Action']
663+
664+
if "qwen3-vl" in vl_model_version.lower():
665+
if "Tap" in action or "TapIdx" in action:
666+
# Extract coordinates from action string
667+
if "(" in action and ")" in action:
668+
coord_str = action.split("(")[-1].split(")")[0]
669+
if "," in coord_str:
670+
x, y = map(float, coord_str.split(","))
671+
# Scale from the 0-1000 normalized range to pixel coordinates using the image width and height
672+
x = int(x * width / 1000)
673+
y = int(y * height / 1000)
674+
# Update action with denormalized coordinates
675+
action = action.split("(")[0] + f"({x}, {y})"
660676

661677
chat_action = add_response("assistant", output_action, chat_action)
662678
status = "#" * 50 + " Decision " + "#" * 50

0 commit comments

Comments
 (0)