@@ -80,8 +80,8 @@ def draw_coordinates_boxes_on_image(image_path, coordinates, output_image_path,
# Command-line options registered on `parser` (assumed to be the script's
# argparse.ArgumentParser, created above this excerpt -- confirm).
parser.add_argument('--use_som', type=int, default=1)  # for action
parser.add_argument('--draw_text_box', type=int, default=0, help="whether to draw text boxes in som.")
# The default font lives under /System/Library (a macOS location); the help
# text carries the Windows alternative so nobody has to edit this file.
parser.add_argument(
    '--font_path',
    type=str,
    default="/System/Library/Fonts/Supplemental/Times New Roman.ttf",
    help=r"font file path; the default is a macOS system font, on Windows pass e.g. C:\Windows\Fonts\arial.ttf",
)
parser.add_argument('--add_info', type=str, default="Click the search bar in the middle of the page to search")
parser.add_argument('--disable_reflection', type=int, default=1)
parser.add_argument('--clear_history_each_subtask', type=int, default=1)
parser.add_argument('--ratio', type=float, default=1.0, help="1.0 for windows and 2.0 for mac")
@@ -654,9 +654,25 @@ def check_subtask_dict(subtask_dict):
654654 # action = output_action.split("### Action ###")[-1].split("### Operation ###")[0].replace("\n", " ").replace(" ", " ").strip()
655655
# Pull the JSON payload out of the model reply: the text after the last
# ```json fence, up to the next closing fence.
action_json = json.loads(output_action.split('```json')[-1].split('```')[0])

thought = action_json['Thought']
summary = action_json['Summary']
action = action_json['Action']

# Denormalize coordinates for Qwen3-VL, which returns normalized coordinates
# (the /1000 scaling below maps them onto the image's width/height in pixels).
# "Tap" also matches "TapIdx"; an index-only action has no "x, y" pair and
# falls through unchanged.
if "qwen3-vl" in vl_model_version.lower() and "Tap" in action:
    if "(" in action and ")" in action:
        # Text of the last parenthesised group, e.g. "500, 300".
        coord_str = action.split("(")[-1].split(")")[0]
        try:
            norm_x, norm_y = (float(part) for part in coord_str.split(","))
        except ValueError:
            # Not exactly two numbers (index-only tap, placeholder text,
            # extra values): leave the action as the model wrote it rather
            # than aborting the whole run.
            pass
        else:
            x = int(norm_x * width / 1000)
            y = int(norm_y * height / 1000)
            # Rebuild the action with pixel coordinates.
            action = action.split("(")[0] + f"({x}, {y})"
660676
661677 chat_action = add_response ("assistant" , output_action , chat_action )
662678 status = "#" * 50 + " Decision " + "#" * 50
0 commit comments