Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -217,4 +217,7 @@ playground/.env

!launcher/**
multi_app_tasks_backup
multi_app_tasks
multi_app_tasks

!evaluation/AndroidWorld/android_world/env/
!evaluation/AndroidWorld/android_world/env/**/*
16 changes: 16 additions & 0 deletions evaluation/AndroidWorld/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,19 @@ Please refer to [AndroidWorld README](docs/README_AndroidWorld.md)
* **\[CONSOLE\_PORT]** is the port for the agent’s console
* **\[CHECKPOINT\_DIR]** is the path to the directory containing your model checkpoints
* **\[GRPC\_PORT]** is the port for the gRPC service

## Qwen3-VL Model Evaluation

We have adapted the prompts and action space of the Qwen3-VL series models to reproduce their evaluation results.

1. **Launch the Android emulator first (example):**
```bash
emulator -avd AndroidWorldAVD -no-snapshot -grpc 8554
```

2. **After deploying your model API with `vLLM` (refer to [model development](../README.md#-model-development)), configure the `model_url` and `model_name`, e.g., `http://<ip>:8000/v1` and `Qwen3-VL-8B-Instruct`.**

3. **Run the evaluation using the following script (example):**
```bash
python run.py --agent_name qwen3vl --console_port 5554 --grpc_port 8554 --perform_emulator_setup=true --qwen3vl_model_base_url model_url --qwen3vl_model_name model_name --qwen3vl_model_api_key EMPTY --checkpoint_dir runs/qwen3vl_8b_instruct
```
8 changes: 8 additions & 0 deletions evaluation/AndroidWorld/android_world/agents/PROMPT.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@
"""
)

# =========================
# Qwen3VL tool-call prompts
# =========================

QWEN3VL_SYSTEM_PROMPT = "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"mobile_use\", \"description\": \"Use a touchscreen to interact with a mobile device, and take screenshots.\\n* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.\\n* The screen's resolution is 999x999.\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\", \"parameters\": {\"properties\": {\"action\": {\"description\": \"The action to perform. The available actions are:\\n* `click`: Click the point on the screen with coordinate (x, y).\\n* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.\\n* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).\\n* `type`: Input the specified text into the activated input box.\\n* `answer`: Output the answer.\\n* `system_button`: Press the system button.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.\", \"enum\": [\"click\", \"long_press\", \"swipe\", \"type\", \"answer\", \"system_button\", \"wait\", \"terminate\"], \"type\": \"string\"}, \"coordinate\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. 
Required only by `action=click`, `action=long_press`, and `action=swipe`.\", \"type\": \"array\"}, \"coordinate2\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.\", \"type\": \"array\"}, \"text\": {\"description\": \"Required only by `action=type` and `action=answer`.\", \"type\": \"string\"}, \"time\": {\"description\": \"The seconds to wait. Required only by `action=long_press` and `action=wait`.\", \"type\": \"number\"}, \"button\": {\"description\": \"Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`\", \"enum\": [\"Back\", \"Home\", \"Menu\", \"Enter\"], \"type\": \"string\"}, \"status\": {\"description\": \"The status of the task. Required only by `action=terminate`.\", \"type\": \"string\", \"enum\": [\"success\", \"failure\"]}}, \"required\": [\"action\"], \"type\": \"object\"}}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>\n\n# Response format\n\nResponse format for every step:\n1) Thought: one concise sentence explaining the next move (no multi-step reasoning).\n2) Action: a short imperative describing what to do in the UI.\n3) A single <tool_call>...</tool_call> block containing only the JSON: {\"name\": <function-name>, \"arguments\": <args-json-object>}.\n\nRules:\n- Output exactly in the order: Thought, Action, <tool_call>.\n- Be brief: one sentence for Thought, one for Action.\n- Do not output anything else outside those three parts.\n- If finishing, use action=terminate in the tool call."

QWEN3VL_USER_PROMPT = "The user query: {instruction}.\nTask progress (You have done the following operation on the current device): {history}.\n"


SUMMARY_PROMPT_TEMPLATE = (
PROMPT_PREFIX
Expand Down
190 changes: 189 additions & 1 deletion evaluation/AndroidWorld/android_world/agents/seeact_v.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import time

import ast
import json
import numpy as np
from PIL import Image
from openai import OpenAI
Expand All @@ -40,7 +41,13 @@
from android_world.env import interface
from android_world.env import json_action
from android_world.env import representation_utils
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize

try:
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import ( # type: ignore
smart_resize,
)
except Exception: # pragma: no cover
smart_resize = None # type: ignore[assignment]

# Utils for Visual Grounding

Expand Down Expand Up @@ -932,3 +939,184 @@ def _to_base64_png(image: np.ndarray) -> str:
buf = BytesIO()
PILImage.fromarray(image).save(buf, format="PNG")
return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}"


def _extract_action_text_qwen3vl(block: str) -> str:
"""Extracts the 'Action:' line from Qwen3VL text output for step history (does not affect execution)."""
m = re.search(r"Action:\s*(.+?)(?:\n<tool_call>|$)", block, flags=re.S)
if not m:
return ""
text = m.group(1).strip()
# Some models wrap Action: "..." with quotes.
if text.startswith('"') and text.endswith('"'):
text = text[1:-1]
return text.replace("\n", " ")


def _parse_tool_call_json(block: str) -> dict[str, Any] | None:
"""Parse JSON inside <tool_call>...</tool_call>."""
m = re.search(r"<tool_call>\s*([\s\S]*?)\s*</tool_call>", block)
if not m:
return None
payload = m.group(1).strip()
try:
return json.loads(payload)
except Exception:
return None


class Qwen3VL(base_agent.EnvironmentInteractingAgent):
    """Android GUI Agent based on Qwen3VL tool-call output (for AndroidWorld eval).

    - Input: Screenshot + instruction + history
    - Output: <tool_call>{...}</tool_call>
    - Execution: Map to JSONAction by qwen3vl_action_transform(...)
    """

    def __init__(
        self,
        env: interface.AsyncEnv,
        llm: infer.MultimodalLlmWrapper,
        name: str = "Qwen3VL",
        wait_after_action_seconds: float = 2.0,
        model_base_url: str = "http://127.0.0.1:8000/v1",
        model_api_key: str = "EMPTY",
        model_name: str = "",
        extra_headers: dict[str, str] | None = None,
    ):
        """Initializes the agent.

        Args:
          env: AndroidWorld environment to interact with.
          llm: Multimodal LLM wrapper kept for interface parity; the actual
            model calls in step() go through the OpenAI-compatible client.
          name: Agent name registered with the base class.
          wait_after_action_seconds: Delay after each executed action so the
            UI can settle before the next screenshot is taken.
          model_base_url: Base URL of the OpenAI-compatible endpoint (e.g. vLLM).
          model_api_key: API key for the endpoint ("EMPTY" for local serving).
          model_name: Model identifier sent with each chat completion request.
          extra_headers: Optional extra HTTP headers attached to every request.
        """
        super().__init__(env, name)
        self.llm = llm
        self.wait_after_action_seconds = wait_after_action_seconds
        self.model_name = model_name
        self.client = OpenAI(
            api_key=model_api_key,
            base_url=model_base_url,
            default_headers=extra_headers,
        )
        # Accumulated "Step N: ..." summaries fed back to the model as history.
        self.step_his: str = ""
        # 1-based counter of steps taken in the current episode.
        self.turn_number: int = 0
        # Used to detect repeated actions (avoid infinite loops)
        self.last_action: str | None = None
        self.repeat_time: int = 0

    def reset(self, go_home_on_reset: bool = False):
        """Resets per-episode state: history, step counter, repeat detection."""
        super().reset(go_home_on_reset)
        self.env.hide_automation_ui()
        self.step_his = ""
        self.turn_number = 0
        self.last_action = None
        self.repeat_time = 0

    @staticmethod
    def _to_base64_png(image: np.ndarray) -> str:
        """Encodes an image array as a base64 PNG data URL for the chat API."""
        import base64
        from io import BytesIO
        from PIL import Image as PILImage
        buf = BytesIO()
        PILImage.fromarray(image).save(buf, format='PNG')
        return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}"

    def step(self, instruction: str) -> base_agent.AgentInteractionResult:
        """Runs one cycle: screenshot -> model call -> tool-call parse -> env action.

        Args:
          instruction: Natural-language task goal for the episode.

        Returns:
          AgentInteractionResult whose first field is True when the episode
          should stop (task reported done, unparseable output, transform
          failure, or a repeated-action loop), False to continue.
        """
        self.turn_number += 1

        state = self.get_post_transition_state()
        screenshot = state.pixels.copy()
        # To be consistent with other agents in this file: BGR->RGB (for saving/encoding)
        # NOTE(review): assumes state.pixels is BGR-ordered — confirm against
        # the env's screenshot format.
        screenshot = screenshot[:, :, ::-1]
        height, width = screenshot.shape[:2]

        # NOTE(review): QWEN3VL_SYSTEM_PROMPT / QWEN3VL_USER_PROMPT are used
        # unqualified — presumably imported from agents.PROMPT; verify the
        # import exists in this module.
        system_prompt = QWEN3VL_SYSTEM_PROMPT
        user_prompt = QWEN3VL_USER_PROMPT.format(
            instruction=instruction, history=self.step_his
        )

        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_prompt},
                    {"type": "image_url", "image_url": {"url": self._to_base64_png(screenshot)}},
                ],
            },
        ]

        # temperature=0 keeps the evaluation run deterministic (greedy decoding).
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            temperature=0,
        )
        response = completion.choices[0].message.content or ""
        print(response)
        print("=" * 50)

        tool_call = _parse_tool_call_json(response)
        if not tool_call:
            # Unparseable output ends the episode rather than looping forever.
            return base_agent.AgentInteractionResult(
                True, {"summary": "No <tool_call> JSON found in model output.", "response": response}
            )

        # The human-readable "Action:" line only feeds the history prompt;
        # execution is driven solely by the tool-call JSON below.
        op_text = _extract_action_text_qwen3vl(response)
        if op_text:
            self.step_his += f"Step {self.turn_number}: {op_text}; "

        # Compatible: tool_call may look like {"name":"mobile_use","arguments":{...}}
        args = tool_call.get("arguments", {}) if isinstance(tool_call, dict) else {}
        action_name = args.get("action", "")
        try:
            parsed = qwen3vl_action_transform(action_name, args, width, height)
            print(parsed)
        except Exception as e:
            # E.g. an unsupported system button / terminate status: stop the episode.
            return base_agent.AgentInteractionResult(
                True,
                {
                    "summary": f"Failed to transform tool-call into action: {e}",
                    "response": response,
                    "tool_call": tool_call,
                },
            )

        # Record last_action + repeat_time (previous code had these fields but not working)
        # Here, use the tool-call's arguments as the "action signature", which is more robust than checking 'terminate' in a string.
        try:
            action_sig = json.dumps(args, ensure_ascii=False, sort_keys=True)
        except Exception:
            action_sig = str(args)
        if self.last_action == action_sig:
            self.repeat_time += 1
        else:
            self.repeat_time = 0
        self.last_action = action_sig

        try:
            act = json_action.JSONAction(**parsed)
            self.env.execute_action(act)
            time.sleep(self.wait_after_action_seconds)
        except Exception:
            # continue
            # Best-effort: a failed execution is logged and the episode continues.
            print("Failed to execute action:", parsed)

        if parsed.get("action_type") == "status":
            # The model called terminate -> report completion/infeasibility and stop.
            return base_agent.AgentInteractionResult(
                True, {"response": response, "step_history": self.step_his, "parsed": parsed}
            )

        # If repeated actions reach the threshold: terminate immediately to avoid deadlock in evaluation
        # NOTE(review): this check runs after execution, so the repeated
        # action has already been performed once more before termination.
        if self.repeat_time >= 3:
            return base_agent.AgentInteractionResult(
                True,
                {
                    "summary": "Terminated due to repeated identical actions.",
                    "response": response,
                    "step_history": self.step_his,
                    "parsed": parsed,
                    "repeat_time": self.repeat_time,
                },
            )

        return base_agent.AgentInteractionResult(
            False, {"response": response, "step_history": self.step_his, "parsed": parsed}
        )
62 changes: 61 additions & 1 deletion evaluation/AndroidWorld/android_world/agents/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@
from android_world.env import interface
from android_world.env import json_action
from android_world.agents import base_agent
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize

try:
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import ( # type: ignore
smart_resize,
)
except Exception: # pragma: no cover
smart_resize = None # type: ignore[assignment]


def _extract_xy(s: str) -> Tuple[float, float] | None:
Expand Down Expand Up @@ -156,6 +162,60 @@ def action_transform(action: str, width: int, height: int) -> Dict[str, Any] | N
return None


def qwen3vl_action_transform(
    action: str, arguments: Dict[str, Any], width: int, height: int
) -> Dict[str, Any]:
    """Maps a Qwen3VL tool-call onto an AndroidWorld JSONAction dict.

    Coordinates arrive in the model's 0-999 space and are rescaled to the
    device's ``width`` x ``height`` pixels. Unrecognized action names fall
    back to a harmless ``wait`` so evaluation keeps running.

    Args:
      action: The tool-call ``action`` field (e.g. "click", "swipe").
      arguments: The full tool-call argument dict (coordinates, text, ...).
      width: Screenshot width in pixels.
      height: Screenshot height in pixels.

    Returns:
      Keyword arguments for ``json_action.JSONAction``.

    Raises:
      ValueError: For a ``system_button`` or ``terminate`` argument value the
        mapping does not support.
    """

    def _scale(point):
        # Rescale a model-space (x, y) point to device pixels.
        px, py = point
        return px / 1000 * width, py / 1000 * height

    if action == "key":
        return {"action_type": "wait"}
    elif action == "click" or action == "left_click":
        x, y = _scale(arguments.get("coordinate", [0, 0]))
        return {"action_type": "click", "x": x, "y": y}
    elif action == "long_press":
        x, y = _scale(arguments.get("coordinate", [0, 0]))
        return {"action_type": "long_press", "x": x, "y": y}
    elif action == "swipe":
        x0, y0 = _scale(arguments.get("coordinate", [0, 0]))
        x1, y1 = _scale(arguments.get("coordinate2", [0, 0]))
        dir_ = _dir_from_coords(x0, y0, x1, y1)
        # A swipe gesture moves content the opposite way, hence the reversal.
        return {"action_type": "scroll", "direction": reverse_direction(dir_)}
    elif action == "type":
        return {"action_type": "input_text", "text": arguments.get("text", "")}
    elif action == "system_button":
        button = arguments.get("button", "").lower()
        if button == "home":
            return {"action_type": "navigate_home"}
        elif button == "back":
            return {"action_type": "navigate_back"}
        elif button == "enter":
            # Fix: the system prompt advertises "Enter" as a valid button, but
            # this previously raised and terminated the episode. AndroidWorld
            # supports it directly as keyboard_enter.
            return {"action_type": "keyboard_enter"}
        else:
            # "Menu" has no JSONAction equivalent; step() treats the raise as
            # a failed step and ends the episode.
            raise ValueError(f"Unknown system button: {button}")
    elif action == "open":
        return {"action_type": "open_app", "app_name": arguments.get("text", "")}
    elif action == "wait":
        return {"action_type": "wait"}
    elif action == "answer":
        return {"action_type": "answer", "text": arguments.get("text", "")}
    elif action == "terminate":
        status = arguments.get("status", "").lower()
        if status == "success":
            return {"action_type": "status", "goal_status": "complete"}
        elif status == "failure":
            return {"action_type": "status", "goal_status": "infeasible"}
        else:
            raise ValueError(f"Unknown terminate status: {status}")
    else:
        # Anything else degrades to a no-op wait instead of crashing the run.
        return {'action_type': 'wait'}


def action_coord(action):
def extract_click_json(s):
m = re.search(
Expand Down
Loading