Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -217,4 +217,7 @@ playground/.env

!launcher/**
multi_app_tasks_backup
multi_app_tasks
multi_app_tasks

!evaluation/AndroidWorld/android_world/env/
!evaluation/AndroidWorld/android_world/env/**/*
16 changes: 16 additions & 0 deletions evaluation/AndroidWorld/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,19 @@ Please refer to [AndroidWorld README](docs/README_AndroidWorld.md)
* **\[CONSOLE\_PORT]** is the port for the agent’s console
* **\[CHECKPOINT\_DIR]** is the path to the directory containing your model checkpoints
* **\[GRPC\_PORT]** is the port for the gRPC service

## Qwen3-VL Model Evaluation

We have adapted the prompts and action space of the Qwen3-VL series models to reproduce their evaluation results.

1. **Launch the Android emulator first (example):**
```bash
emulator -avd AndroidWorldAVD -no-snapshot -grpc 8554
```

2. **After deploying your model API with `vLLM` (refer to [model development](../README.md#-model-development)), configure the `model_url` and `model_name`, e.g., `http://<ip>:8000/v1` and `Qwen3-VL-8B-Instruct`.**

3. **Run the evaluation using the following script (example):**
```bash
python run.py --agent_name qwen3vl --console_port 5554 --grpc_port 8554 --perform_emulator_setup=true --qwen3vl_model_base_url model_url --qwen3vl_model_name model_name --qwen3vl_model_api_key EMPTY --checkpoint_dir runs/qwen3vl_8b_instruct
```
8 changes: 8 additions & 0 deletions evaluation/AndroidWorld/android_world/agents/PROMPT.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@
"""
)

# =========================
# Qwen3VL tool-call prompts
# =========================

QWEN3VL_SYSTEM_PROMPT = "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"mobile_use\", \"description\": \"Use a touchscreen to interact with a mobile device, and take screenshots.\\n* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.\\n* The screen's resolution is 999x999.\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\", \"parameters\": {\"properties\": {\"action\": {\"description\": \"The action to perform. The available actions are:\\n* `click`: Click the point on the screen with coordinate (x, y).\\n* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.\\n* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).\\n* `type`: Input the specified text into the activated input box.\\n* `answer`: Output the answer.\\n* `system_button`: Press the system button.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.\", \"enum\": [\"click\", \"long_press\", \"swipe\", \"type\", \"answer\", \"system_button\", \"wait\", \"terminate\"], \"type\": \"string\"}, \"coordinate\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. 
Required only by `action=click`, `action=long_press`, and `action=swipe`.\", \"type\": \"array\"}, \"coordinate2\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.\", \"type\": \"array\"}, \"text\": {\"description\": \"Required only by `action=type` and `action=answer`.\", \"type\": \"string\"}, \"time\": {\"description\": \"The seconds to wait. Required only by `action=long_press` and `action=wait`.\", \"type\": \"number\"}, \"button\": {\"description\": \"Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`\", \"enum\": [\"Back\", \"Home\", \"Menu\", \"Enter\"], \"type\": \"string\"}, \"status\": {\"description\": \"The status of the task. Required only by `action=terminate`.\", \"type\": \"string\", \"enum\": [\"success\", \"failure\"]}}, \"required\": [\"action\"], \"type\": \"object\"}}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>\n\n# Response format\n\nResponse format for every step:\n1) Thought: one concise sentence explaining the next move (no multi-step reasoning).\n2) Action: a short imperative describing what to do in the UI.\n3) A single <tool_call>...</tool_call> block containing only the JSON: {\"name\": <function-name>, \"arguments\": <args-json-object>}.\n\nRules:\n- Output exactly in the order: Thought, Action, <tool_call>.\n- Be brief: one sentence for Thought, one for Action.\n- Do not output anything else outside those three parts.\n- If finishing, use action=terminate in the tool call."

QWEN3VL_USER_PROMPT = "The user query: {instruction}.\nTask progress (You have done the following operation on the current device): {history}.\n"


SUMMARY_PROMPT_TEMPLATE = (
PROMPT_PREFIX
Expand Down
190 changes: 189 additions & 1 deletion evaluation/AndroidWorld/android_world/agents/seeact_v.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import time

import ast
import json
import numpy as np
from PIL import Image
from openai import OpenAI
Expand All @@ -40,7 +41,13 @@
from android_world.env import interface
from android_world.env import json_action
from android_world.env import representation_utils
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize

try:
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import ( # type: ignore
smart_resize,
)
except Exception: # pragma: no cover
smart_resize = None # type: ignore[assignment]

# Utils for Visual Grounding

Expand Down Expand Up @@ -932,3 +939,184 @@ def _to_base64_png(image: np.ndarray) -> str:
buf = BytesIO()
PILImage.fromarray(image).save(buf, format="PNG")
return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}"


def _extract_action_text_qwen3vl(block: str) -> str:
"""Extracts the 'Action:' line from Qwen3VL text output for step history (does not affect execution)."""
m = re.search(r"Action:\s*(.+?)(?:\n<tool_call>|$)", block, flags=re.S)
if not m:
return ""
text = m.group(1).strip()
# Some models wrap Action: "..." with quotes.
if text.startswith('"') and text.endswith('"'):
text = text[1:-1]
return text.replace("\n", " ")


def _parse_tool_call_json(block: str) -> dict[str, Any] | None:
"""Parse JSON inside <tool_call>...</tool_call>."""
m = re.search(r"<tool_call>\s*([\s\S]*?)\s*</tool_call>", block)
if not m:
return None
payload = m.group(1).strip()
try:
return json.loads(payload)
except Exception:
return None


class Qwen3VL(base_agent.EnvironmentInteractingAgent):
    """Android GUI Agent based on Qwen3VL tool-call output (for AndroidWorld eval).

    - Input: Screenshot + instruction + history
    - Output: <tool_call>{...}</tool_call>
    - Execution: Map to JSONAction by qwen3vl_action_transform(...)
    """

    def __init__(
        self,
        env: interface.AsyncEnv,
        llm: infer.MultimodalLlmWrapper,
        name: str = "Qwen3VL",
        wait_after_action_seconds: float = 2.0,
        model_base_url: str = "http://127.0.0.1:8000/v1",
        model_api_key: str = "EMPTY",
        model_name: str = "",
        extra_headers: dict[str, str] | None = None,
    ):
        """Initializes the agent.

        Args:
          env: AndroidWorld environment to interact with.
          llm: Multimodal LLM wrapper kept for interface parity; the actual
            model calls in step() go through the OpenAI-compatible client.
          name: Agent name registered with the base class.
          wait_after_action_seconds: Delay after each executed action so the
            UI can settle before the next screenshot is taken.
          model_base_url: Base URL of the OpenAI-compatible endpoint (e.g. vLLM).
          model_api_key: API key for the endpoint ("EMPTY" for local serving).
          model_name: Model identifier sent with each chat completion request.
          extra_headers: Optional extra HTTP headers attached to every request.
        """
        super().__init__(env, name)
        self.llm = llm
        self.wait_after_action_seconds = wait_after_action_seconds
        self.model_name = model_name
        self.client = OpenAI(
            api_key=model_api_key,
            base_url=model_base_url,
            default_headers=extra_headers,
        )
        # Accumulated "Step N: ..." summaries fed back to the model as history.
        self.step_his: str = ""
        # 1-based counter of steps taken in the current episode.
        self.turn_number: int = 0
        # Used to detect repeated actions (avoid infinite loops)
        self.last_action: str | None = None
        self.repeat_time: int = 0

    def reset(self, go_home_on_reset: bool = False):
        """Resets per-episode state: history, step counter, repeat detection."""
        super().reset(go_home_on_reset)
        self.env.hide_automation_ui()
        self.step_his = ""
        self.turn_number = 0
        self.last_action = None
        self.repeat_time = 0

    @staticmethod
    def _to_base64_png(image: np.ndarray) -> str:
        """Encodes an image array as a base64 PNG data URL for the chat API."""
        import base64
        from io import BytesIO
        from PIL import Image as PILImage
        buf = BytesIO()
        PILImage.fromarray(image).save(buf, format='PNG')
        return f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}"

    def step(self, instruction: str) -> base_agent.AgentInteractionResult:
        """Runs one cycle: screenshot -> model call -> tool-call parse -> env action.

        Args:
          instruction: Natural-language task goal for the episode.

        Returns:
          AgentInteractionResult whose first field is True when the episode
          should stop (task reported done, unparseable output, transform
          failure, or a repeated-action loop), False to continue.
        """
        self.turn_number += 1

        state = self.get_post_transition_state()
        screenshot = state.pixels.copy()
        # To be consistent with other agents in this file: BGR->RGB (for saving/encoding)
        # NOTE(review): assumes state.pixels is BGR-ordered — confirm against
        # the env's screenshot format.
        screenshot = screenshot[:, :, ::-1]
        height, width = screenshot.shape[:2]

        # NOTE(review): QWEN3VL_SYSTEM_PROMPT / QWEN3VL_USER_PROMPT are used
        # unqualified — presumably imported from agents.PROMPT; verify the
        # import exists in this module.
        system_prompt = QWEN3VL_SYSTEM_PROMPT
        user_prompt = QWEN3VL_USER_PROMPT.format(
            instruction=instruction, history=self.step_his
        )

        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": user_prompt},
                    {"type": "image_url", "image_url": {"url": self._to_base64_png(screenshot)}},
                ],
            },
        ]

        # temperature=0 keeps the evaluation run deterministic (greedy decoding).
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            temperature=0,
        )
        response = completion.choices[0].message.content or ""
        print(response)
        print("=" * 50)

        tool_call = _parse_tool_call_json(response)
        if not tool_call:
            # Unparseable output ends the episode rather than looping forever.
            return base_agent.AgentInteractionResult(
                True, {"summary": "No <tool_call> JSON found in model output.", "response": response}
            )

        # The human-readable "Action:" line only feeds the history prompt;
        # execution is driven solely by the tool-call JSON below.
        op_text = _extract_action_text_qwen3vl(response)
        if op_text:
            self.step_his += f"Step {self.turn_number}: {op_text}; "

        # Compatible: tool_call may look like {"name":"mobile_use","arguments":{...}}
        args = tool_call.get("arguments", {}) if isinstance(tool_call, dict) else {}
        action_name = args.get("action", "")
        try:
            parsed = qwen3vl_action_transform(action_name, args, width, height)
            print(parsed)
        except Exception as e:
            # E.g. an unsupported system button / terminate status: stop the episode.
            return base_agent.AgentInteractionResult(
                True,
                {
                    "summary": f"Failed to transform tool-call into action: {e}",
                    "response": response,
                    "tool_call": tool_call,
                },
            )

        # Record last_action + repeat_time (previous code had these fields but not working)
        # Here, use the tool-call's arguments as the "action signature", which is more robust than checking 'terminate' in a string.
        try:
            action_sig = json.dumps(args, ensure_ascii=False, sort_keys=True)
        except Exception:
            action_sig = str(args)
        if self.last_action == action_sig:
            self.repeat_time += 1
        else:
            self.repeat_time = 0
        self.last_action = action_sig

        try:
            act = json_action.JSONAction(**parsed)
            self.env.execute_action(act)
            time.sleep(self.wait_after_action_seconds)
        except Exception:
            # continue
            # Best-effort: a failed execution is logged and the episode continues.
            print("Failed to execute action:", parsed)

        if parsed.get("action_type") == "status":
            # The model called terminate -> report completion/infeasibility and stop.
            return base_agent.AgentInteractionResult(
                True, {"response": response, "step_history": self.step_his, "parsed": parsed}
            )

        # If repeated actions reach the threshold: terminate immediately to avoid deadlock in evaluation
        # NOTE(review): this check runs after execution, so the repeated
        # action has already been performed once more before termination.
        if self.repeat_time >= 3:
            return base_agent.AgentInteractionResult(
                True,
                {
                    "summary": "Terminated due to repeated identical actions.",
                    "response": response,
                    "step_history": self.step_his,
                    "parsed": parsed,
                    "repeat_time": self.repeat_time,
                },
            )

        return base_agent.AgentInteractionResult(
            False, {"response": response, "step_history": self.step_his, "parsed": parsed}
        )
62 changes: 61 additions & 1 deletion evaluation/AndroidWorld/android_world/agents/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,13 @@
from android_world.env import interface
from android_world.env import json_action
from android_world.agents import base_agent
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize

try:
from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import ( # type: ignore
smart_resize,
)
except Exception: # pragma: no cover
smart_resize = None # type: ignore[assignment]


def _extract_xy(s: str) -> Tuple[float, float] | None:
Expand Down Expand Up @@ -156,6 +162,60 @@ def action_transform(action: str, width: int, height: int) -> Dict[str, Any] | N
return None


def qwen3vl_action_transform(
    action: str, arguments: Dict[str, Any], width: int, height: int
) -> Dict[str, Any]:
    """Maps a Qwen3VL tool-call onto an AndroidWorld JSONAction dict.

    Coordinates arrive in the model's 0-999 space and are rescaled to the
    device's ``width`` x ``height`` pixels. Unrecognized action names fall
    back to a harmless ``wait`` so evaluation keeps running.

    Args:
      action: The tool-call ``action`` field (e.g. "click", "swipe").
      arguments: The full tool-call argument dict (coordinates, text, ...).
      width: Screenshot width in pixels.
      height: Screenshot height in pixels.

    Returns:
      Keyword arguments for ``json_action.JSONAction``.

    Raises:
      ValueError: For a ``system_button`` or ``terminate`` argument value the
        mapping does not support.
    """

    def _scale(point):
        # Rescale a model-space (x, y) point to device pixels.
        px, py = point
        return px / 1000 * width, py / 1000 * height

    if action == "key":
        return {"action_type": "wait"}
    elif action == "click" or action == "left_click":
        x, y = _scale(arguments.get("coordinate", [0, 0]))
        return {"action_type": "click", "x": x, "y": y}
    elif action == "long_press":
        x, y = _scale(arguments.get("coordinate", [0, 0]))
        return {"action_type": "long_press", "x": x, "y": y}
    elif action == "swipe":
        x0, y0 = _scale(arguments.get("coordinate", [0, 0]))
        x1, y1 = _scale(arguments.get("coordinate2", [0, 0]))
        dir_ = _dir_from_coords(x0, y0, x1, y1)
        # A swipe gesture moves content the opposite way, hence the reversal.
        return {"action_type": "scroll", "direction": reverse_direction(dir_)}
    elif action == "type":
        return {"action_type": "input_text", "text": arguments.get("text", "")}
    elif action == "system_button":
        button = arguments.get("button", "").lower()
        if button == "home":
            return {"action_type": "navigate_home"}
        elif button == "back":
            return {"action_type": "navigate_back"}
        elif button == "enter":
            # Fix: the system prompt advertises "Enter" as a valid button, but
            # this previously raised and terminated the episode. AndroidWorld
            # supports it directly as keyboard_enter.
            return {"action_type": "keyboard_enter"}
        else:
            # "Menu" has no JSONAction equivalent; step() treats the raise as
            # a failed step and ends the episode.
            raise ValueError(f"Unknown system button: {button}")
    elif action == "open":
        return {"action_type": "open_app", "app_name": arguments.get("text", "")}
    elif action == "wait":
        return {"action_type": "wait"}
    elif action == "answer":
        return {"action_type": "answer", "text": arguments.get("text", "")}
    elif action == "terminate":
        status = arguments.get("status", "").lower()
        if status == "success":
            return {"action_type": "status", "goal_status": "complete"}
        elif status == "failure":
            return {"action_type": "status", "goal_status": "infeasible"}
        else:
            raise ValueError(f"Unknown terminate status: {status}")
    else:
        # Anything else degrades to a no-op wait instead of crashing the run.
        return {'action_type': 'wait'}


def action_coord(action):
def extract_click_json(s):
m = re.search(
Expand Down
Loading