Update readme

pooruss · pooruss · commit a67d3d8cae28 · 2025-05-21T19:42:31.000+08:00
diff --git a/README.md b/README.md
@@ -52,14 +52,39 @@ This includes instructions for model deployment using huggingface endpoint, and
 
 ### ✅ Step 2: Post Processing
 
-👉 <a href="codes/action_parser.py">Prediction Post-Processing</a>.
-This includes parsing model predictions to executable pyautogui codes.
-#### Coordinates processing
+#### Installation
+```bash
+pip install ui-tars
+# or
+uv pip install ui-tars
+```
+#### Usage
+```python
+from ui_tars.action_parser import parse_action_to_structure_output, parsing_response_to_pyautogui_code
+
+response = "Thought: Click the button\nAction: click(start_box='(100,200)')"
+original_image_width, original_image_height = 1920, 1080
+parsed_dict = parse_action_to_structure_output(
+    response,
+    factor=1000,
+    origin_resized_height=original_image_height,
+    origin_resized_width=original_image_width,
+    model_type="qwen25vl"
+)
+print(parsed_dict)
+parsed_pyautogui_code = parsing_response_to_pyautogui_code(
+    responses=parsed_dict,
+    image_height=original_image_height,
+    image_width=original_image_width
+)
+print(parsed_pyautogui_code)
+```
+##### FYI: Coordinates visualization
 To help you better understand the coordinate processing, we also provide a <a href="README_coordinates.md">guide</a> for coordinates processing visualization.
 
 ## Prompt Usage Guide
 
-To accommodate different device environments and task complexities, the following three prompt templates in <a href="codes/prompts.py">codes/prompts.py</a>. are designed to guide GUI agents in generating appropriate actions. Choose the template that best fits your use case:
+To accommodate different device environments and task complexities, the following three prompt templates in <a href="codes/ui_tars/prompt.py">codes/ui_tars/prompt.py</a> are designed to guide GUI agents in generating appropriate actions. Choose the template that best fits your use case:
 
 ### 🖥️ `COMPUTER_USE`
 
diff --git a/codes/README.md b/codes/README.md
@@ -1,14 +1,14 @@
 # ui-tars
 
-A python package for parsing LLM-generated GUI action instructions, automatically generating pyautogui scripts, and supporting coordinate conversion and smart image resizing.
+A Python package for parsing VLM-generated GUI action instructions into executable pyautogui code.
 
 ---
 
 ## Introduction
 
-`ui-tars` is a Python package for parsing LLM-generated GUI action instructions, automatically generating pyautogui scripts, and supporting coordinate conversion and smart image resizing.
+`ui-tars` is a Python package for parsing VLM-generated GUI action instructions, automatically generating pyautogui scripts, and supporting coordinate conversion and smart image resizing.
 
-- Supports multiple LLM output formats (e.g., Qwen, Doubao)
+- Supports multiple VLM output formats (e.g., Qwen-VL, Seed-VL)
 - Automatically handles coordinate scaling and format conversion
 - One-click generation of pyautogui automation scripts
 
@@ -24,12 +24,12 @@ pip install ui-tars
 uv pip install ui-tars
 ```
 
-### Parse LLM output into structured actions
+### Parse model output into structured actions
 
 ```python
-from ui_tars.action_parser import parse_action_to_structure_output
+from ui_tars.action_parser import parse_action_to_structure_output, parsing_response_to_pyautogui_code
 
-response = "Thought: Click the button\nAction: click(start_box='(0.1,0.2,0.1,0.2)')"
+response = "Thought: Click the button\nAction: click(point='<point>200 300</point>')"
 original_image_width, original_image_height = 1920, 1080
 parsed_dict = parse_action_to_structure_output(
     response,
@@ -39,6 +39,12 @@ parsed_dict = parse_action_to_structure_output(
     model_type="doubao"
 )
 print(parsed_dict)
+parsed_pyautogui_code = parsing_response_to_pyautogui_code(
+    responses=parsed_dict,
+    image_height=original_image_height,
+    image_width=original_image_width
+)
+print(parsed_pyautogui_code)
 ```
 
 ### Generate pyautogui automation script
@@ -90,10 +96,10 @@ def parse_action_to_structure_output(
 ```
 
 **Description:**
-Parses LLM output action instructions into structured dictionaries, automatically handling coordinate scaling and box/point format conversion.
+Parses the action instructions in the model output into structured dictionaries, automatically handling coordinate scaling and box/point format conversion.
 
 **Parameters:**
-- `text`: The LLM output string
+- `text`: The model output string
 - `factor`: Scaling factor
 - `origin_resized_height`/`origin_resized_width`: Original image height/width
 - `model_type`: Model type (e.g., "qwen25vl", "doubao")
diff --git a/codes/tests/action_parser_test.py b/codes/tests/action_parser_test.py
@@ -14,15 +14,15 @@
 
 class TestActionParser(unittest.TestCase):
     def test_parse_action(self):
-        action_str = "click(start_box='(10,20,30,40)')"
+        action_str = "click(point='<point>200 300</point>')"
         result = parse_action(action_str)
         self.assertEqual(result['function'], 'click')
-        self.assertEqual(result['args']['start_box'], '(10,20,30,40)')
+        self.assertEqual(result['args']['point'], '<point>200 300</point>')
 
     def test_parse_action_to_structure_output(self):
-        text = "Thought: test\nAction: click(start_box='(10,20,30,40)')"
+        text = "Thought: test\nAction: click(point='<point>200 300</point>')"
         actions = parse_action_to_structure_output(
-            text, factor=28, origin_resized_height=224, origin_resized_width=224
+            text, factor=1000, origin_resized_height=224, origin_resized_width=224
         )
         self.assertEqual(actions[0]['action_type'], 'click')
         self.assertIn('start_box', actions[0]['action_inputs'])