27 changes: 27 additions & 0 deletions LICENSE
@@ -0,0 +1,27 @@
MIT License
Copyright (c) 2025, Anonymous
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the “Software”), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
---
This project includes and builds upon the BLIP model developed by Salesforce.com, Inc., which is licensed under the BSD 3-Clause License:
BSD 3-Clause License
Copyright (c) 2022, Salesforce.com, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of Salesforce.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
134 changes: 133 additions & 1 deletion README.md
@@ -1,2 +1,134 @@
[![MacPaw Research](https://pbs.twimg.com/profile_banners/3993798502/1720615716/1500x500)](https://research.macpaw.com)

# Screen2AX
Repository for the Screen2AX paper.

A research-driven project that generates accessibility trees for macOS applications using computer vision and deep learning. Read more about the project in our [paper]().

---

## 📁 Datasets

- [Screen2AX-Tree](https://huggingface.co/datasets/macpaw-research/Screen2AX-Tree)
- [Screen2AX-Element](https://huggingface.co/datasets/macpaw-research/Screen2AX-Element)
- [Screen2AX-Group](https://huggingface.co/datasets/macpaw-research/Screen2AX-Group)
- [Screen2AX-Task](https://huggingface.co/datasets/macpaw-research/Screen2AX-Task)

## 🤖 Models

- [YOLOv11l — UI Elements Detection](https://huggingface.co/macpaw-research/yolov11l-ui-elements-detection)
- [BLIP — UI Elements Captioning](https://huggingface.co/macpaw-research/blip-icon-captioning)
- [YOLOv11l — UI Groups Detection](https://huggingface.co/macpaw-research/yolov11l-ui-groups-detection)

---

## 🛠 Requirements

- macOS
- Python (recommended ≥ 3.11)
- Conda
- Pip

---

## ⚙️ Installation

Create and activate the project environment:

```bash
conda create -n screen2ax python=3.11
conda activate screen2ax
pip install -r requirements.txt
```

## 🚀 Usage

> ⚠️ The first run may take longer due to model downloads and initial setup.

### Accessibility generation
Run the accessibility generation script:

```bash
python -m hierarchy_dl.hierarchy --help
```
#### Available Options

```
usage: hierarchy.py [-h] [--image IMAGE] [--save] [--filename FILENAME] [--save_dir SAVE_DIR] [--flat]

options:
-h, --help show this help message and exit
--image IMAGE Path to the image
--save Save the result
--filename FILENAME Filename to save the result
--save_dir SAVE_DIR Directory to save the result. Default is './results/'
--flat Generate flat hierarchy (no groups)
```

##### Example
Run the accessibility generation script on a screenshot of the Spotify app:

```bash
python -m hierarchy_dl.hierarchy --image ./screenshots/spotify.png --save --filename spotify.json
```

This will generate a JSON file with the app's accessibility hierarchy in the results folder.
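The saved file is a nested JSON tree. A minimal sketch of loading and walking it, assuming each node carries a `children` list as the `UIElement` tree does (the sample tree and the `role` field here are hypothetical, for illustration only):

```python
import json

def count_nodes(node: dict) -> int:
    # Walk the nested `children` lists and count every node in the tree.
    return 1 + sum(count_nodes(child) for child in node.get("children", []))

# Hypothetical miniature of a saved hierarchy (normally: json.load(open("results/spotify.json")))
tree = json.loads('{"role": "Group", "children": ['
                  '{"role": "Button", "children": []},'
                  '{"role": "Text", "children": []}]}')
print(count_nodes(tree))  # 3
```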

### Screen Reader
Run the screen reader:

```bash
python -m screen_reader.screen_reader --help
```

#### Available Options

```
usage: screen_reader.py [-h] [-b BUNDLE_ID] [-n NAME] [-dw] [-dh] [-r RATE] [-v VOICE] [-sa] [-sk SKIP_GROUPS]

options:
-h, --help show this help message and exit
-b, --bundle_id BUNDLE_ID Bundle ID of the target application
-n, --name NAME Name of the target application (alternative to bundle_id)
-dw, --deactivate_welcome Skip the "Welcome to the ScreenReader." message
-dh, --deactivate_help Skip reading the help message on startup
-r, --rate RATE Set speech rate for macOS `say` command (default: 190)
-v, --voice VOICE Set voice for macOS `say` command (see `say -v "?" | grep en`)
-sa, --system_accessibility Use macOS system accessibility data instead of vision-generated
-sk, --skip-groups N Skip groups with fewer than N children (default: 5)
```

##### Example

Run the screen reader for the Spotify app:
```bash
python -m screen_reader.screen_reader --name Spotify
```

## 📜 License
### 🔍 YOLO Models
The YOLO models used for UI elements and UI groups detection are licensed under the GNU Affero General Public License (AGPL). This is inherited from the original YOLO model licensing.

### 🧠 BLIP Model
The BLIP model for captioning UI elements builds upon Salesforce's BLIP, which is distributed under the BSD 3-Clause License (reproduced in the [LICENSE](LICENSE) file).

### 📂 Datasets
All datasets (Screen2AX-Tree, Screen2AX-Element, Screen2AX-Group, Screen2AX-Task) are released under the Apache 2.0 license.

### 💻 Codebase
All source code in this repository is licensed under the MIT License. See the [LICENSE](LICENSE) file for full terms and conditions.

## 📚 Citation
If you use this code in your research, please cite our paper:

```bibtex
...
```

## 🙌 Acknowledgements
We would like to express our deepest gratitude to the Armed Forces of Ukraine. Your courage and unwavering defense of our country make it possible for us to live, work, and create in freedom. This work would not be possible without your sacrifice. Thank you.

## MacPaw Research

Visit our site to learn more 😉

https://research.macpaw.com
82 changes: 82 additions & 0 deletions hierarchy_dl/application.py
@@ -0,0 +1,82 @@
import time
import threading

import tkinter as tk

from hierarchy_dl.hierarchy import generate_hierarchy
from screen_reader.screenshot import screenshot_app, open_app_in_foreground


run = True
thread = None


def start_action():
    bundle_id = entry.get()
    open_app_in_foreground(bundle_id, wait_time=2)

    global run
    run = True

    i = 0
    while run:
        try:
            start = time.time()
            open_app_in_foreground(bundle_id, wait_time=0.25)
            screen_path = screenshot_app(bundle_id, "./screenshots/")[0]

            tree = generate_hierarchy(screen_path, save=True, save_dir=f"./result/{bundle_id}/")

            end = time.time()

            i += 1
            print(f"Frame #{i}, time taken: {end - start}")

        except Exception as e:
            print(f"Error: {e}")
            break


def stop_action():
    global run, thread
    run = False
    print("Stopping process")

    if thread:
        thread.join()

    print("Thread has stopped")


def start_thread():
    global thread
    thread = threading.Thread(target=start_action, daemon=True)
    thread.start()


if __name__ == "__main__":
    # Create main window
    root = tk.Tk()
    root.title("Bundle ID Manager")
    root.geometry("300x200")

    # Create input field
    label = tk.Label(root, text="bundle_id:")
    label.pack(pady=5)

    entry = tk.Entry(root)
    entry.pack(pady=5)

    # Copyable text with suggestion
    suggestion = tk.Label(root, text="osascript -e 'id of app \"Spotify\"' \n e.g. com.spotify.client")
    suggestion.pack(pady=5)

    # Create buttons
    start_button = tk.Button(root, text="Start", command=start_thread)
    start_button.pack(pady=5)

    stop_button = tk.Button(root, text="Stop", command=stop_action)
    stop_button.pack(pady=5)

    # Run application
    root.mainloop()
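The Start/Stop buttons coordinate through a module-level `run` flag and a daemon thread. A GUI-free sketch of the same start/stop pattern, using `threading.Event` in place of the bare boolean (an illustrative substitution, not the code above):

```python
import threading
import time

stop_event = threading.Event()

def worker():
    # Stand-in for the screenshot + generate_hierarchy loop in start_action
    while not stop_event.is_set():
        time.sleep(0.01)
    print("worker stopped")

t = threading.Thread(target=worker, daemon=True)
t.start()
time.sleep(0.05)
stop_event.set()  # what stop_action does by flipping `run`
t.join()          # stop_action also joins the thread before returning
```

An `Event` avoids relying on the GIL for visibility of a plain boolean and reads more explicitly, at the cost of one extra object.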
25 changes: 25 additions & 0 deletions hierarchy_dl/blip.py
@@ -0,0 +1,25 @@
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

cache_dir = "./.models"

model_path = "macpaw-research/blip-icon-captioning"
processor = BlipProcessor.from_pretrained(model_path, cache_dir=cache_dir)
model = BlipForConditionalGeneration.from_pretrained(model_path, cache_dir=cache_dir).to(device)
model.eval()


@torch.no_grad()
def generate_captions(images: list[Image.Image]) -> list[str]:
    inputs = processor(images, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=25)
    return processor.batch_decode(outputs, skip_special_tokens=True)
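`generate_captions` takes a whole list of PIL images at once; callers such as `caption_buttons` in `hierarchy.py` pass icon crops in batches (`batch_size=16`). A minimal sketch of how such batching can be done (the `batched` helper is hypothetical, not part of this diff):

```python
def batched(items, batch_size=16):
    # Yield successive fixed-size chunks; the last chunk may be smaller.
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]

# 35 crops split into batches of at most 16
print([len(b) for b in batched(list(range(35)), 16)])  # [16, 16, 3]
```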
120 changes: 120 additions & 0 deletions hierarchy_dl/hierarchy.py
@@ -0,0 +1,120 @@
import os
import json
import time
from os import path
from typing import Optional

import numpy as np
from PIL import Image
from ocrmac import ocrmac
from ultralytics import YOLO

from hierarchy_dl.utils import *

from huggingface_hub import hf_hub_download

cache_dir = "./.models"

ui_elements_model_path = hf_hub_download(
    repo_id="macpaw-research/yolov11l-ui-elements-detection",
    filename="ui-elements-detection.pt",
    cache_dir=cache_dir,
)

ui_groups_model_path = hf_hub_download(
    repo_id="macpaw-research/yolov11l-ui-groups-detection",
    filename="ui-groups-detection.pt",
    cache_dir=cache_dir,
)

ui_elements_model = YOLO(ui_elements_model_path)
ui_groups_model = YOLO(ui_groups_model_path)


def generate_hierarchy(
    img: str | Image.Image | np.ndarray,
    save_dir: str = "./results/",
    save: bool = False,
    filename: Optional[str] = None,
    flat: bool = False,
) -> UIElement:
    """
    Generate UI hierarchy from an image
    """
    # load image
    if isinstance(img, str):
        img_pil = Image.open(img)

    if isinstance(img, np.ndarray):
        img_pil = Image.fromarray(img)

    if isinstance(img, Image.Image):
        img_pil = img

    width, height = img_pil.size

    # detect ui elements
    ui_elements = ui_elements_model(img_pil, verbose=False)[0].boxes
    ui_elements = [UIElement(box, cls) for box, cls in zip(ui_elements.xyxy, ui_elements.cls)]

    # detect ui groups
    ui_groups = ui_groups_model(img_pil, conf=0.5, verbose=False)[0].boxes
    ui_groups = [UIElement(box, "Group") for box in ui_groups.xyxy]

    # ocr
    annotations = ocrmac.OCR(img_pil, language_preference=['en-US']).recognize(px=True)
    annotations = [UIElement(box, "Text", value=val) for val, _, box in annotations]

    # merge texts and elements
    annotations = group_texts(annotations)
    ui_elements = merge_text_and_elements(ui_elements, annotations, iou_threshold=0.2)

    # icons
    ui_elements = caption_buttons(ui_elements, img_pil, batch_size=16)

    if not flat:
        # build tree
        tree = build_tree(ui_groups, ui_elements, (width, height), iou_threshold=0.0)
        clean_tree(tree)

        if len(tree.children) == 1:
            tree = tree.children[0]
    else:
        ui_elements.sort(key=lambda x: x.box[0] ** 2 + x.box[1] ** 2)
        tree = UIElement(
            box=[0, 0, width, height],
            cls="Group",
            value="Screen",
        )
        tree.children = ui_elements

    if save or filename:
        os.makedirs(save_dir, exist_ok=True)

        filename = f"{path.basename(img)}.json" if isinstance(img, str) and not filename else filename
        filename = filename or f"{time.time()}.json"

        full_path = path.join(save_dir, filename)

        with open(full_path, "w", encoding='utf-8') as f:
            json.dump(tree.to_dict(), f, indent=4)

    return tree


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--image", type=str, default="./screen.png", help="Path to the image")
    parser.add_argument("--save", action="store_true", help="Save the result")
    parser.add_argument("--filename", type=str, default=None, help="Filename to save the result")
    parser.add_argument("--save_dir", type=str, default="./results/", help="Directory to save the result. Default is './results/'")
    parser.add_argument("--flat", action="store_true", help="Generate flat hierarchy (no groups)")
    args = parser.parse_args()

    image = args.image
    save_dir = args.save_dir
    save = args.save
    filename = args.filename
    flat = args.flat

    tree = generate_hierarchy(image, save_dir, save, filename, flat)
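Several steps in `generate_hierarchy` key off an IoU threshold (`iou_threshold=0.2` when merging OCR text with detected elements, `0.0` when building the tree). The actual helpers live in `hierarchy_dl/utils.py`, which is outside this diff; a minimal sketch of IoU for `xyxy` pixel boxes, for orientation only:

```python
def box_iou(a, b):
    # a, b: (x1, y1, x2, y2) boxes in pixel coordinates
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union else 0.0

print(box_iou((0, 0, 10, 10), (5, 5, 15, 15)))  # 25 / 175 ≈ 0.1429
```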