
Commit 4b9eea5

Merge pull request #223 from liyongqi2002/main
Commit code of MMLatentAction
2 parents 1dab7d1 + 616f8bf commit 4b9eea5

122 files changed (+51497 / -1 lines)


MMLatentAction/README.md

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@

# Code for "Controlling Multimodal Conversational Agents with Coverage-Enhanced Latent Actions"

This repository contains the official implementation for reproducing the experiments in our paper.

## 🛠️ Setup Instructions

### 0.1 Environment

- Python 3.10 is required.
- Install dependencies:

```bash
pip install -r requirements.txt
```

### 0.2 Base Model

- Download **Qwen2.5-VL-3B-Instruct** from [Hugging Face](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct).
- Place it in a directory **outside** this project (e.g., `../llm_path/Qwen/Qwen2.5-VL-3B-Instruct`), so the full path is:

```
../llm_path/Qwen/Qwen2.5-VL-3B-Instruct/
```
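
If it helps, here is a minimal Python sketch of one way to fetch the checkpoint into that location. It assumes the `huggingface_hub` package is installed and is an illustration, not part of this repository's scripts:

```python
# Illustrative download sketch (assumption: `pip install huggingface_hub`).
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="Qwen/Qwen2.5-VL-3B-Instruct",
    local_dir="../llm_path/Qwen/Qwen2.5-VL-3B-Instruct",  # the path expected above
)
```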

### 0.3 Data

We provide scripts for downloading and processing the required datasets in the `./data` folder.

---

## Part 1: Latent Action Space Learning

### Run Pretraining

```bash
bash pretrain.sh
```

---

## Part 2: Latent Action Reinforcement Learning

(Example: **MMRole**)

### 📌 Preliminary Setup

Before running RL, configure API access for:

| Component | Location | Task |
|---------|----------|------|
| Reward Model (RM) | `eval_results/api_utils.py` | Fill in your API key / endpoint for reward scoring |
| LLM-as-a-Judge (final eval) | `sampling_results/api_utils.py` | Configure the judge model |

---

### 2.1 Training

Run RL on **MMRole**:

```bash
bash run_MMRole_RL.sh
```

This script:
- Loads the pretrained `PolicyActionVLM` from Part 1.
- Optimizes the latent action policy via RL.
- Generates evaluation results and saves them to `sampling_results/*.json`.

---

### 2.2 Evaluation

Run automatic evaluation using LLM-as-a-Judge:

```bash
cd sampling_results
python MMRole_Eval.py
```

---

**Reference**

```bibtex
@misc{li-2026-controlling,
  title         = {Controlling Multimodal Conversational Agents with Coverage-Enhanced Latent Actions},
  author        = {Yongqi Li and Hao Lang and Tieyun Qian and Yongbin Li},
  year          = {2026},
  eprint        = {2601.07516},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2601.07516}
}
```
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: 0,1,2,3
machine_rank: 0
main_training_function: main
mixed_precision: 'fp16'
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
import os
import json
import glob
from tqdm import tqdm

img_folder = "YanqiDai/MMRole_dataset/images"


# TODO: first ID, then OOD

profile_folder = "YanqiDai/MMRole_dataset/profiles/in-distribution/detailed_profiles"
input_dir = "YanqiDai/MMRole_dataset/dialogues/in-distribution/comment"
tag = "YanqiDai/MMRole_dataset"
saved_conv_dir = f"{tag}"
saved_conv_path = f"{saved_conv_dir}/conversations-train-comment.json"
json_files = glob.glob(os.path.join(input_dir, "*.json"))


# profile_folder = "YanqiDai/MMRole_dataset/profiles/out-of-distribution/detailed_profiles"
# input_json = "YanqiDai/MMRole_dataset/dialogues/out-of-distribution/comment.json"
# tag = "YanqiDai/MMRole_dataset"
# saved_conv_dir = f"{tag}"
# saved_conv_path = f"{saved_conv_dir}/conversations-OODtest-comment.json"
# json_files = [input_json]


os.makedirs(saved_conv_dir, exist_ok=True)

# Collected JSON files
print(json_files)

all_conversations = []

for file_path in tqdm(json_files, desc="Processing JSON files"):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    for instance in data:
        original_id = instance["id"]
        image_path = instance["image"]  # keep the original image path unchanged

        # Build the full image path
        full_img_path = os.path.join(img_folder, image_path)

        # Check whether the image exists
        if not os.path.exists(full_img_path):
            print(full_img_path)
            continue  # skip samples whose image is missing

        new_convs = []
        original_roles = []  # new: record the original role of each turn
        assistant_responses = []

        conversations = instance["conversations"]
        if len(conversations) == 0:
            continue

        character_role = None
        for turn in conversations:
            try:
                orig_role = turn["role"]
            except KeyError:
                orig_role = turn["from"]

            original_roles.append(orig_role)

            if orig_role == "human":
                new_role = "user"
            else:
                new_role = "assistant"
                assistant_responses.append(turn["value"].strip())
                character_role = orig_role

            new_convs.append({
                "role": new_role,
                "content": turn["value"]
            })

        full_text = " ".join(assistant_responses) + " "

        character_profile = None
        if character_role is not None:
            _character_role = character_role.replace(" ", "_")
            profile_path = os.path.join(profile_folder, f"{_character_role}.json")
            try:
                with open(profile_path, "r", encoding="utf-8") as f:
                    character_profile = json.load(f)

                new_entry = {
                    "id": original_id,
                    "image_path": image_path,
                    "conversations": new_convs,
                    "original_roles": original_roles,  # newly added field
                    "character_role": character_role,
                    "character_profile": {
                        character_role: character_profile,
                    },
                    "text": full_text,
                }

                all_conversations.append(new_entry)
            except (FileNotFoundError, json.JSONDecodeError, OSError) as e:
                print(f"Warning: Failed to load character profile from {profile_path}: {e}")
                character_profile = None  # explicitly keep None, or set a default value as needed


# Save everything as a single JSON file
with open(saved_conv_path, 'w', encoding='utf-8') as out_f:
    json.dump(all_conversations, out_f, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(all_conversations)} conversations to {saved_conv_path}")
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@

def robust_API_response(
        model_engine,
        system_prompt,
        user_prompt,
        flag_web_search=False,
        temperature=0.2,
        require_json=True
):
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]
    # Stub: fill in the API call for your provider here (see the README's
    # "Preliminary Setup" table for which key / endpoint to configure).
    return_response = None

    return return_response
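
The function above is shipped as a stub. Below is a minimal sketch of one way it could be completed, assuming an OpenAI-compatible endpoint; the client setup, environment-variable names, and retry policy are illustrative assumptions, not part of this repository:

```python
# Minimal sketch only: assumes the `openai` package and an OpenAI-compatible endpoint.
import os
import time

from openai import OpenAI

client = OpenAI(
    api_key=os.environ.get("API_KEY"),        # assumption: key read from the environment
    base_url=os.environ.get("API_BASE_URL"),  # assumption: optional custom endpoint
)

def robust_API_response(model_engine, system_prompt, user_prompt,
                        flag_web_search=False, temperature=0.2, require_json=True):
    # flag_web_search is kept for signature compatibility but unused in this sketch.
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]
    kwargs = dict(model=model_engine, messages=messages, temperature=temperature)
    if require_json:
        kwargs["response_format"] = {"type": "json_object"}

    for attempt in range(3):  # simple retry loop; the count is an arbitrary choice
        try:
            response = client.chat.completions.create(**kwargs)
            return response.choices[0].message.content
        except Exception as exc:
            print(f"API call failed (attempt {attempt + 1}): {exc}")
            time.sleep(2 ** attempt)
    return None
```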
